Index: polly/lib/Transform/MatmulOptimizer.cpp =================================================================== --- polly/lib/Transform/MatmulOptimizer.cpp +++ polly/lib/Transform/MatmulOptimizer.cpp @@ -13,10 +13,13 @@ #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Simplify.h" +#include "polly/Support/GICHelper.h" #include "polly/Support/ISLTools.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" @@ -126,6 +129,18 @@ "macro-kernel, by Nr, the parameter of the micro-kernel"), cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::opt + PMBasedTCOpts("polly-tc-opt", + cl::desc("Perform optimizations of tensor contractions based " + "on pattern matching"), + cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt + PMBasedMMMOpts("polly-matmul-opt", + cl::desc("Perform optimizations of matrix multiplications " + "based on pattern matching"), + cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory)); + namespace { /// Parameters of the micro kernel. /// @@ -160,6 +175,42 @@ int k = -1; }; +/// Parameters of the tensor contraction operands. +/// +/// Parameters, which describe access relations that represent operands of the +/// tensor contraction. +struct TCInfoTy { + // Memory accesses that represent reading from tensors, which are operands of + // the tensor contraction. + MemoryAccess *A = nullptr; + MemoryAccess *B = nullptr; + // Memory accesses that represent reading from and writing into the tensor, + // which contains the result of the tensor contraction. + MemoryAccess *ReadFromC = nullptr; + MemoryAccess *WriteToC = nullptr; + // Input dimensions of the schedule space, which represent free + // indices of tensors. + SmallDenseSet I; + SmallDenseSet J; + // Input dimensions of the schedule space, which represent contracted + // indices of tensors. + SmallDenseSet P; + // Sizes of tensor dimensions for corresponding input dimensions of + // the schedule space. The size of the tensor dimension can be larger than + // the size of the corresponding input dimension of the schedule space. + // This does not correspond to a tensor contraction. However, such a pattern + // will be optimized by the transformation. + SmallVector DimensionSizes; + SmallVector ADimensions; + SmallVector BDimensions; + SmallVector CDimensions; + // Permutations of indices of I, J, and P, which describe operands of + // the tensor contraction and its result. + SmallVector OrderedI; + SmallVector OrderedJ; + SmallVector OrderedP; +}; + /// Create an isl::union_set, which describes the option of the form /// [isolate[] -> unroll[x]]. /// @@ -1024,14 +1075,579 @@ return false; } +/// Get the dimension size. +/// +/// Return the size of the dimension @p Pos, which is obtained from @p SAI. +/// Return -1 in the case of the first dimension of a multi-dimensional array, +/// since the ScopArrayInfo class does not carry size information. +/// +/// @param SAI The information about the array. +/// @param Pos The position of the dimension. +/// @return The size of the dimension. +static int getDimSize(const ScopArrayInfo *SAI, unsigned Pos) { + if (Pos == 0) + return -1; + const llvm::SCEV *SCEVDimSize = SAI->getDimensionSize(Pos); + assert(SCEVDimSize); + auto *ConstantDimSize = dyn_cast(SCEVDimSize); + assert(ConstantDimSize); + auto *IntDimSize = dyn_cast(ConstantDimSize->getValue()); + assert(IntDimSize); + return IntDimSize->getSExtValue(); +} + +/// Check whether the access relation has the specified form. +/// +/// Check that the access relation @p AccMap has the form T[I0, …, In], where +/// indexes I0, …, In are specified by @p Dimensions. +/// +/// @param Domain The domain of the access relation. +/// @param AccMap The access relation to be checked. +/// @param Dimensions The permutation of the subset of the input dimensions. +/// @return True if @p AccMap has the expected form and false, +/// otherwise. +static bool isCorrectAccessMap(isl::set Domain, isl::map AccMap, + ArrayRef Dimensions) { + isl::space Space = AccMap.get_space(); + if (unsignedFromIslSize(Space.dim(isl::dim::out)) != Dimensions.size()) + return false; + isl::map Universe = isl::map::universe(Space); + + // Create an access relation of the following form: + // [I0, …, Im] -> [Il, …, In], where indexes + // Il, …, In are specified by @p Dimensions. + isl::map PossibleTensor = isl::manage(Universe.copy()); + for (unsigned i = 0; i < Dimensions.size(); i++) { + const int InPos = Dimensions[i]; + if (InPos == -1) + return false; + PossibleTensor = + PossibleTensor.equate(isl::dim::in, InPos, isl::dim::out, i); + } + + AccMap = AccMap.intersect_domain(Domain); + PossibleTensor = PossibleTensor.intersect_domain(Domain); + + // If AccMap spans entire domain (Non-partial write), + // compute FirstPos and SecondPos. + // If AccMap != PossibleTensor here (the two maps have been gisted at + // this point), it means that the writes are not complete, or in other + // words, it is a Partial write and Partial writes must be rejected. + return AccMap.is_equal(PossibleTensor); +} + +/// Check whether the access represents the tensor contraction operand. +/// +/// Check that the access relation @p AccMap has the form T[i1, …, in]. +/// Obtained indexes i1, …, in, their sizes and their permutation are stored +/// into @p IndexSet, @p DimensionSizes, and @p Dimensions, respectively. +/// +/// @param Domain The domain of the access relation. +/// @param AccMap The access relation to be checked. +/// @param IndexSet The subset of the input dimensions. +/// @param DimensionSizes Sizes of the input dimensions of @p Dimensions. +/// @param Dimensions The permutation of the subset of the input dimensions. +/// @return True if @p AccMap has the expected form and false, +/// otherwise. +static bool isTCOperandAcc(isl::set Domain, isl::map AccMap, + SmallDenseSet &IndexSet, + SmallVectorImpl &DimensionSizes, + SmallVectorImpl &Dimensions) { + isl::id Id = AccMap.get_tuple_id(isl::dim::out); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); + assert(SAI && "AccMap should represent memory access"); + + // Fix values of output dimensions with respect to their positions. + isl::map CheckMap = isl::manage(AccMap.copy()); + unsigned OutDimNum = unsignedFromIslSize(CheckMap.dim(isl::dim::out)); + for (unsigned i = 0; i < OutDimNum; i++) + CheckMap = CheckMap.fix_si(isl::dim::out, i, i); + + // In the case of the tensor contraction, values of output dimensions are + // fixed and form a permutation of a subset of values of input dimensions. + // Try to obtain the permutation and sizes of corresponding input dimensions. + Dimensions.assign(OutDimNum, -1); + for (unsigned i : rangeIslSize(0, CheckMap.dim(isl::dim::in))) { + isl::val Val = isl::manage( + isl_map_plain_get_val_if_fixed(CheckMap.get(), isl_dim_in, i)); + if (!Val.is_int()) + continue; + int OutPos = -1; + llvm::APInt ValAPInt = APIntFromVal(Val); + if (ValAPInt.isSignedIntN(32)) + OutPos = ValAPInt.getSExtValue(); + assert((OutPos != -1) && (OutPos < static_cast(OutDimNum))); + if (IndexSet.count(i)) + return false; + IndexSet.insert(i); + Dimensions[OutPos] = i; + if (DimensionSizes[i] <= 0) + DimensionSizes[i] = getDimSize(SAI, OutPos); + } + + return isCorrectAccessMap(Domain, AccMap, Dimensions); +} + +/// Find the intersection of two sets. +/// +/// Find the intersection of the set @p A and the set @p B. +/// +/// @param A, B Sets to intersect. +/// @return The set intersection. +static SmallDenseSet intersect(const SmallDenseSet &A, + const SmallDenseSet &B) { + SmallDenseSet Intersection = A; + set_intersect(Intersection, B); + return Intersection; +} + +/// Determine the access that writes to the tensor, which contains +/// the result of the tensor contraction. +/// +/// @param Domain The domain of the statement. +/// @param Stmt The statement, which writes to memory. +/// @param TCI The information about the tensor contraction. +/// @param IandJIndexSet The set, which contains free indexes of tensors. +/// @return The determined MemoryAccess, or nullptr if there is no necessary +/// access within the SCoP. +static MemoryAccess *getWriteAccess(isl::set Domain, ScopStmt *Stmt, + TCInfoTy &TCI, + SmallDenseSet &IandJIndexSet) { + TCI.WriteToC = nullptr; + SmallVector Accesses = getAccessesInOrder(*Stmt); + for (MemoryAccess *MemA : reverse(Accesses)) { + if (!MemA->isLatestArrayKind()) + continue; + // The last memory access should be a write memory access. + if (!MemA->isWrite()) + return nullptr; + + isl::map AccMap = MemA->getLatestAccessRelation(); + if (!isTCOperandAcc(Domain, AccMap, IandJIndexSet, TCI.DimensionSizes, + TCI.CDimensions)) + return nullptr; + + return MemA; + } + return nullptr; +} + +/// Determine an access, which reads elements of an operand of the tensor +/// contraction +/// +/// @param MemAccessPtr The access, which reads elements of the tensor. +/// @param IndexSet The set, which contains indexes of the tensors. +/// @param IandJIndexSet The set, which contains free indexes of tensors. +/// @param Dimensions The permutation of the subset of the input dimensions. +/// @param TCI The information about the tensor contraction. +/// @return True if the memory access @p MemAccessPtr corresponds +/// to the tensor contraction. +static bool setReadAccess(MemoryAccess *MemAccessPtr, + const SmallDenseSet &IndexSet, + const SmallDenseSet &IandJIndexSet, + SmallVector Dimensions, TCInfoTy &TCI) { + if (!TCI.A) { + // Probably IndexSet is a union of I and P sets. + if (intersect(IndexSet, TCI.P).size() != TCI.P.size()) + return false; + + // Obtain the set I. + TCI.I = std::move(set_difference(IndexSet, TCI.P)); + if (intersect(IandJIndexSet, TCI.I).size() != TCI.I.size()) + return false; + + // Obtain the set J. + TCI.J = std::move(set_difference(IandJIndexSet, TCI.I)); + + // Set the first operand of the tensor contraction. + TCI.A = MemAccessPtr; + TCI.ADimensions = std::move(Dimensions); + return true; + } + + if (!TCI.B) { + // IndexSet should be a union of J and P sets. + if (!(intersect(IndexSet, TCI.J).size() == TCI.J.size()) && + (intersect(IndexSet, TCI.P).size() == TCI.P.size()) && + (IndexSet.size() == TCI.P.size() + TCI.J.size())) + return false; + + // Set the second operand of the tensor contraction. + TCI.B = MemAccessPtr; + TCI.BDimensions = std::move(Dimensions); + return true; + } + + return false; +} + +/// Check that all memory accesses of the statement, except from the last +/// one, are read memory accesses, which read elements of operands of the tensor +/// contraction and its result. +/// +/// @param Domain The domain of the statement. +/// @param Stmt The statement, which writes to memory. +/// @param TCI The information about the tensor contraction. +/// @param IandJIndexSet The set, which contains free indexes of tensors. +/// @return True if all read memory accesses of the statement @p Stmt correspond +/// to the tensor contraction. +static bool setReadAccesses(isl::set Domain, ScopStmt *Stmt, TCInfoTy &TCI, + SmallDenseSet &IandJIndexSet) { + TCI.A = nullptr; + TCI.B = nullptr; + TCI.ReadFromC = nullptr; + SmallVector Accesses = getAccessesInOrder(*Stmt); + for (auto *MemA = Accesses.begin(); *MemA != TCI.WriteToC; MemA++) { + MemoryAccess *MemAccessPtr = *MemA; + if (!MemAccessPtr->isLatestArrayKind()) + continue; + + // All memory accesses, except from the last one, should be read memory + // accesses. + if (MemAccessPtr->isWrite()) + return false; + + // There is only one memory access, which reads elements of the result of + // the tensor contraction. + isl::map AccMap = MemAccessPtr->getLatestAccessRelation(); + if (AccMap.is_equal(TCI.WriteToC->getLatestAccessRelation())) { + if (TCI.ReadFromC) + return false; + TCI.ReadFromC = MemAccessPtr; + continue; + } + + SmallVector Dimensions; + SmallDenseSet IndexSet; + if (!isTCOperandAcc(Domain, AccMap, IndexSet, TCI.DimensionSizes, + Dimensions)) + return false; + + if (!setReadAccess(MemAccessPtr, IndexSet, IandJIndexSet, Dimensions, TCI)) + return false; + } + + // Check that there are read memory accesses, which read elements of operands + // of the tensor contraction and its result. + return TCI.ReadFromC && TCI.A && TCI.B; +} + +/// Check accesses to operands of the tensor contraction. +/// +/// Check that accesses of the SCoP statement, which corresponds to +/// the partial schedule @p PartialSchedule, represent accesses +/// to the non-scalar operands of the tensor contraction. +/// +/// @param Domain The domain of the SCoP statement. +/// @param PartialSchedule The partial schedule of the SCoP statement. +/// @param TCI Parameters of the tensor contraction operands. +/// @return True if the corresponding SCoP statement +/// represents tensor contraction and false, +/// otherwise. +static bool containsOnlyTCAcc(isl::set Domain, isl::map PartialSchedule, + TCInfoTy &TCI) { + isl::id InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in); + ScopStmt *Stmt = static_cast(InputDimsId.get_user()); + + // In region statements, the order of memory accesses execution is not + // predictable at compile-time. + if ((Stmt->size() <= 1) || Stmt->isRegionStmt()) + return false; + + unsigned DimNum = unsignedFromIslSize(PartialSchedule.dim(isl::dim::in)); + TCI.DimensionSizes.resize(DimNum); + SmallDenseSet IandJIndexSet; + + TCI.WriteToC = getWriteAccess(Domain, Stmt, TCI, IandJIndexSet); + if (!TCI.WriteToC) + return false; + + if (intersect(IandJIndexSet, TCI.P).size() != 0) + return false; + + if (!setReadAccesses(Domain, Stmt, TCI, IandJIndexSet)) + return false; + + return true; +} + +/// Check that dependency corresponds to the tensor contraction. +/// +/// Check that the dependency has the form +/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) -> +/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP +/// statement. For this purpose, we analyze the set @p DepDelta, which +/// represents the differences between image elements and domain elements of +/// the corresponding map. +/// +/// @param DepDelta The set contains the differences between image elements +/// and corresponding domain elements of the map, which +/// represents the dependency. +/// @param Pos The position of the index ki. +/// @param BoundDeltas In the case of indexes of ki, the difference between +/// image elements and corresponding domain elements +/// corresponds to the difference between lexicographic +/// minimum and lexicographic maximum of the corresponding +/// dimension of the domain of the statement. +/// @param IndexSet Obtained indexes ki, which describe the dependency. +/// @return True if dependencies correspond to the tensor contraction +/// and false, otherwise. +static bool isTcDep(isl::set DepDelta, unsigned Pos, isl::set BoundDeltas, + SmallDenseSet *IndexSet) { + // Check the difference between the image element and the domain element + // in the case of the index ki. + if (!DepDelta.plain_get_val_if_fixed(isl::dim::set, Pos).is_one()) + return false; + + // Image elements and corresponding domain elements should be equal in + // the case of positions, which are lower than the specified position. + for (unsigned j = 0; j < Pos; j++) + if (!DepDelta.plain_get_val_if_fixed(isl::dim::set, j).is_zero()) + return false; + + // Compute a set, which is used to analyze how values of + // the domain are related to the map that describes the dependency. + isl::map DepDeltaNegToBoundDeltas = isl::map::from_domain_and_range( + isl::manage(isl_set_neg(DepDelta.copy())), BoundDeltas); + isl::set Complement = DepDeltaNegToBoundDeltas.deltas(); + + const unsigned DimNum = unsignedFromIslSize(DepDelta.dim(isl::dim::set)); + for (unsigned j = Pos + 1; j < DimNum; j++) { + if (!IndexSet->count(j)) { + // Check the difference between the image element and the domain element + // in the case of indexes, which do not describe the dependency. + if (DepDelta.plain_get_val_if_fixed(isl::dim::set, j).is_zero()) + continue; + return false; + } + + // In the case of other indexes, which describe the dependency, + // the difference between the image element and the domain element + // should be equal to the difference between lexicographic minimum and + // lexicographic maximum of the domain of the statement. + if (!Complement.plain_get_val_if_fixed(isl::dim::set, j).is_zero()) + return false; + } + + return true; +} + +/// Check that dependencies correspond to the tensor contraction. +/// +/// Check that there are only true dependencies of the form +/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) -> +/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP +/// statement represented by @p Schedule. Such dependencies are produced by +/// the tensor contraction. Obtained indexes ki are stored into @p IndexSet. +/// +/// @param Schedule The schedule of the SCoP statement. +/// @param D The SCoP dependencies. +/// @param Domain The domain of the statement. +/// @param IndexSet Obtained indexes ki, which describe the dependencies. +/// @return True if dependencies correspond to the tensor contraction +/// and false, otherwise. +static bool containsOnlyTcDeps(isl::map Schedule, const Dependences *D, + SmallDenseSet *IndexSet, isl::set Domain) { + isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW); + isl::union_map Red = D->getDependences(Dependences::TYPE_RED); + if (!Red.is_null()) + Dep = Dep.unite(Red); + + isl::space DomainSpace = Schedule.get_space().domain(); + isl::space Space = DomainSpace.map_from_domain_and_range(DomainSpace); + isl::set DepDeltas = Dep.extract_map(Space).deltas(); + unsigned DeltasDimNum = unsignedFromIslSize(DepDeltas.dim(isl::dim::set)); + isl::pw_multi_aff LowerBound = Domain.lexmin_pw_multi_aff(); + isl::pw_multi_aff BoundSub = Domain.lexmax_pw_multi_aff().sub(LowerBound); + auto BoundDeltas = isl::manage(isl_set_from_pw_multi_aff(BoundSub.release())); + + for (int i = static_cast(DeltasDimNum) - 1; i >= 0; i--) { + // In the case of the tensor contraction, the difference between image + // elements and domain elements lies on a hyperplane where a dimension + // has the fixed value one. + isl::set Intersection = DepDeltas.fix_si(isl::dim::set, i, 1); + if (Intersection.is_empty()) + continue; + + if (!isTcDep(Intersection, i, BoundDeltas, IndexSet)) + return false; + + IndexSet->insert(i); + DepDeltas = DepDeltas.subtract(Intersection); + } + + // In the case of the tensor contraction, all dependencies should have + // the previously described form. + if ((DeltasDimNum == 0) || !DepDeltas.is_empty()) + return false; + + return true; +} + +/// Order the dimensions of operands of the tensor contraction. +/// +/// To improve the spatial locality of the data, we use a heuristic, which is +/// introduced in [1]. The heuristic logically reorders the tensor dimensions +/// and consists of the following steps: +/// +/// 1. Sort the dimensions in I and J by increasing stride in the tensor C. +/// 2. If tensor A contains the last dimension of C, swap A with B and I with J. +/// 3. Sort the dimensions in P by increasing stride in the tensor A. +/// +/// [1] - Devin Matthews. High-Performance Tensor Contraction without BLAS. +/// SIAM Journal on Scientific Computing, 40(1). 2016. DOI: 10.1137/16M108968X. +/// +/// @param TCI The information about tensors. +static void orderDimensions(TCInfoTy &TCI) { + // Sort the dimensions in I and J by increasing stride in the tensor C. + for (const int &Dimension : TCI.CDimensions) { + assert(!(TCI.I.count(Dimension) && TCI.J.count(Dimension))); + + if (TCI.I.count(Dimension)) { + TCI.OrderedI.push_back(Dimension); + continue; + } + + if (TCI.J.count(Dimension)) + TCI.OrderedJ.push_back(Dimension); + } + + // If tensor A contains the last dimension of C, swap A with B and I with J. + if (TCI.I.count(TCI.CDimensions.back())) { + MemoryAccess *Access = TCI.A; + TCI.A = TCI.B; + TCI.B = Access; + TCI.I.swap(TCI.J); + TCI.ADimensions.swap(TCI.BDimensions); + TCI.OrderedI.swap(TCI.OrderedJ); + } + + // Sort the dimensions in P by increasing stride in the tensor A. + for (const int &Dimension : TCI.ADimensions) + if (TCI.P.count(Dimension)) + TCI.OrderedP.push_back(Dimension); +} + +/// Check if the SCoP statement could probably be optimized with analytical +/// modeling. +/// +/// containsTC tries to determine whether the following conditions +/// are true: +/// +/// 1. The last memory access modeling an array, MA1, represents writing to +/// memory and has the form S(..., I, ..., J, ...) -> M(shuffle(I, J)), +/// where S is the SCoP statement under consideration and shuffle(I, J) +/// is a permutation of indexes of sets I and J. +/// 2. There are only true dependencies of the form +/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) -> +/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP +/// statement represented by @p Schedule and ki are indexes of the set P. +/// 3. SCoP contains only three access relations, MA2, MA3, and MA4 that +/// represent reading from memory and have the form +/// S(..., I, ..., P, ...) -> M(shuffle(I, P)), +/// S(..., P, ..., J, ...) -> M(shuffle(J, P)), +/// S(...) -> M(shuffle(I, J)), respectively. +/// +/// @param PartialSchedule The PartialSchedule that contains a SCoP statement +/// to check. +/// @param D The SCoP dependencies. +/// @param TCI Parameters of the tensor contraction operands. +/// @param Domain The domain of the statement. +/// @return True if dependencies and memory accesses correspond to the tensor +/// contraction and false, otherwise. +static bool containsTCInfoTy(isl::map PartialSchedule, const Dependences *D, + TCInfoTy &TCI, isl::set Domain) { + if (!containsOnlyTcDeps(PartialSchedule, D, &TCI.P, Domain)) + return false; + + // TODO: handle cases of scalar multiplication if needed. + if (TCI.P.size() == 0) + return false; + + if (!containsOnlyTCAcc(Domain, PartialSchedule, TCI)) + return false; + + // TODO: handle cases of GEMV if needed. + if ((TCI.I.size() == 0) || (TCI.J.size() == 0)) + return false; + + orderDimensions(TCI); + + return true; +} + +/// Check if this node contains a partial schedule that could +/// probably be optimized with analytical modeling. +/// +/// isTCPattern tries to determine whether the following conditions +/// are true: +/// 1. the partial schedule contains only one statement. +/// 2. there are exactly three input dimensions. +/// 3. there are four memory accesses that represent accesses to tensor +/// contraction operand. +/// 4. all memory accesses of the statement except from the last one, are +/// read memory accesses and the last one is a write memory access. +/// 5. all subscripts of the last memory access of the statement do not +/// contain the variable used in the inner loop. +/// +/// If this is the case, we could use an approach that is similar to +/// the one used to get close-to-peak performance of matrix multiplications. +/// +/// @param Node The node to check. +/// @param D The SCoP dependencies. +/// @param TCI Parameters of the tensor contraction operands. +static bool isTCPattern(isl::schedule_node Node, const Dependences *D, + TCInfoTy &TCI) { + Node = Node.child(0); + auto LeafType = isl_schedule_node_get_type(Node.get()); + isl::union_map PartialSchedule = Node.get_prefix_schedule_union_map(); + isl::union_set Domain = Node.domain(); + Node = Node.parent(); + + // The innermost band node is expected. + if (LeafType != isl_schedule_node_leaf || + isl_union_map_n_map(PartialSchedule.get()) != 1) + return false; + + // The partial schedule should contain only one statement. + if (isl_union_set_n_set(Domain.get()) != 1) + return false; + + auto HasFilterNode = false; + auto NodeType = isl_schedule_node_get_type(Node.get()); + + // Check that all predecessors of the node contain all band nodes + // for the statement, which represents the tensor contraction. + // Subsequently, such band nodes will be replaced by a single band node. + while (NodeType != isl_schedule_node_domain) { + if (HasFilterNode && (NodeType == isl_schedule_node_band)) + return false; + + if (NodeType == isl_schedule_node_filter) + HasFilterNode = true; + + Node = Node.parent(); + NodeType = isl_schedule_node_get_type(Node.get()); + } + + isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule); + if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain))) + return true; + + return false; +} + } // namespace isl::schedule_node polly::tryOptimizeMatMulPattern(isl::schedule_node Node, const llvm::TargetTransformInfo *TTI, const Dependences *D) { + TCInfoTy TCI; + if (PMBasedTCOpts && isTCPattern(Node, D, TCI)) + LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n"); MatMulInfoTy MMI; - if (isMatrMultPattern(Node, D, MMI)) { + if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) { LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n"); return optimizeMatMulPattern(Node, TTI, MMI); } Index: polly/lib/Transform/ScheduleOptimizer.cpp =================================================================== --- polly/lib/Transform/ScheduleOptimizer.cpp +++ polly/lib/Transform/ScheduleOptimizer.cpp @@ -302,6 +302,16 @@ /// @param Node The node to check. static bool isTileableBandNode(isl::schedule_node Node); + /// Check if this node is a band node we want to transform using pattern + /// matching. + /// + /// We look for innermost band nodes where individual dimensions are marked as + /// permutable. There is no restriction on the number of individual + /// dimensions. + /// + /// @param Node The node to check. + static bool isPMOptimizableBandNode(isl::schedule_node Node); + /// Pre-vectorizes one scheduling dimension of a schedule band. /// /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and @@ -446,7 +456,10 @@ return true; } -bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { +/// Check if this node is a permutable band node, which has only one child. +/// +/// @param Node The node to check. +static bool isPermutableOneTimeParentBandNode(isl::schedule_node Node) { if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band) return false; @@ -456,6 +469,13 @@ if (!isl_schedule_node_band_get_permutable(Node.get())) return false; + return true; +} + +bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { + if (!isPermutableOneTimeParentBandNode(Node)) + return false; + auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); if (unsignedFromIslSize(Space.dim(isl::dim::set)) <= 1u) @@ -464,6 +484,13 @@ return isSimpleInnermostBand(Node); } +bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) { + if (!isPermutableOneTimeParentBandNode(Node)) + return false; + + return isSimpleInnermostBand(Node); +} + __isl_give isl::schedule_node ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { if (FirstLevelTiling) { @@ -509,10 +536,8 @@ assert(OAI && "Expecting optimization options"); isl::schedule_node Node = isl::manage(NodeArg); - if (!isTileableBandNode(Node)) - return Node.release(); - if (OAI->PatternOpts) { + if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) { isl::schedule_node PatternOptimizedSchedule = tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D); if (!PatternOptimizedSchedule.is_null()) { @@ -521,6 +546,9 @@ } } + if (!isTileableBandNode(Node)) + return Node.release(); + if (OAI->Postopts) Node = applyTileBandOpt(Node); Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadPolly \ ; RUN: -polly-pattern-matching-based-opts=true \ ; RUN: -polly-optree -polly-delicm -polly-simplify \ -; RUN: -polly-opt-isl -debug < %s 2>&1 \ +; RUN: -polly-opt-isl -polly-tc-opt=true -debug < %s 2>&1 \ ; RUN: | FileCheck %s ; REQUIRES: asserts @@ -10,6 +10,36 @@ ; is not through the original memory access, but trough a PHI node that was ; delicmed. This test covers the polybench 2mm and 3mm cases. ; +; This test case generates the following schedule, which contans filters: +; +; domain: "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1599 and +; 0 <= i1 <= 1799 and +; 0 <= i2 <= 2199; +; Stmt_for_body3[i0, i1] : 0 <= i0 <= 1599 and +; 0 <= i1 <= 1799; +; Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and +; 0 <= i1 <= 1799 }" +; child: +; sequence: +; - filter: "{ Stmt_for_body3[i0, i1] }" +; child: +; schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] }, { Stmt_for_body3[i0, i1] -> [(i1)] }]" +; permutable: 1 +; coincident: [ 1, 1 ] +; - filter: "{ Stmt_for_body3_last[i0, i1] }" +; child: +; schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] }, { Stmt_for_body3_last[i0, i1] -> [(i1)] }]" +; permutable: 1 +; coincident: [ 1, 1 ] +; - filter: "{ Stmt_for_body8[i0, i1, i2] }" +; child: +; schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] }, +; { Stmt_for_body8[i0, i1, i2] -> [(i1)] }, +; { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]" +; permutable: 1 +; coincident: [ 1, 1, 0 ] +; +; CHECK: The tensor contraction pattern was detected ; CHECK: The matrix multiplication pattern was detected ; Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll @@ -0,0 +1,108 @@ +; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \ +; RUN: -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; Check that the pattern matching detects the tensor contraction pattern +; after a full run of -polly-delicm. This test case generates the following +; schedule, which contans two band nodes. Without DeLICM two statement are +; generated. +; +; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and +; 0 <= i2 <= 31 and 0 <= i3 <= 31 and +; 0 <= i4 <= 31 and 0 <= i5 <= 31 }" +; child: +; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i0)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i1)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i2)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i4)] }, +; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i3)] }]" +; permutable: 1 +; coincident: [ 1, 1, 1, 1, 0 ] +; child: +; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i5)] }]" +; permutable: 1 +; child: +; leaf +; +; for (i = 0; i < 32; i++) +; for (j = 0; j < 32; j++) +; for (k = 0; k < 32; ++k) +; for (l = 0; l < 32; ++l) +; for (w = 0; w < 32; ++w) +; for (q = 0; q < 32; ++q) +; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal fastcc void @kernel_tc([32 x [32 x [32 x double]]]* nocapture %C, [32 x [32 x [32 x double]]]* nocapture readonly %A, [32 x [32 x [32 x double]]]* nocapture readonly %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc50, %entry + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc50 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader + %indvars.iv16 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next17, %for.inc47 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader + %indvars.iv13 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next14, %for.inc44 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader + %indvars.iv10 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next11, %for.inc41 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader + %indvars.iv7 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next8, %for.inc38 ] + %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv19, i64 %indvars.iv16, i64 %indvars.iv13, i64 %indvars.iv7 + %.pre = load double, double* %arrayidx37, align 8 + br label %for.body15 + +for.body15: ; preds = %for.body15, %for.cond13.preheader + %i = phi double [ %.pre, %for.cond13.preheader ], [ %add, %for.body15 ] + %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ] + %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv19, i64 %indvars.iv10, i64 %indvars.iv16, i64 %indvars.iv + %i1 = load double, double* %arrayidx21, align 8 + %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv7, i64 %indvars.iv10, i64 %indvars.iv13 + %i2 = load double, double* %arrayidx29, align 8 + %mul = fmul fast double %i2, %i1 + %add = fadd fast double %i, %mul + store double %add, double* %arrayidx37, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond.not, label %for.inc38, label %for.body15 + +for.inc38: ; preds = %for.body15 + %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 + %exitcond9.not = icmp eq i64 %indvars.iv.next8, 32 + br i1 %exitcond9.not, label %for.inc41, label %for.cond13.preheader + +for.inc41: ; preds = %for.inc38 + %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 + %exitcond12.not = icmp eq i64 %indvars.iv.next11, 32 + br i1 %exitcond12.not, label %for.inc44, label %for.cond10.preheader + +for.inc44: ; preds = %for.inc41 + %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 + %exitcond15.not = icmp eq i64 %indvars.iv.next14, 32 + br i1 %exitcond15.not, label %for.inc47, label %for.cond7.preheader + +for.inc47: ; preds = %for.inc44 + %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 + %exitcond18.not = icmp eq i64 %indvars.iv.next17, 32 + br i1 %exitcond18.not, label %for.inc50, label %for.cond4.preheader + +for.inc50: ; preds = %for.inc47 + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond21.not = icmp eq i64 %indvars.iv.next20, 32 + br i1 %exitcond21.not, label %for.end52, label %for.cond1.preheader + +for.end52: ; preds = %for.inc50 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll @@ -1,5 +1,5 @@ ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \ -; RUN: -debug < %s 2>&1| FileCheck %s +; RUN: -debug -polly-tc-opt < %s 2>&1| FileCheck %s ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1| FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-ast -analyze < %s | FileCheck %s --check-prefix=PARALLEL-AST ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1| FileCheck %s --check-prefix=STATS -match-full-lines @@ -15,6 +15,7 @@ ; } ; ; CHECK-NOT: The matrix multiplication pattern was detected +; CHECK-NOT: The tensor contraction pattern was detected ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected ; PARALLEL-AST: #pragma known-parallel ; PARALLEL-AST: #pragma known-parallel Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll @@ -8,7 +8,8 @@ ; RUN: -polly-target-1st-cache-level-size=32768 \ ; RUN: -polly-target-vector-register-bitwidth=256 \ ; RUN: -polly-target-2nd-cache-level-size=262144 \ -; RUN: -polly-opt-isl -debug < %s 2>&1 \ +; RUN: -polly-opt-isl -debug \ +; RUN: -polly-tc-opt=true < %s 2>&1 \ ; RUN: | FileCheck %s ; REQUIRES: asserts ; @@ -16,6 +17,7 @@ ; in case scalar memory accesses were replaced by accesses to newly created ; arrays. ; +; CHECK: The tensor contraction pattern was detected ; CHECK: The matrix multiplication pattern was detected ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll @@ -1,5 +1,6 @@ ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ -; RUN: -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s +; RUN: -debug-only=polly-opt-isl -disable-output \ +; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; ; for (i = 0; i < _PB_NI; i++) @@ -14,6 +15,7 @@ ; } ; ; CHECK-NOT: The matrix multiplication pattern was detected +; CHECK-NOT: The tensor contraction pattern was detected target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 1024; i++) +; for (j = 0; j < 1024; j++) +; for (l = 0; l < 64; ++l) +; for (w = 0; w < 64; ++w) +; C[i][j] += A[i][l][w] * B[w][j][l]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc30, %entry + %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader + %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader + %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ] + %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv + %i = load double, double* %arrayidx13, align 8 + %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37 + %i1 = load double, double* %arrayidx19, align 8 + %mul = fmul fast double %i1, %i + %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40 + %i2 = load double, double* %arrayidx23, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx23, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 64 + br i1 %exitcond, label %for.body9, label %for.inc24 + +for.inc24: ; preds = %for.body9 + %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1 + %exitcond39 = icmp ne i64 %indvars.iv.next38, 64 + br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27 + +for.inc27: ; preds = %for.inc24 + %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1 + %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024 + br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30 + +for.inc30: ; preds = %for.inc27 + %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1 + %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024 + br i1 %exitcond45, label %for.cond1.preheader, label %for.end32 + +for.end32: ; preds = %for.inc30 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 32; i++) +; for (j = 0; j < 1024; j++) +; for (k = 0; k < 32; ++k) +; for (l = 0; l < 1024; ++l) +; C[i][j][k] += A[i][k][l] * B[l][j]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1024 x [32 x double]]* %C, [32 x [1024 x double]]* %A, [1024 x double]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc30, %entry + %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader + %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader + %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ] + %arrayidx13 = getelementptr inbounds [32 x [1024 x double]], [32 x [1024 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv + %i = load double, double* %arrayidx13, align 8 + %arrayidx17 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv40 + %i1 = load double, double* %arrayidx17, align 8 + %mul = fmul fast double %i1, %i + %arrayidx23 = getelementptr inbounds [1024 x [32 x double]], [1024 x [32 x double]]* %C, i64 %indvars.iv43, i64 %indvars.iv40, i64 %indvars.iv37 + %i2 = load double, double* %arrayidx23, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx23, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.body9, label %for.inc24 + +for.inc24: ; preds = %for.body9 + %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1 + %exitcond39 = icmp ne i64 %indvars.iv.next38, 32 + br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27 + +for.inc27: ; preds = %for.inc24 + %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1 + %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024 + br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30 + +for.inc30: ; preds = %for.inc27 + %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1 + %exitcond45 = icmp ne i64 %indvars.iv.next44, 32 + br i1 %exitcond45, label %for.cond1.preheader, label %for.end32 + +for.end32: ; preds = %for.inc30 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll @@ -0,0 +1,84 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 32; i++) +; for (j = 0; j < 32; j++) +; for (k = 0; k < 32; ++k) +; for (l = 0; l < 32; ++l) +; for (w = 0; w < 32; ++w) +; for (q = 0; q < 32; ++q) +; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [32 x [32 x [32 x double]]]* %C, [32 x [32 x [32 x double]]]* %A, [32 x [32 x [32 x double]]]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc50, %entry + %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader + %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader + %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader + %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader + %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ] + br label %for.body15 + +for.body15: ; preds = %for.body15, %for.cond13.preheader + %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ] + %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv71, i64 %indvars.iv62, i64 %indvars.iv68, i64 %indvars.iv + %i = load double, double* %arrayidx21, align 8 + %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv59, i64 %indvars.iv62, i64 %indvars.iv65 + %i1 = load double, double* %arrayidx29, align 8 + %mul = fmul fast double %i1, %i + %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59 + %i2 = load double, double* %arrayidx37, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx37, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.body15, label %for.inc38 + +for.inc38: ; preds = %for.body15 + %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1 + %exitcond61 = icmp ne i64 %indvars.iv.next60, 32 + br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41 + +for.inc41: ; preds = %for.inc38 + %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1 + %exitcond64 = icmp ne i64 %indvars.iv.next63, 32 + br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44 + +for.inc44: ; preds = %for.inc41 + %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1 + %exitcond67 = icmp ne i64 %indvars.iv.next66, 32 + br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47 + +for.inc47: ; preds = %for.inc44 + %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1 + %exitcond70 = icmp ne i64 %indvars.iv.next69, 32 + br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50 + +for.inc50: ; preds = %for.inc47 + %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 + %exitcond73 = icmp ne i64 %indvars.iv.next72, 32 + br i1 %exitcond73, label %for.cond1.preheader, label %for.end52 + +for.end52: ; preds = %for.inc50 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll @@ -0,0 +1,84 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 8; i++) +; for (j = 0; j < 8; j++) +; for (k = 0; k < 4; ++k) +; for (l = 0; l < 1024; ++l) +; for (w = 0; w < 1024; ++w) +; for (q = 0; q < 4; ++q) +; C[i][j][k][w][q] += A[q][k][j][l][i] * B[l][w]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc([8 x [4 x [1024 x [4 x double]]]]* %C, [4 x [8 x [1024 x [8 x double]]]]* %A, [1024 x double]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc50, %entry + %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader + %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader + %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader + %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader + %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ] + br label %for.body15 + +for.body15: ; preds = %for.body15, %for.cond13.preheader + %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ] + %arrayidx23 = getelementptr inbounds [4 x [8 x [1024 x [8 x double]]]], [4 x [8 x [1024 x [8 x double]]]]* %A, i64 %indvars.iv, i64 %indvars.iv65, i64 %indvars.iv68, i64 %indvars.iv62, i64 %indvars.iv71 + %i = load double, double* %arrayidx23, align 8 + %arrayidx27 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv62, i64 %indvars.iv59 + %i1 = load double, double* %arrayidx27, align 8 + %mul = fmul fast double %i1, %i + %arrayidx37 = getelementptr inbounds [8 x [4 x [1024 x [4 x double]]]], [8 x [4 x [1024 x [4 x double]]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59, i64 %indvars.iv + %i2 = load double, double* %arrayidx37, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx37, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 4 + br i1 %exitcond, label %for.body15, label %for.inc38 + +for.inc38: ; preds = %for.body15 + %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1 + %exitcond61 = icmp ne i64 %indvars.iv.next60, 1024 + br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41 + +for.inc41: ; preds = %for.inc38 + %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1 + %exitcond64 = icmp ne i64 %indvars.iv.next63, 1024 + br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44 + +for.inc44: ; preds = %for.inc41 + %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1 + %exitcond67 = icmp ne i64 %indvars.iv.next66, 4 + br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47 + +for.inc47: ; preds = %for.inc44 + %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1 + %exitcond70 = icmp ne i64 %indvars.iv.next69, 8 + br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50 + +for.inc50: ; preds = %for.inc47 + %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 + %exitcond73 = icmp ne i64 %indvars.iv.next72, 8 + br i1 %exitcond73, label %for.cond1.preheader, label %for.end52 + +for.end52: ; preds = %for.inc50 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; ; /* C := alpha*A*B + beta*C */ @@ -15,6 +16,7 @@ ; after the interchanging of loops. ; ; CHECK-NOT: The matrix multiplication pattern was detected +; CHECK-NOT: The tensor contraction pattern was detected ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll =================================================================== --- /dev/null +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll @@ -0,0 +1,94 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-tc-opt=true -debug < %s 2>&1| FileCheck %s +; REQUIRES: asserts +; +; for (i = 0; i < 16; i++) +; for (j = 0; j < 16; j++) +; for (k = 0; k < 8; ++k) +; for (l = 0; l < 1024; ++l) +; for (w = 0; w < 8; ++w) +; for (q = 0; q < 8; ++q) +; for (x = 0; x < 8; ++x) +; C[i][j][k][w][q][x] += A[l][x][j][k] * B[w][q][l][i]; +; +; CHECK: The tensor contraction pattern was detected +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define internal void @kernel_tc([16 x [8 x [8 x [8 x [8 x double]]]]]* %C, [8 x [16 x [8 x double]]]* %A, [8 x [1024 x [16 x double]]]* %B) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc60, %entry + %indvars.iv85 = phi i64 [ 0, %entry ], [ %indvars.iv.next86, %for.inc60 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc57, %for.cond1.preheader + %indvars.iv82 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next83, %for.inc57 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc54, %for.cond4.preheader + %indvars.iv79 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next80, %for.inc54 ] + br label %for.cond10.preheader + +for.cond10.preheader: ; preds = %for.inc51, %for.cond7.preheader + %indvars.iv76 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next77, %for.inc51 ] + br label %for.cond13.preheader + +for.cond13.preheader: ; preds = %for.inc48, %for.cond10.preheader + %indvars.iv73 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next74, %for.inc48 ] + br label %for.cond16.preheader + +for.cond16.preheader: ; preds = %for.inc45, %for.cond13.preheader + %indvars.iv70 = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next71, %for.inc45 ] + br label %for.body18 + +for.body18: ; preds = %for.body18, %for.cond16.preheader + %indvars.iv = phi i64 [ 0, %for.cond16.preheader ], [ %indvars.iv.next, %for.body18 ] + %arrayidx24 = getelementptr inbounds [8 x [16 x [8 x double]]], [8 x [16 x [8 x double]]]* %A, i64 %indvars.iv76, i64 %indvars.iv, i64 %indvars.iv82, i64 %indvars.iv79 + %i = load double, double* %arrayidx24, align 8 + %arrayidx32 = getelementptr inbounds [8 x [1024 x [16 x double]]], [8 x [1024 x [16 x double]]]* %B, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv76, i64 %indvars.iv85 + %i1 = load double, double* %arrayidx32, align 8 + %mul = fmul fast double %i1, %i + %arrayidx44 = getelementptr inbounds [16 x [8 x [8 x [8 x [8 x double]]]]], [16 x [8 x [8 x [8 x [8 x double]]]]]* %C, i64 %indvars.iv85, i64 %indvars.iv82, i64 %indvars.iv79, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv + %i2 = load double, double* %arrayidx44, align 8 + %add = fadd fast double %i2, %mul + store double %add, double* %arrayidx44, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 8 + br i1 %exitcond, label %for.body18, label %for.inc45 + +for.inc45: ; preds = %for.body18 + %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 1 + %exitcond72 = icmp ne i64 %indvars.iv.next71, 8 + br i1 %exitcond72, label %for.cond16.preheader, label %for.inc48 + +for.inc48: ; preds = %for.inc45 + %indvars.iv.next74 = add nuw nsw i64 %indvars.iv73, 1 + %exitcond75 = icmp ne i64 %indvars.iv.next74, 8 + br i1 %exitcond75, label %for.cond13.preheader, label %for.inc51 + +for.inc51: ; preds = %for.inc48 + %indvars.iv.next77 = add nuw nsw i64 %indvars.iv76, 1 + %exitcond78 = icmp ne i64 %indvars.iv.next77, 1024 + br i1 %exitcond78, label %for.cond10.preheader, label %for.inc54 + +for.inc54: ; preds = %for.inc51 + %indvars.iv.next80 = add nuw nsw i64 %indvars.iv79, 1 + %exitcond81 = icmp ne i64 %indvars.iv.next80, 8 + br i1 %exitcond81, label %for.cond7.preheader, label %for.inc57 + +for.inc57: ; preds = %for.inc54 + %indvars.iv.next83 = add nuw nsw i64 %indvars.iv82, 1 + %exitcond84 = icmp ne i64 %indvars.iv.next83, 16 + br i1 %exitcond84, label %for.cond4.preheader, label %for.inc60 + +for.inc60: ; preds = %for.inc57 + %indvars.iv.next86 = add nuw nsw i64 %indvars.iv85, 1 + %exitcond87 = icmp ne i64 %indvars.iv.next86, 16 + br i1 %exitcond87, label %for.cond1.preheader, label %for.end62 + +for.end62: ; preds = %for.inc60 + ret void +} Index: polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll =================================================================== --- polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll +++ polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll @@ -1,12 +1,13 @@ ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ -; RUN: -debug < %s 2>&1| FileCheck %s +; RUN: -debug -polly-tc-opt=true < %s 2>&1| FileCheck %s ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ ; RUN: -polly-target-throughput-vector-fma=1 \ ; RUN: -polly-target-latency-vector-fma=8 \ ; RUN: -polly-target-1st-cache-level-size=32768 \ ; RUN: -polly-target-vector-register-bitwidth=256 \ ; RUN: -polly-target-2nd-cache-level-size=262144 -polly-ast \ -; RUN: -analyze < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS +; RUN: -analyze -polly-tc-opt=true < %s | \ +; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS ; REQUIRES: asserts ; ; C := A * B + C @@ -20,6 +21,7 @@ ; for (j = 0; j < _PB_NJ; j++) ; C[i][j] += A[i][k] * B[k][j]; ; +; CHECK: The tensor contraction pattern was detected ; CHECK: The matrix multiplication pattern was detected ; ; PATTERN-MATCHING-OPTS: // 1st level tiling - Tiles