Index: include/polly/ScheduleOptimizer.h =================================================================== --- include/polly/ScheduleOptimizer.h +++ include/polly/ScheduleOptimizer.h @@ -10,7 +10,9 @@ #ifndef POLLY_SCHEDULEOPTIMIZER_H #define POLLY_SCHEDULEOPTIMIZER_H +#include "polly/ScopInfo.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "isl/isl-noexceptions.h" namespace llvm { @@ -20,6 +22,73 @@ struct isl_schedule_node; +/// Parameters for the classification algorithm. +class Reference { +public: + // Access type (read or write). + unsigned AccessType; + // Id access (array's name) + std::string ID; + // Integer domain for an array reference with respect to + // the outermost loop nest. + isl::set Domain; + // Element accessed for each outer loop iteration. + llvm::SmallVector ElementAccessed; + // Step + llvm::SmallVector Step; + // Relative indexes order. + llvm::SmallVector LoopOrder; + + // classificaton + enum class ReferenceKind { + SingleElement, + Chunk, + Neighbourhood, + Full, + Shared, + } Type; + + // Construct a Reference object. + Reference(unsigned AccessType, std::string ID, isl::set Domain, + llvm::SmallVector ElementAccessed, + llvm::SmallVector Step, + llvm::SmallVector LoopOrder); + ~Reference(); +}; + +/// Each Skeleton conatins the access pattern of a given Stmt. +/// The end goal of my GSoC project would be to classify the +/// Stmts based on the access patterns they performed. Once a +/// given Stmt has been classified can we understand a set +/// of transformation that we know are beneficial for the given +/// Stmt? +class Skeleton { +public: + llvm::SmallVector References; + std::string StmtBaseName; + + // Construct a Skeleton object. + Skeleton(llvm::SmallVector References, + std::string StmtBaseName); + + ~Skeleton(); + + // get the name of the Stmt the Skeleton + // belongs to. + const char *getStmtBaseName() const; + + // get the references. + const llvm::SmallVector &getReferences() const; + + // print the Skeleton. + void print(llvm::raw_ostream &OS) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + // Dump a representation. + void dump() const; +#endif +}; + /// Parameters of the micro kernel. /// /// Parameters, which determine sizes of rank-1 (i.e., outer product) update @@ -52,6 +121,7 @@ struct OptimizerAdditionalInfoTy { const llvm::TargetTransformInfo *TTI; const Dependences *D; + const llvm::SmallVector &Skeletons; }; /// Parameters of the matrix multiplication operands. Index: include/polly/ScopInfo.h =================================================================== --- include/polly/ScopInfo.h +++ include/polly/ScopInfo.h @@ -423,6 +423,9 @@ /// @returns True, if the arrays are compatible, False otherwise. bool isCompatibleWith(const ScopArrayInfo *Array) const; + /// return the Scop for this SAI. + Scop *getScop() const { return &S; } + private: void addDerivedSAI(ScopArrayInfo *DerivedSAI) { DerivedSAIs.insert(DerivedSAI); Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -1325,6 +1325,58 @@ return false; } +static bool +containsStreamingAccesses(const SmallVector &References) { + + // only Chunk or SingleElement can have streaming behaviour. + for (auto const &it : References) + if (it.Type != Reference::ReferenceKind::Chunk) + return false; + for (auto const &it : References) + if (it.Type != Reference::ReferenceKind::SingleElement) + return false; + + assert(References.size() != 1 && "Single Reference"); + + // check if the loop indexes have the same order. + for (std::size_t i = 1, e = References.size(); i != e; ++i) { + if (!std::equal(References[0].LoopOrder.begin(), + References[0].LoopOrder.end(), + References[i].LoopOrder.begin())) + return false; + } + return true; +} + +/// Check if the node contains a Stmt with streaming behavior. +/// +/// @param Node. Node to be examined. +/// @param SK. The Skeleton array containing all the references for all the +/// Stmts. +/// 1. The skeleton should look like: CHUNK -> CHUNK or ELEMENT -> ELEMENT. +/// 2. The partial index order should be the same. +/// 3. TODO:It should have a stride in the innermost dimension. +/// 4. TODO: Dependencies? + +static bool isStreaming(isl::schedule_node Node, + const SmallVector &SK) { + isl::union_map PartialSchedule = isl::manage( + isl_schedule_node_band_get_partial_schedule_union_map(Node.get())); + if (PartialSchedule.n_map() != 1) + return false; + isl::map SinglePartialSchedule = isl::map::from_union_map(PartialSchedule); + isl::id InputDimsId = SinglePartialSchedule.get_tuple_id(isl::dim::in); + ScopStmt *Stmt = static_cast(InputDimsId.get_user()); + bool Streaming = false; + for (auto const &it : SK) { + if (strcmp(it.StmtBaseName.c_str(), Stmt->getBaseName()) != 0) + continue; + if (containsStreamingAccesses(it.References)) + Streaming = true; + } + return Streaming; +} + __isl_give isl_schedule_node * ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node, void *User) { @@ -1334,6 +1386,12 @@ const OptimizerAdditionalInfoTy *OAI = static_cast(User); + // avoid tiling. + if (isStreaming(isl::manage_copy(Node), OAI->Skeletons)) { + LLVM_DEBUG(dbgs() << "Streaming detected\n"); + return Node; + } + MatMulInfoTy MMI; if (PMBasedOpts && User && isMatrMultPattern(isl::manage_copy(Node), OAI->D, MMI)) { @@ -1467,6 +1525,390 @@ &Version); } +/// Get the domain based on the bounds. +/// +/// @param MemAcc The memory access for which we want to +/// compute the domain. +/// @return The resulting domain. + +static isl::set getDomain(const MemoryAccess *MemAcc) { + + auto ArrayInfo = MemAcc->getLatestScopArrayInfo(); + unsigned NumDims = ArrayInfo->getNumberOfDimensions(); + + if (NumDims == 0) + return isl::set::universe(ArrayInfo->getSpace()); + + Scop *S = ArrayInfo->getScop(); + isl::union_map Accesses = S->getAccesses(); + isl::union_map sched = ArrayInfo->getScop()->getSchedule(); + + Accesses = Accesses.apply_domain(sched); + + isl::union_set AccessRange = Accesses.range(); + AccessRange = AccessRange.coalesce(); + AccessRange = AccessRange.detect_equalities(); + AccessRange = AccessRange.coalesce(); + + if (AccessRange.is_empty()) + return isl::set::empty(ArrayInfo->getSpace()); + + isl::set AccessSet = AccessRange.extract_set(ArrayInfo->getSpace()); + isl::local_space LS = isl::local_space(ArrayInfo->getSpace()); + isl::set Dom = isl::set::universe(ArrayInfo->getSpace()); + + for (unsigned i = 0; i < NumDims; ++i) { + isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, i); + isl::pw_aff OuterMin = AccessSet.dim_min(i); + isl::pw_aff OuterMax = AccessSet.dim_max(i); + OuterMin = OuterMin.add_dims(isl::dim::in, Val.dim(isl::dim::in)); + OuterMax = OuterMax.add_dims(isl::dim::in, Val.dim(isl::dim::in)); + OuterMin = OuterMin.set_tuple_id(isl::dim::in, ArrayInfo->getBasePtrId()); + OuterMax = OuterMax.set_tuple_id(isl::dim::in, ArrayInfo->getBasePtrId()); + Dom = Dom.intersect(OuterMin.le_set(Val)); + Dom = Dom.intersect(OuterMax.ge_set(Val)); + } + return Dom; +} + +/// get the step. +/// +/// @param MemAcc The memory access for which we want to compute the step. +/// @param Sched The Scop schedule. +/// @return The step in each dimension for the @p MemAcc. + +static SmallVector getStep(MemoryAccess *MemAcc, + const isl::map &Sched) { + + // LLVM_DEBUG(dbgs() << MemAcc->getNumSubscripts() << "\n"); + llvm::SmallVector Steps; + isl::val Val; + for (unsigned u = 0; u < MemAcc->getNumSubscripts(); ++u) { + bool relate = false; + isl::pw_aff SubscriptAsAff = MemAcc->getPwAff(MemAcc->getSubscript(u)); + for (unsigned uu = 0; uu < SubscriptAsAff.dim(isl::dim::in); ++uu) { + SubscriptAsAff.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat { + isl::val Val = Aff.get_coefficient_val(isl::dim::in, uu); + if (!Val.is_zero()) { + relate = true; + // LLVM_DEBUG(dbgs() << "Subscript relates to a dim\n"); + // before pushing back the element we check if + // it belongs to a single dimensional array since + // in this case the subscripts embed also information + // on the element size. + const ScopArrayInfo *SAI = MemAcc->getScopArrayInfo(); + if (SAI->getNumberOfDimensions() == 1) { + int SizeInBytes = SAI->getElemSizeInBytes(); + Val = Val.div_ui(SizeInBytes); + } + Steps.push_back(Val); + } + return isl::stat::ok; + }); + } + /// if we have a subscript that does not relate to any dimension + /// we push a zero. + if (!relate) { + // LLVM_DEBUG(dbgs() << "Subscript does not relates to any dim. hence + // zero\n"); + isl::val Val(MemAcc->getStatement()->getIslCtx(), 0); + Steps.push_back(Val); + } + } + return Steps; +} + +/// Find the element accessed by each memory access +/// for every iteration of the outermost loop. +/// In addition, the function +/// records the relative indexes order. +/// Three cases: +/// 1. The access is dependent on the outer loops +/// 2. The access is dependent on the inner loops +/// 3. The access is not dependent on any loops +/// +/// @param MemAcc The memory access for which we want to compute +/// the number of element(s) accessed. +/// @param Sched The Scop schedule. +/// @return A std::pair. The first element represents the +/// element accessed in each loop dimension. The +/// second element records the relative indexes order. + +static std::pair, llvm::SmallVector> +getElementAccessed(MemoryAccess *MemAcc, const isl::map &Sched, + const isl::set &AccessSet) { + + // MemAcc->updateDimensionality(); + llvm::SmallVector ElementAccessed; + llvm::SmallVector LoopOrder; + isl::map AccMap = MemAcc->getLatestAccessRelation(); + AccMap = AccMap.intersect_domain(MemAcc->getStatement()->getDomain()); + isl::pw_multi_aff MultiAff = isl::pw_multi_aff::from_map(AccMap); + unsigned IndexAccessMap = 0; + + /// iterate across the dimensions of the accessed memory location. + for (unsigned u = 0; u < AccMap.dim(isl::dim::out); ++u) { + isl::pw_aff PwAff = MultiAff.get_pw_aff(u); + PwAff.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat { + bool Relate = false; + /// check if a dimension relates to a loop index. + for (unsigned LoopDim = 0; LoopDim < AccMap.dim(isl::dim::in); + ++LoopDim) { + isl::val V = Aff.get_coefficient_val(isl::dim::in, LoopDim); + if (!V.is_zero()) { + isl::pw_aff OuterMin, OuterMax; + if (LoopDim == 0) { + // LLVM_DEBUG(dbgs() << "dimension relates to outer loops" << "\n"); + isl::local_space Ls = + isl::local_space(AccessSet.get_space().params()); + isl::aff One = isl::aff(Ls); + One = One.add_constant_si(1); + isl::val Val = One.get_constant_val(); + ElementAccessed.push_back(Val); + } else { + // LLVM_DEBUG(dbgs() << "dimension relates to inner loops" << "\n"); + OuterMin = AccessSet.dim_min(IndexAccessMap); + OuterMax = AccessSet.dim_max(IndexAccessMap); + OuterMax = OuterMax.sub(OuterMin); + + assert(OuterMax.n_piece() == 1); + OuterMax.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat { + isl::val Val = Aff.get_constant_val(); + ElementAccessed.push_back(Val); + return isl::stat::ok; + }); + } + LoopOrder.push_back(LoopDim); + Relate = true; + IndexAccessMap++; + } + // The dimension does not relate. + else if (V.is_zero()) { + if (!Relate && LoopDim == AccMap.dim(isl::dim::in) - 1) { + // LLVM_DEBUG(dbgs() << "found dimension that does not relate to any + // loop index\n"); + isl::local_space Ls = + isl::local_space(AccessSet.get_space().params()); + isl::aff One = isl::aff(Ls); + One = One.add_constant_si(1); + isl::val Val = One.get_constant_val(); + ElementAccessed.push_back(Val); + } + } + } + return isl::stat::ok; + }); + } + return std::make_pair(ElementAccessed, LoopOrder); +} + +/// check if only a single element is accessed. +/// +/// @param E The element(s) accessed by a given reference. +/// @return True in case @p E contains a 1 value in each +/// loop dimension, false otherwise. + +static bool doesAccessSingleElementArray(llvm::SmallVector &E) { + bool isElement = true; + for (auto it : E) { + if (!it.is_one()) + isElement = false; + } + return isElement; +} + +/// extract the species for each memory reference. +/// @param R The array of memory references to be classified. + +static void extractSpecies(llvm::SmallVector &R) { + assert(!R.empty() && "Structure is empty"); + for (auto &it : R) { + if (it.Step[0].is_zero() && it.AccessType == 1) { + it.Type = Reference::ReferenceKind::Full; + } else if (it.Step[0].is_zero() && it.AccessType == 2) { + it.Type = Reference::ReferenceKind::Shared; + } else if (doesAccessSingleElementArray(it.ElementAccessed)) { + it.Type = Reference::ReferenceKind::SingleElement; + } else { + it.Type = Reference::ReferenceKind::Chunk; + } + } +} + +/// helper function. To be removed. +/// Get the string representation for ReferenceKind. +/// +/// @param t The ReferenceKind to be converted to string format. +/// @return The String format for @p. +static std::string getTypeAsString(Reference::ReferenceKind t) { + switch (t) { + case Reference::ReferenceKind::Chunk: + return "CHUNK"; + case Reference::ReferenceKind::SingleElement: + return "SINGLE_ELEMENT"; + case Reference::ReferenceKind::Shared: + return "SHARED"; + case Reference::ReferenceKind::Full: + return "FULL"; + default: + return "ND"; + } +} + +/// Collect the 5-tuples. +/// 5-tuples classification: +/// 1. Reference name +/// 2. Type of access (read or write) +/// 3. Integer domain of the array reference w.r.t. the loop bounds +/// 4. Number of element accessed per outer loop iteration. +/// 5. Step +/// Additionally we record also the relative loop indexes order. +/// +/// @param Stmt The Stmt that contains the references to be classified. +/// @result References and collected information. + +static SmallVector collectReferenceInfo(ScopStmt &Stmt) { + + // LLVM_DEBUG(dbgs() << "****" << "\n"); + // Stmt.dump(); + // LLVM_DEBUG(dbgs() << "****" << "\n"); + + auto Accesses = getAccessesInOrder(Stmt); + llvm::SmallVector References; + + for (auto *MemI = Accesses.begin(), *MemE = Accesses.end(); MemI != MemE; + MemI++) { + + auto *MemAccessPtr = *MemI; + + // TODO: Fix for PHI, ExitPHI, Value and Intrinsic. + // At the moment use -polly-position = early. + // if (MemAccessPtr->isMemoryIntrinsic()) + // continue; + + enum MemoryKind MK = MemAccessPtr->getOriginalKind(); + if (MK != MemoryKind::Array) + continue; + + /// 5-tuples classification. + /// Access type, read or write. + unsigned AccessType = MemAccessPtr->getType(); + /// Access ID. + std::string AccessID = MemAccessPtr->getLatestScopArrayInfo()->getName(); + /// Access Domain w.r.t. outermost loop. + isl::set Domain = getDomain(MemAccessPtr); + /// Elements accessed per outermost loop iteration. + isl::map ScheduleMap = Stmt.getSchedule(); + llvm::SmallVector ElementAccessed; + llvm::SmallVector LoopOrder; + std::tie(ElementAccessed, LoopOrder) = + getElementAccessed(MemAccessPtr, ScheduleMap, Domain); + assert(!ElementAccessed.empty() && "ElementAccessed did not initialise"); + assert(!LoopOrder.empty() && "LoopOrder did not initialise"); + /// get the step. + llvm::SmallVector Step; + Step = getStep(MemAccessPtr, ScheduleMap); + assert(!Step.empty() && "getStep did not initialise"); + + Reference R(AccessType, AccessID, Domain, ElementAccessed, Step, LoopOrder); + + References.push_back(R); + } + + return References; +} + +/// Check Stmt profitability. +/// TODO: find a better profitability check. +/// +/// @param Stmt A given Stmt detected in the Scop. +/// @return True if the @p Stmt is deemed profitable to be classified, +/// false otherwise. + +static bool isProfitableStmt(ScopStmt &Stmt) { + + if (Stmt.isEmpty()) { + LLVM_DEBUG(dbgs() << "Profitability: Empty Stmt" + << "\n"); + return false; + } + + /// TODO: find a better profitability check. + /// maybe it is better to count the number of instructions (i.e. load/store). + if (getAccessesInOrder(Stmt).size() <= 2) { + LLVM_DEBUG(dbgs() << "Profitability: Few Mem. references" + << "\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Profitability: Stmt is profitable" + << "\n"); + return true; +} + +/// Collect the statements for the species classification. +/// Refer to the following paper for more details: +/// Cedric Nugteren, Pieter Custers, and Henk Corporaal. 2013. +/// Algorithmic species: A classification of affine loop nests for parallel +/// programming. ACM Trans. Archit. Code Optim. 9, 4, Article 40 (January 2013), +/// 25 pages. DOI=http://dx.doi.org/10.1145/2400682.2400699 or +/// https://docs.google.com/document/d/1OOMg37pXI3bBkHMAtQsfszYwJ6-GvpO9e4zQ2qf44tM/edit +/// +/// @param Scop. The Scop to be analyzed. +/// @return The Skeletons array. Each skeleton represents +/// the access pattern of a given Stmt. + +static llvm::SmallVector collectStmtForClassification(Scop &S) { + llvm::SmallVector Skeletons; + for (auto &Stmt : S) { + if (isProfitableStmt(Stmt)) { + std::string StmtBaseName = Stmt.getBaseName(); + assert(!StmtBaseName.empty() && "Stmt Name did not initialise"); + llvm::SmallVector References = collectReferenceInfo(Stmt); + assert(!References.empty() && "References did not initialise"); + extractSpecies(References); + Skeleton S(References, StmtBaseName); + Skeletons.push_back(S); + } else { + LLVM_DEBUG(dbgs() << "Skip Stmt" + << "\n"); + } + } + return Skeletons; +} + +/// Construct a new Skeleton object. +Skeleton::Skeleton(llvm::SmallVector References, + std::string StmtBaseName) { + this->References = References; + this->StmtBaseName = StmtBaseName; +} + +/// Returns the Stmt name the Skeleton belongs to. +const char *Skeleton::getStmtBaseName() const { return StmtBaseName.c_str(); } + +/// Returns the references detected in the Stmt. +const llvm::SmallVector &Skeleton::getReferences() const { + return References; +} + +Skeleton::~Skeleton() = default; + +/// Construct a new Reference object. +Reference::Reference(unsigned AccessType, std::string ID, isl::set Domain, + llvm::SmallVector ElementAccessed, + llvm::SmallVector Step, + llvm::SmallVector LoopOrder) { + this->AccessType = AccessType; + this->ID = ID; + this->Domain = Domain; + this->ElementAccessed = ElementAccessed; + this->Step = Step; + this->LoopOrder = LoopOrder; +} + +Reference::~Reference() = default; + bool IslScheduleOptimizer::runOnScop(Scop &S) { // Skip SCoPs in case they're already optimised by PPCGCodeGeneration if (S.isToBeSkipped()) @@ -1619,10 +2061,19 @@ Function &F = S.getFunction(); auto *TTI = &getAnalysis().getTTI(F); - const OptimizerAdditionalInfoTy OAI = {TTI, const_cast(&D)}; + llvm::SmallVector Skeletons = collectStmtForClassification(S); + const OptimizerAdditionalInfoTy OAI = {TTI, const_cast(&D), + Skeletons}; auto NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); walkScheduleTreeForStatistics(NewSchedule, 2); +// TODO: this is print also without debug enable. understand why. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + for (auto const &it : Skeletons) { + it.dump(); + } +#endif + if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) return false; @@ -1640,6 +2091,46 @@ return false; } +void Skeleton::print(raw_ostream &OS) const { + OS << "Skeleton:\n"; + OS << "Stmt name :=" << getStmtBaseName() << "\n"; + const llvm::SmallVector &R = getReferences(); + for (auto const &it : R) { + if (it.AccessType == 2) + OS << "[" << it.ID << " " << getTypeAsString(it.Type) << "]" + << "->"; + } + for (auto const &it : R) { + if (it.AccessType != 2) + OS << "[" << it.ID << " " << getTypeAsString(it.Type) << "]"; + } + OS << "\n"; + for (auto const &it : R) { + OS << "AccessType" << it.AccessType << "\n"; + OS << "Array ID" << it.ID << "\n"; + OS << "Domain" << it.Domain << "\n"; + OS << "Element(s) accessed per outer loop iteration"; + for (auto const &iit : it.ElementAccessed) { + OS << "[" << iit.to_str() << "]"; + } + OS << "\n"; + OS << "Step in each loop dimension"; + for (auto const &iit : it.Step) { + OS << "[" << iit.to_str() << "]"; + } + OS << "\n"; + OS << "Partial loop order"; + for (auto const &iit : it.LoopOrder) { + OS << "[" << iit << "]"; + } + OS << "\n"; + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Skeleton::dump() const { print(llvm::dbgs()); } +#endif + void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const { isl_printer *p; char *ScheduleStr;