Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -1325,6 +1325,51 @@ return false; } +static bool checkForStreamingAccesses(const std::vector &References) { + + for(auto it = References.begin(); it != References.end(); it++) { + if(it->Type != Reference::CHUNK && it->Type != Reference::SINGLE_ELEMENT) + return false; + } + + assert(References.size() != 1); + + for(std::size_t i = 1; i < References.size(); ++i) { + if(!std::equal(References[0].LoopOrder.begin(),References[0].LoopOrder.end(), + References[i].LoopOrder.begin())) + return false; + } + + return true; + +} + +/// @param Node. Check if the node contains a streaming kernel. +/// @param SK. The Skeleton array containing all the references for all the Stmts. +/// 1. The skeleton should look like: CHUNK -> CHUNK or ELEMENT -> ELEMENT. +/// 2. The partial index order should be the same. +/// 3. TODO:It should have a stride in the innermost dimension. +/// 4. TODO: Dependencies? + +static bool isStreaming(isl::schedule_node Node, const std::vector &SK) { + isl::union_map PartialSchedule = isl::manage( + isl_schedule_node_band_get_partial_schedule_union_map(Node.get())); + if(PartialSchedule.n_map() != 1) + return false; + isl::map NewPartialSchedule = isl::map::from_union_map(PartialSchedule); + isl::id InputDimsId = NewPartialSchedule.get_tuple_id(isl::dim::in); + ScopStmt *Stmt = static_cast(InputDimsId.get_user()); + bool Streaming = false; + std::vector::const_iterator it = SK.begin(); + for(; it != SK.end(); it++) { + if(strcmp(it->StmtBaseName, Stmt->getBaseName()) != 0) + continue; + if(checkForStreamingAccesses(it->References)) + Streaming = true; + } + return Streaming; +} + __isl_give isl_schedule_node * ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node, void *User) { @@ -1334,6 +1379,12 @@ const OptimizerAdditionalInfoTy *OAI = static_cast(User); + /// avoid tiling. + if(isStreaming(isl::manage_copy(Node), OAI->Skeletons)) { + DEBUG(dbgs() << "Streaming detected\n"); + return Node; + } + MatMulInfoTy MMI; if (PMBasedOpts && User && isMatrMultPattern(isl::manage_copy(Node), OAI->D, MMI)) { @@ -1467,6 +1518,431 @@ &Version); } +/// Get the domain based on the bounds. + +/// @param MemAcc. The memory access for which we want to +/// compute the domain w.r.t. loop bounds. +/// @param Dom. The resulting domain. + +static void getDomain(MemoryAccess *MemAcc, isl::set &Dom) { + + auto ArrayInfo = MemAcc->getLatestScopArrayInfo(); + unsigned NumDims = ArrayInfo->getNumberOfDimensions(); + + if (NumDims == 0) + Dom = isl::set::universe(ArrayInfo->getSpace()); + + Scop *S = ArrayInfo->getScop(); + isl::union_map Accesses = S->getAccesses(); + isl::union_map sched = ArrayInfo->getScop()->getSchedule(); + + Accesses = Accesses.apply_domain(sched); + + isl::union_set AccessRange = Accesses.range(); + AccessRange = AccessRange.coalesce(); + AccessRange = AccessRange.detect_equalities(); + AccessRange = AccessRange.coalesce(); + + if (AccessRange.is_empty()) + Dom = isl::set::empty(ArrayInfo->getSpace()); + + isl::set AccessSet = AccessRange.extract_set(ArrayInfo->getSpace()); + isl::local_space LS = isl::local_space(ArrayInfo->getSpace()); + Dom = isl::set::universe(ArrayInfo->getSpace()); + + for (unsigned i = 0; i < NumDims; ++i) { + isl::pw_aff Val = isl::aff::var_on_domain(LS, isl::dim::set, i); + isl::pw_aff OuterMin = AccessSet.dim_min(i); + isl::pw_aff OuterMax = AccessSet.dim_max(i); + OuterMin = OuterMin.add_dims(isl::dim::in, Val.dim(isl::dim::in)); + OuterMax = OuterMax.add_dims(isl::dim::in, Val.dim(isl::dim::in)); + OuterMin = OuterMin.set_tuple_id(isl::dim::in, ArrayInfo->getBasePtrId()); + OuterMax = OuterMax.set_tuple_id(isl::dim::in, ArrayInfo->getBasePtrId()); + Dom = Dom.intersect(OuterMin.le_set(Val)); + Dom = Dom.intersect(OuterMax.ge_set(Val)); + } +} + +///NOTE: At the moment I am not able to quantify the step. +/// so we push a '1' if we detect a step X along one loop dimension, 0 otherwise. + +/// get the step. + +/// @param MemAcc. The memory access for which we want to compute the step. +/// @param Sched. The Scop schedule. +/// @param HasStride. The vector to be filled. 1 if we detect a stride (step) +/// along a dimension, 0 otherwise. The dimensions (loop indexes) are ordered from outermost +/// to innermost. + +static void getStep(MemoryAccess *MemAcc, isl::map &Sched, + /*unsigned LoopLevel,*/ std::vector &HasStride) { + + unsigned DimNumOut = Sched.dim(isl::dim::out); + unsigned StrideDetected = false; + for (unsigned uu = 0; uu < DimNumOut; uu++) { + StrideDetected = false; + // the loop for which you want to compute the step + // should be the innermost one. + isl::map NewSched = + permuteDimensions(Sched, isl::dim::out, uu, DimNumOut - 1); + isl::set Deltas = MemAcc->getStride(NewSched); + //DEBUG(dbgs() << "DELTAS@ " << Deltas << "\n"); + isl::pw_multi_aff MultAff = isl::pw_multi_aff::from_set(Deltas); + if (MultAff.n_piece() == 0) + continue; + if (MultAff.n_piece() > 1) + continue; + //DEBUG(dbgs() << MultAff << "\n"); + for (unsigned u = 0; u < MultAff.dim(isl::dim::out); ++u) { + isl::pw_aff PwAff = MultAff.get_pw_aff(u); + PwAff.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat { + isl::val Val = Aff.get_constant_val(); + if (!Val.is_zero()) { + //DEBUG(dbgs() << "Stride on dimension :" << uu << "\n"); + HasStride.push_back(true); + StrideDetected = true; + } + return isl::stat::ok; + }); + } + if (!StrideDetected) + HasStride.push_back(false); + } + } + +/// Find the element accessed by each memory access +/// for every iteration of the outermost loop. +/// In addition, the function +/// records the relative indexes order. +/// Three cases: +/// 1. The access is dependent on the outer loops +/// 2. The access is dependent on the inner loops +/// 3. The access is not dependent on any loops + +/// @param MemAcc. The memory access for which we want to compute +/// the number of element(s) accessed. +/// @param Sched. The Scop schedule. +/// @param ElementAccessed. The array to be filled with the number +/// of elements accessed every iteration for each loop index. +/// @param LoopOrder. The array to be filled with the relative loop +/// indexes order. + +static void getElementAccessed(MemoryAccess *MemAcc, isl::map &Sched, + isl::set &AccessSet, + std::vector &ElementAccessed, + std::vector &LoopOrder) { + + isl::map AccMap = MemAcc->getLatestAccessRelation(); + AccMap = AccMap.intersect_domain(MemAcc->getStatement()->getDomain()); + isl::pw_multi_aff MultiAff = isl::pw_multi_aff::from_map(AccMap); + unsigned IndexAccessMap = 0; + + /// iterate across the dimensions of the accessed memory location. + for (unsigned u = 0; u < AccMap.dim(isl::dim::out); ++u) { + isl::pw_aff PwAff = MultiAff.get_pw_aff(u); + PwAff.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat { + bool Relate = false; + /// check if a dimension relates to a loop index. + for (unsigned LoopDim = 0; LoopDim < AccMap.dim(isl::dim::in); + ++LoopDim) { + isl::val V = Aff.get_coefficient_val(isl::dim::in, LoopDim); + if (!V.is_zero()) { + isl::pw_aff OuterMin, OuterMax; + if (LoopDim == 0) { + // DEBUG(dbgs() << "dimension relates to outer loops" << "\n"); + isl::local_space Ls = + isl::local_space(AccessSet.get_space().params()); + isl::aff One = isl::aff(Ls); + One = One.add_constant_si(1); + isl::val Val = One.get_constant_val(); + ElementAccessed.push_back(Val); + } else { + // DEBUG(dbgs() << "dimension relates to inner loops" << "\n"); + OuterMin = AccessSet.dim_min(IndexAccessMap); + OuterMax = AccessSet.dim_max(IndexAccessMap); + OuterMax = OuterMax.sub(OuterMin); + + assert(OuterMax.n_piece() == 1); + OuterMax.foreach_piece([&](isl::set S, isl::aff Aff) -> isl::stat { + isl::val Val = Aff.get_constant_val(); + ElementAccessed.push_back(Val); + return isl::stat::ok; + }); + } + LoopOrder.push_back(LoopDim); + Relate = true; + IndexAccessMap++; + } + // The dimension does not relate. + else if (V.is_zero()) { + if (!Relate && LoopDim == AccMap.dim(isl::dim::in) - 1) { + // DEBUG(dbgs() << "found dimension that does not relate to any loop + // index\n"); + isl::local_space Ls = + isl::local_space(AccessSet.get_space().params()); + isl::aff One = isl::aff(Ls); + One = One.add_constant_si(1); + isl::val Val = One.get_constant_val(); + ElementAccessed.push_back(Val); + } + } + } + return isl::stat::ok; + }); + } +} + + +/// check if only a single element is accessed. + +static bool accessSingleElement(std::vector &E) { + bool isElement = true; + std::vector::iterator it = E.begin(); + while (it != E.end()) { + if (!it->is_one()) + isElement = false; + it++; + } + return isElement; +} + +/// extract the species for the given array of memory references. +/// @param the array of memory references to be classified. + +static void extractSpecies(std::vector &R /*, int LoopLevel*/) { + assert(!R.empty() && "Structure is empty"); + std::vector::iterator it = R.begin(); + while (it != R.end()) { + if (!it->HasStride[0] && it->AccessType == 1) { + it->Type = Reference::FULL; + // DEBUG(dbgs() << "[ " << it->Name << " ]" << " FULL " << "\n"); + } else if (!it->HasStride[0] && it->AccessType == 2) { + // DEBUG(dbgs() << it->Step[0].is_zero() << "\n"); + it->Type = Reference::SHARED; + // DEBUG(dbgs() << "[ " << it->Name << " ]" << " SHARED " << "\n"); + } + + else if (accessSingleElement(it->ElementAccessed)) { + it->Type = Reference::SINGLE_ELEMENT; + // DEBUG(dbgs() << "[ " << it->Name << " ]" << " ELEMENT " << "\n"); + } + + else { + it->Type = Reference::CHUNK; + // DEBUG(dbgs() << "[ " << it->Name << " ]" << " CHUNK " << "\n"); + } + it++; + } +} + +/// helper function. To be removed. + +static std::string getTypeAsString(int Index) { + switch (Index) { + case 0: + return "SINGLE_ELEMENT"; + case 1: + return "CHUNK"; + case 2: + return "NEIGHBORHOOD"; + case 3: + return "FULL"; + case 4: + return "SHARED"; + default: + return "ND"; + } +} + +/// helper function. to be removed. + +static void printStructure(std::vector &R, const char *StmtBaseName) { + assert(!R.empty() && "Structure is empty"); + // DEBUG(dbgs() << R.size() << "\n"); + DEBUG(dbgs() << "Stmt Name := " << StmtBaseName << "\n"); + std::vector::iterator it = R.begin(); + while (it != R.end()) { + if (it->AccessType == 2) + DEBUG(dbgs() << "[" << it->Name << " " << getTypeAsString(it->Type) << "]" + << " -> "); + it++; + } + it = R.begin(); + while (it != R.end()) { + if (it->AccessType != 2) + DEBUG(dbgs() << "[" << it->Name << " " << getTypeAsString(it->Type) + << "]"); + it++; + } + DEBUG(dbgs() << "\n"); + while (!R.empty()) { + Reference RR = R.back(); + DEBUG(dbgs() << "AccessType@: " << RR.AccessType << "\n"); + DEBUG(dbgs() << "Name@: " << RR.Name << "\n"); + DEBUG(dbgs() << "Domain@: " << RR.Domain << "\n"); + DEBUG(dbgs() << "ElementAccessed for each outer loop iteration :="); + while (!RR.ElementAccessed.empty()) { + isl::val Val = RR.ElementAccessed.front(); + DEBUG(dbgs() << "[" << Val.to_str() << "]"); + RR.ElementAccessed.erase(RR.ElementAccessed.begin()); + } + DEBUG(dbgs() << "\n"); + DEBUG(dbgs() << "StrideOnDimension :="); + while (!RR.HasStride.empty()) { + bool Val = RR.HasStride.front(); + DEBUG(dbgs() << "[" << Val << "]"); + RR.HasStride.erase(RR.HasStride.begin()); + } + DEBUG(dbgs() << "\n"); + DEBUG(dbgs() << "Partial loop order :="); + while (!RR.LoopOrder.empty()) { + DEBUG(dbgs() << "[" << RR.LoopOrder.front() << "]"); + RR.LoopOrder.erase(RR.LoopOrder.begin()); + } + DEBUG(dbgs() << "\n"); + R.pop_back(); + } +} + + +/// Collect the 5-tuples. +/// 5-tuples classification: +/// 1. Reference name +/// 2. Type of access (read or write) +/// 3. Integer domain of the array reference w.r.t. the loop bounds +/// 4. Number of element accessed per outer loop iteration. +/// 5. Step + +/// @param Stmt. The Stmt to be classified. +/// @param References. The vector that will be filled with +/// all the references detected in the Stmt and the 5-tuples information. + +static void classifyReference(ScopStmt &Stmt, + std::vector &References) { + + auto Accesses = getAccessesInOrder(Stmt); + + /// Used to populate References. + int Index = 0; + + for (auto *MemA = Accesses.begin(); MemA != Accesses.end(); MemA++) { + + auto *MemAccessPtr = *MemA; + + if (MemAccessPtr->isMemoryIntrinsic()) + continue; + + + /* + enum MemoryKind MK = MemAccessPtr->getOriginalKind(); + if(MK == MemoryKind::Array) { + DEBUG(dbgs() << "Array kind\n"); + DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n"); + } + if(MK == MemoryKind::Value) { + DEBUG(dbgs() << "Value kind\n"); + DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n"); + } + if(MK == MemoryKind::PHI) { + DEBUG(dbgs() << "PHI kind\n"); + DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n"); + auto PHI = cast(MemAccessPtr->getAccessInstruction()); + PHI->dump(); + } + if(MK == MemoryKind::ExitPHI) { + DEBUG(dbgs() << "Exit kind\n"); + DEBUG(dbgs() << MemAccessPtr->getOriginalBaseAddr() << "\n"); + } + */ + + enum MemoryKind MK = MemAccessPtr->getOriginalKind(); + if (MK != MemoryKind::Array) + continue; + + /// 5-tuples classification. + + References.push_back(Reference()); + + /// get access type. + References[Index].AccessType = MemAccessPtr->getType(); + + /// get reference name. + References[Index].Name = MemAccessPtr->getLatestScopArrayInfo()->getName(); + + // unsigned LoopLevel = 0; + /// get domain w.r.t loop bounds + getDomain(MemAccessPtr, References[Index].Domain); + isl::map ScheduleMap = Stmt.getSchedule(); + /// get elements accessed per outer loop iteration. + getElementAccessed(MemAccessPtr, ScheduleMap, /*LoopLevel,*/ + References[Index].Domain, + References[Index].ElementAccessed, + References[Index].LoopOrder); + /// get the step. + getStep(MemAccessPtr, ScheduleMap, + /*LoopLevel,*/ References[Index].HasStride); + Index++; + } +} + +/// Check Stmt profitability. +/// TODO: find a better profitability check. +/// @param Stmt is a given statement. Returns true if the +/// statement is deemed profitable to be classified, false otherwise. + +static bool isProfitableStmt(ScopStmt &Stmt) { + + if (Stmt.isEmpty()) { + DEBUG(dbgs() << "Profitability: Empty Stmt" + << "\n"); + return false; + } + + /// TODO: find a better profitability check. + /// maybe it is better to count the number of instructions (i.e. load/store). + if (getAccessesInOrder(Stmt).size() <= 2) { + DEBUG(dbgs() << "Profitability: Few Mem. references" + << "\n"); + return false; + } + + DEBUG(dbgs() << "Profitability: Stmt is profitable" + << "\n"); + return true; +} + +/// Collect the statements for the species classification. +/// Refer to the following paper for more details: +/// Cedric Nugteren, Pieter Custers, and Henk Corporaal. 2013. +/// Algorithmic species: A classification of affine loop nests for parallel programming. +/// ACM Trans. Archit. Code Optim. 9, 4, Article 40 (January 2013), 25 pages. +/// DOI=http://dx.doi.org/10.1145/2400682.2400699 +/// or +/// https://docs.google.com/document/d/1OOMg37pXI3bBkHMAtQsfszYwJ6-GvpO9e4zQ2qf44tM/edit + +/// @param Scop. The Scop to be analyzed. +/// @param Skeleton vector. For each statement in the Scop +/// we collect all the access references. +/// The access references are used later on to extract the species +/// of the statement based on the 5-tuples classification. + +static void collectStmtForClassification(Scop &S, std::vector &SK) { + int Index = 0; + for (auto &Stmt : S) { + if (isProfitableStmt(Stmt)) { + SK.push_back(Skeleton()); + SK[Index].StmtBaseName = Stmt.getBaseName(); + classifyReference(Stmt, SK[Index].References); + extractSpecies(SK[Index].References); + //printStructure(SK[Index].References, SK[Index].StmtBaseName); + Index++; + } else { + DEBUG(dbgs() << "Skip Stmt" + << "\n"); + } + } +} + bool IslScheduleOptimizer::runOnScop(Scop &S) { // Skip SCoPs in case they're already optimised by PPCGCodeGeneration if (S.isToBeSkipped()) @@ -1619,10 +2095,15 @@ Function &F = S.getFunction(); auto *TTI = &getAnalysis().getTTI(F); - const OptimizerAdditionalInfoTy OAI = {TTI, const_cast(&D)}; + std::vector Skeletons = std::vector(); + collectStmtForClassification(S, Skeletons); + const OptimizerAdditionalInfoTy OAI = {TTI, const_cast(&D), Skeletons}; auto NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); walkScheduleTreeForStatistics(NewSchedule, 2); + for(std::size_t i = 0; i != Skeletons.size(); ++i) + printStructure(Skeletons[i].References, Skeletons[i].StmtBaseName); + if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) return false;