Index: polly/trunk/include/polly/ScopInfo.h =================================================================== --- polly/trunk/include/polly/ScopInfo.h +++ polly/trunk/include/polly/ScopInfo.h @@ -1936,6 +1936,10 @@ /// Scop constructor; invoked from ScopBuilder::buildScop. Scop(Region &R, ScalarEvolution &SE, LoopInfo &LI, ScopDetection::DetectionContext &DC, OptimizationRemarkEmitter &ORE); + + /// Return the LoopInfo used for this Scop. + LoopInfo *getLI() const { return Affinator.getLI(); } + //@} /// Initialize this ScopBuilder. @@ -3018,6 +3022,25 @@ /// Return whether @p Inst has a use outside of this SCoP. bool isEscaping(Instruction *Inst); + + struct ScopStatistics { + int NumAffineLoops = 0; + int NumBoxedLoops = 0; + + int NumValueWrites = 0; + int NumValueWritesInLoops = 0; + int NumPHIWrites = 0; + int NumPHIWritesInLoops = 0; + int NumSingletonWrites = 0; + int NumSingletonWritesInLoops = 0; + }; + + /// Collect statistic about this SCoP. + /// + /// These are most commonly used for LLVM's static counters (Statistic.h) in + /// various places. If statistics are disabled, only zeros are returned to + /// avoid the overhead. + ScopStatistics getStatistics() const; }; /// Print Scop scop to raw_ostream OS. Index: polly/trunk/include/polly/Simplify.h =================================================================== --- polly/trunk/include/polly/Simplify.h +++ polly/trunk/include/polly/Simplify.h @@ -41,7 +41,16 @@ /// The order in which implicit writes are executed relative to each other is /// undefined. llvm::SmallVector getAccessesInOrder(ScopStmt &Stmt); -llvm::Pass *createSimplifyPass(); + +/// Create a Simplify pass +/// +/// @param CallNo Disambiguates this instance for when there are multiple +/// instances of this pass in the pass manager. It is used only to +/// keep the statistics apart and has no influence on the +/// simplification itself. +/// +/// @return The Simplify pass. +llvm::Pass *createSimplifyPass(int CallNo = 0); } // namespace polly namespace llvm { Index: polly/trunk/include/polly/Support/SCEVAffinator.h =================================================================== --- polly/trunk/include/polly/Support/SCEVAffinator.h +++ polly/trunk/include/polly/Support/SCEVAffinator.h @@ -73,6 +73,9 @@ /// Check an AddRec for the loop @p L is cached. bool hasNSWAddRecForLoop(llvm::Loop *L) const; + /// Return the LoopInfo used by thi object. + llvm::LoopInfo *getLI() const { return &LI; } + private: /// Key to identify cached expressions. using CacheKey = std::pair; Index: polly/trunk/lib/Analysis/PruneUnprofitable.cpp =================================================================== --- polly/trunk/lib/Analysis/PruneUnprofitable.cpp +++ polly/trunk/lib/Analysis/PruneUnprofitable.cpp @@ -25,12 +25,36 @@ "Number of SCoPs considered for unprofitability pruning"); STATISTIC(ScopsPruned, "Number of pruned SCoPs because it they cannot be " "optimized in a significant way"); +STATISTIC(ScopsSurvived, "Number of SCoPs after pruning"); + +STATISTIC(NumPrunedLoops, "Number of pruned loops"); +STATISTIC(NumPrunedBoxedLoops, "Number of pruned boxed loops"); +STATISTIC(NumPrunedAffineLoops, "Number of pruned affine loops"); + +STATISTIC(NumLoopsInScop, "Number of loops in scops after pruning"); +STATISTIC(NumBoxedLoops, "Number of boxed loops in SCoPs after pruning"); +STATISTIC(NumAffineLoops, "Number of affine loops in SCoPs after pruning"); class PruneUnprofitable : public ScopPass { private: PruneUnprofitable(const PruneUnprofitable &) = delete; const PruneUnprofitable &operator=(const PruneUnprofitable &) = delete; + void updateStatistics(Scop &S, bool Pruned) { + auto ScopStats = S.getStatistics(); + if (Pruned) { + ScopsPruned++; + NumPrunedLoops += ScopStats.NumAffineLoops + ScopStats.NumBoxedLoops; + NumPrunedBoxedLoops += ScopStats.NumBoxedLoops; + NumPrunedAffineLoops += ScopStats.NumAffineLoops; + } else { + ScopsSurvived++; + NumLoopsInScop += ScopStats.NumAffineLoops + ScopStats.NumBoxedLoops; + NumBoxedLoops += ScopStats.NumBoxedLoops; + NumAffineLoops += ScopStats.NumAffineLoops; + } + } + public: static char ID; explicit PruneUnprofitable() : ScopPass(ID) {} @@ -52,8 +76,10 @@ if (!S.isProfitable(true)) { DEBUG(dbgs() << "SCoP pruned because it probably cannot be optimized in " "a significant way\n"); - ScopsPruned++; S.invalidate(PROFITABLE, DebugLoc()); + updateStatistics(S, true); + } else { + updateStatistics(S, false); } return false; Index: polly/trunk/lib/Analysis/ScopInfo.cpp =================================================================== --- polly/trunk/lib/Analysis/ScopInfo.cpp +++ polly/trunk/lib/Analysis/ScopInfo.cpp @@ -121,7 +121,11 @@ STATISTIC(AssumptionsDelinearization, "Number of delinearization assumptions taken."); +STATISTIC(NumScops, "Number of feasible SCoPs after ScopInfo"); STATISTIC(NumLoopsInScop, "Number of loops in scops"); +STATISTIC(NumBoxedLoops, "Number of boxed loops in SCoPs after ScopInfo"); +STATISTIC(NumAffineLoops, "Number of affine loops in SCoPs after ScopInfo"); + STATISTIC(NumScopsDepthOne, "Number of scops with maximal loop depth 1"); STATISTIC(NumScopsDepthTwo, "Number of scops with maximal loop depth 2"); STATISTIC(NumScopsDepthThree, "Number of scops with maximal loop depth 3"); @@ -131,6 +135,17 @@ "Number of scops with maximal loop depth 6 and larger"); STATISTIC(MaxNumLoopsInScop, "Maximal number of loops in scops"); +STATISTIC(NumValueWrites, "Number of scalar value writes after ScopInfo"); +STATISTIC( + NumValueWritesInLoops, + "Number of scalar value writes nested in affine loops after ScopInfo"); +STATISTIC(NumPHIWrites, "Number of scalar phi writes after ScopInfo"); +STATISTIC(NumPHIWritesInLoops, + "Number of scalar phi writes nested in affine loops after ScopInfo"); +STATISTIC(NumSingletonWrites, "Number of singleton writes after ScopInfo"); +STATISTIC(NumSingletonWritesInLoops, + "Number of singleton writes nested in affine loops after ScopInfo"); + // The maximal number of basic sets we allow during domain construction to // be created. More complex scops will result in very high compile time and // are also unlikely to result in good code @@ -5160,6 +5175,47 @@ return false; } +Scop::ScopStatistics Scop::getStatistics() const { + ScopStatistics Result; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) + auto LoopStat = ScopDetection::countBeneficialLoops(&R, *SE, *getLI(), 0); + + int NumTotalLoops = LoopStat.NumLoops; + Result.NumBoxedLoops = getBoxedLoops().size(); + Result.NumAffineLoops = NumTotalLoops - Result.NumBoxedLoops; + + for (const ScopStmt &Stmt : *this) { + isl::set Domain = Stmt.getDomain().intersect_params(getContext()); + bool IsInLoop = Stmt.getNumIterators() >= 1; + for (MemoryAccess *MA : Stmt) { + if (!MA->isWrite()) + continue; + + if (MA->isLatestValueKind()) { + Result.NumValueWrites += 1; + if (IsInLoop) + Result.NumValueWritesInLoops += 1; + } + + if (MA->isLatestAnyPHIKind()) { + Result.NumPHIWrites += 1; + if (IsInLoop) + Result.NumPHIWritesInLoops += 1; + } + + isl::set AccSet = + MA->getAccessRelation().intersect_domain(Domain).range(); + if (AccSet.is_singleton()) { + Result.NumSingletonWrites += 1; + if (IsInLoop) + Result.NumSingletonWritesInLoops += 1; + } + } + } +#endif + return Result; +} + raw_ostream &polly::operator<<(raw_ostream &OS, const Scop &scop) { scop.print(OS, PollyPrintInstructions); return OS; @@ -5177,7 +5233,11 @@ AU.setPreservesAll(); } -void updateLoopCountStatistic(ScopDetection::LoopStats Stats) { +void updateLoopCountStatistic(ScopDetection::LoopStats Stats, + Scop::ScopStatistics ScopStats) { + assert(Stats.NumLoops == ScopStats.NumAffineLoops + ScopStats.NumBoxedLoops); + + NumScops++; NumLoopsInScop += Stats.NumLoops; MaxNumLoopsInScop = std::max(MaxNumLoopsInScop.getValue(), (unsigned)Stats.NumLoops); @@ -5194,6 +5254,16 @@ NumScopsDepthFive++; else NumScopsDepthLarger++; + + NumAffineLoops += ScopStats.NumAffineLoops; + NumBoxedLoops += ScopStats.NumBoxedLoops; + + NumValueWrites += ScopStats.NumValueWrites; + NumValueWritesInLoops += ScopStats.NumValueWritesInLoops; + NumPHIWrites += ScopStats.NumPHIWrites; + NumPHIWritesInLoops += ScopStats.NumPHIWritesInLoops; + NumSingletonWrites += ScopStats.NumSingletonWrites; + NumSingletonWritesInLoops += ScopStats.NumSingletonWritesInLoops; } bool ScopInfoRegionPass::runOnRegion(Region *R, RGPassManager &RGM) { @@ -5213,11 +5283,13 @@ ScopBuilder SB(R, AC, AA, DL, DT, LI, SD, SE); S = SB.getScop(); // take ownership of scop object +#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) if (S) { ScopDetection::LoopStats Stats = ScopDetection::countBeneficialLoops(&S->getRegion(), SE, LI, 0); - updateLoopCountStatistic(Stats); + updateLoopCountStatistic(Stats, S->getStatistics()); } +#endif return false; } @@ -5268,9 +5340,11 @@ std::unique_ptr S = SB.getScop(); if (!S) continue; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) ScopDetection::LoopStats Stats = ScopDetection::countBeneficialLoops(&S->getRegion(), SE, LI, 0); - updateLoopCountStatistic(Stats); + updateLoopCountStatistic(Stats, S->getStatistics()); +#endif bool Inserted = RegionToScopMap.insert({R, std::move(S)}).second; assert(Inserted && "Building Scop for the same region twice!"); (void)Inserted; Index: polly/trunk/lib/CodeGen/CodeGeneration.cpp =================================================================== --- polly/trunk/lib/CodeGen/CodeGeneration.cpp +++ polly/trunk/lib/CodeGen/CodeGeneration.cpp @@ -56,6 +56,13 @@ cl::location(polly::PerfMonitoring), cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); +STATISTIC(ScopsProcessed, "Number of SCoP processed"); +STATISTIC(CodegenedScops, "Number of successfully generated SCoPs"); +STATISTIC(CodegenedAffineLoops, + "Number of original affine loops in SCoPs that have been generated"); +STATISTIC(CodegenedBoxedLoops, + "Number of original boxed loops in SCoPs that have been generated"); + namespace polly { /// Mark a basic block unreachable. /// @@ -162,6 +169,11 @@ if (!AstRoot) return false; + // Collect statistics. Do it before we modify the IR to avoid having it any + // influence on the result. + auto ScopStats = S.getStatistics(); + ScopsProcessed++; + auto &DL = S.getFunction().getParent()->getDataLayout(); Region *R = &S.getRegion(); assert(!R->isTopLevelRegion() && "Top level regions are not supported"); @@ -249,6 +261,10 @@ NodeBuilder.create(AstRoot); NodeBuilder.finalize(); fixRegionInfo(*EnteringBB->getParent(), *R->getParent(), RI); + + CodegenedScops++; + CodegenedAffineLoops += ScopStats.NumAffineLoops; + CodegenedBoxedLoops += ScopStats.NumBoxedLoops; } Function *F = EnteringBB->getParent(); Index: polly/trunk/lib/CodeGen/IslAst.cpp =================================================================== --- polly/trunk/lib/CodeGen/IslAst.cpp +++ polly/trunk/lib/CodeGen/IslAst.cpp @@ -78,6 +78,19 @@ cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); +STATISTIC(ScopsProcessed, "Number of SCoPs processed"); +STATISTIC(ScopsBeneficial, "Number of beneficial SCoPs"); +STATISTIC(BeneficialAffineLoops, "Number of beneficial affine loops"); +STATISTIC(BeneficialBoxedLoops, "Number of beneficial boxed loops"); + +STATISTIC(NumForLoops, "Number of for-loops"); +STATISTIC(NumParallel, "Number of parallel for-loops"); +STATISTIC(NumInnermostParallel, "Number of innermost parallel for-loops"); +STATISTIC(NumOutermostParallel, "Number of outermost parallel for-loops"); +STATISTIC(NumReductionParallel, "Number of reduction-parallel for-loops"); +STATISTIC(NumExecutedInParallel, "Number of for-loops executed in parallel"); +STATISTIC(NumIfConditions, "Number of if-conditions"); + namespace polly { /// Temporary information used when building the ast. struct AstBuildUserInfo { @@ -401,6 +414,41 @@ return true; } +/// Collect statistics for the syntax tree rooted at @p Ast. +static void walkAstForStatistics(__isl_keep isl_ast_node *Ast) { + assert(Ast); + isl_ast_node_foreach_descendant_top_down( + Ast, + [](__isl_keep isl_ast_node *Node, void *User) -> isl_bool { + switch (isl_ast_node_get_type(Node)) { + case isl_ast_node_for: + NumForLoops++; + if (IslAstInfo::isParallel(Node)) + NumParallel++; + if (IslAstInfo::isInnermostParallel(Node)) + NumInnermostParallel++; + if (IslAstInfo::isOutermostParallel(Node)) + NumOutermostParallel++; + if (IslAstInfo::isReductionParallel(Node)) + NumReductionParallel++; + if (IslAstInfo::isExecutedInParallel(Node)) + NumExecutedInParallel++; + break; + + case isl_ast_node_if: + NumIfConditions++; + break; + + default: + break; + } + + // Continue traversing subtrees. + return isl_bool_true; + }, + nullptr); +} + IslAst::IslAst(Scop &Scop) : S(Scop), Root(nullptr), RunCondition(nullptr), Ctx(Scop.getSharedIslCtx()) {} @@ -421,6 +469,11 @@ if (!benefitsFromPolly(S, PerformParallelTest)) return; + auto ScopStats = S.getStatistics(); + ScopsBeneficial++; + BeneficialAffineLoops += ScopStats.NumAffineLoops; + BeneficialBoxedLoops += ScopStats.NumBoxedLoops; + isl_ctx *Ctx = S.getIslCtx(); isl_options_set_ast_build_atomic_upper_bound(Ctx, true); isl_options_set_ast_build_detect_min_max(Ctx, true); @@ -454,6 +507,7 @@ RunCondition = buildRunCondition(S, Build); Root = isl_ast_build_node_from_schedule(Build, S.getScheduleTree().release()); + walkAstForStatistics(Root); isl_ast_build_free(Build); } @@ -692,6 +746,8 @@ if (Scop.isToBeSkipped()) return false; + ScopsProcessed++; + const Dependences &D = getAnalysis().getDependences(Dependences::AL_Statement); Index: polly/trunk/lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- polly/trunk/lib/CodeGen/IslNodeBuilder.cpp +++ polly/trunk/lib/CodeGen/IslNodeBuilder.cpp @@ -53,6 +53,11 @@ STATISTIC(VersionedScops, "Number of SCoPs that required versioning."); +STATISTIC(SequentialLoops, "Number of generated sequential for-loops"); +STATISTIC(ParallelLoops, "Number of generated parallel for-loops"); +STATISTIC(VectorLoops, "Number of generated vector for-loops"); +STATISTIC(IfConditions, "Number of generated if-conditions"); + static cl::opt PollyGenerateRTCPrint( "polly-codegen-emit-rtc-print", cl::desc("Emit code that prints the runtime check result dynamically."), @@ -480,6 +485,8 @@ isl_ast_node_free(For); isl_ast_expr_free(Iterator); + + VectorLoops++; } namespace { @@ -571,6 +578,8 @@ isl_ast_node_free(For); isl_ast_expr_free(Iterator); isl_id_free(IteratorID); + + SequentialLoops++; } /// Remove the BBs contained in a (sub)function from the dominator tree. @@ -720,6 +729,8 @@ isl_ast_node_free(For); isl_ast_expr_free(Iterator); isl_id_free(IteratorID); + + ParallelLoops++; } /// Return whether any of @p Node's statements contain partial accesses. @@ -813,6 +824,8 @@ Builder.SetInsertPoint(&MergeBB->front()); isl_ast_node_free(If); + + IfConditions++; } __isl_give isl_id_to_ast_expr * Index: polly/trunk/lib/Support/RegisterPasses.cpp =================================================================== --- polly/trunk/lib/Support/RegisterPasses.cpp +++ polly/trunk/lib/Support/RegisterPasses.cpp @@ -328,13 +328,13 @@ PM.add(polly::createPolyhedralInfoPass()); if (EnableSimplify) - PM.add(polly::createSimplifyPass()); + PM.add(polly::createSimplifyPass(0)); if (EnableForwardOpTree) PM.add(polly::createForwardOpTreePass()); if (EnableDeLICM) PM.add(polly::createDeLICMPass()); if (EnableSimplify) - PM.add(polly::createSimplifyPass()); + PM.add(polly::createSimplifyPass(1)); if (ImportJScop) PM.add(polly::createJSONImporterPass()); Index: polly/trunk/lib/Transform/DeLICM.cpp =================================================================== --- polly/trunk/lib/Transform/DeLICM.cpp +++ polly/trunk/lib/Transform/DeLICM.cpp @@ -61,6 +61,16 @@ STATISTIC(TargetsMapped, "Number of stores used for at least one mapping"); STATISTIC(DeLICMScopsModified, "Number of SCoPs optimized"); +STATISTIC(NumValueWrites, "Number of scalar value writes after DeLICM"); +STATISTIC(NumValueWritesInLoops, + "Number of scalar value writes nested in affine loops after DeLICM"); +STATISTIC(NumPHIWrites, "Number of scalar phi writes after DeLICM"); +STATISTIC(NumPHIWritesInLoops, + "Number of scalar phi writes nested in affine loops after DeLICM"); +STATISTIC(NumSingletonWrites, "Number of singleton writes after DeLICM"); +STATISTIC(NumSingletonWritesInLoops, + "Number of singleton writes nested in affine loops after DeLICM"); + isl::union_map computeReachingOverwrite(isl::union_map Schedule, isl::union_map Writes, bool InclPrevWrite, @@ -1402,6 +1412,14 @@ collapseToUnused(S); + auto ScopStats = S.getStatistics(); + NumValueWrites += ScopStats.NumValueWrites; + NumValueWritesInLoops += ScopStats.NumValueWritesInLoops; + NumPHIWrites += ScopStats.NumPHIWrites; + NumPHIWritesInLoops += ScopStats.NumPHIWritesInLoops; + NumSingletonWrites += ScopStats.NumSingletonWrites; + NumSingletonWritesInLoops += ScopStats.NumSingletonWritesInLoops; + return false; } Index: polly/trunk/lib/Transform/ForwardOpTree.cpp =================================================================== --- polly/trunk/lib/Transform/ForwardOpTree.cpp +++ polly/trunk/lib/Transform/ForwardOpTree.cpp @@ -56,6 +56,16 @@ STATISTIC(ScopsModified, "Number of SCoPs with at least one forwarded tree"); +STATISTIC(NumValueWrites, "Number of scalar value writes after OpTree"); +STATISTIC(NumValueWritesInLoops, + "Number of scalar value writes nested in affine loops after OpTree"); +STATISTIC(NumPHIWrites, "Number of scalar phi writes after OpTree"); +STATISTIC(NumPHIWritesInLoops, + "Number of scalar phi writes nested in affine loops after OpTree"); +STATISTIC(NumSingletonWrites, "Number of singleton writes after OpTree"); +STATISTIC(NumSingletonWritesInLoops, + "Number of singleton writes nested in affine loops after OpTree"); + namespace { /// The state of whether an operand tree was/can be forwarded. @@ -844,6 +854,15 @@ DEBUG(dbgs() << "\nFinal Scop:\n"); DEBUG(dbgs() << S); + // Update statistics + auto ScopStats = S.getStatistics(); + NumValueWrites += ScopStats.NumValueWrites; + NumValueWritesInLoops += ScopStats.NumValueWritesInLoops; + NumPHIWrites += ScopStats.NumPHIWrites; + NumPHIWritesInLoops += ScopStats.NumPHIWritesInLoops; + NumSingletonWrites += ScopStats.NumSingletonWrites; + NumSingletonWritesInLoops += ScopStats.NumSingletonWritesInLoops; + return false; } Index: polly/trunk/lib/Transform/ScheduleOptimizer.cpp =================================================================== --- polly/trunk/lib/Transform/ScheduleOptimizer.cpp +++ polly/trunk/lib/Transform/ScheduleOptimizer.cpp @@ -237,6 +237,33 @@ "transformations is applied on the schedule tree"), cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); +STATISTIC(ScopsProcessed, "Number of scops processed"); +STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); +STATISTIC(ScopsOptimized, "Number of scops optimized"); + +STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized"); +STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized"); + +#define THREE_STATISTICS(VARNAME, DESC) \ + static llvm::Statistic VARNAME[3] = { \ + {DEBUG_TYPE, #VARNAME "0", DESC " (original)", {0}, false}, \ + {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)", {0}, false}, \ + {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)", {0}, false}} + +THREE_STATISTICS(NumBands, "Number of bands"); +THREE_STATISTICS(NumBandMembers, "Number of band members"); +THREE_STATISTICS(NumCoincident, "Number of coincident band members"); +THREE_STATISTICS(NumPermutable, "Number of permutable bands"); +THREE_STATISTICS(NumFilters, "Number of filter nodes"); +THREE_STATISTICS(NumExtension, "Number of extension nodes"); + +STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied"); +STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied"); +STATISTIC(RegisterTileOpts, "Number of register tiling applied"); +STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied"); +STATISTIC(MatMulOpts, + "Number of matrix multiplication patterns detected and optimized"); + /// Create an isl::union_set, which describes the isolate option based on /// IsolateDomain. /// @@ -368,6 +395,7 @@ if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf) Node = Node.parent(); auto LoopMarker = isl::id::alloc(Node.get_ctx(), "SIMD", nullptr); + PrevectOpts++; return Node.insert_mark(LoopMarker); } @@ -456,17 +484,23 @@ __isl_give isl::schedule_node ScheduleTreeOptimizer::standardBandOpts(isl::schedule_node Node, void *User) { - if (FirstLevelTiling) + if (FirstLevelTiling) { Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes, FirstLevelDefaultTileSize); + FirstLevelTileOpts++; + } - if (SecondLevelTiling) + if (SecondLevelTiling) { Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes, SecondLevelDefaultTileSize); + SecondLevelTileOpts++; + } - if (RegisterTiling) + if (RegisterTiling) { Node = applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize); + RegisterTileOpts++; + } if (PollyVectorizerChoice == VECTORIZER_NONE) return Node; @@ -1235,6 +1269,7 @@ isMatrMultPattern(isl::manage(isl_schedule_node_copy(Node)), OAI->D, MMI)) { DEBUG(dbgs() << "The matrix multiplication pattern was detected\n"); + MatMulOpts++; return optimizeMatMulPattern(isl::manage(Node), OAI->TTI, MMI).release(); } @@ -1308,6 +1343,52 @@ char IslScheduleOptimizer::ID = 0; +/// Collect statistics for the schedule tree. +/// +/// @param Schedule The schedule tree to analyze. If not a schedule tree it is +/// ignored. +/// @param Version The version of the schedule tree that is analyzed. +/// 0 for the original schedule tree before any transformation. +/// 1 for the schedule tree after isl's rescheduling. +/// 2 for the schedule tree after optimizations are applied +/// (tiling, pattern matching) +static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { + auto Root = Schedule.get_root(); + if (!Root) + return; + + Root.foreach_ancestor_top_down([Version]( + isl::schedule_node Node) -> isl::stat { + switch (isl_schedule_node_get_type(Node.get())) { + case isl_schedule_node_band: { + NumBands[Version]++; + if (isl_schedule_node_band_get_permutable(Node.get()) == isl_bool_true) + NumPermutable[Version]++; + + int CountMembers = isl_schedule_node_band_n_member(Node.get()); + NumBandMembers[Version] += CountMembers; + for (int i = 0; i < CountMembers; i += 1) { + if (Node.band_member_get_coincident(i)) + NumCoincident[Version]++; + } + } break; + + case isl_schedule_node_filter: + NumFilters[Version]++; + break; + + case isl_schedule_node_extension: + NumExtension[Version]++; + break; + + default: + break; + } + + return isl::stat::ok; + }); +} + bool IslScheduleOptimizer::runOnScop(Scop &S) { // Skip SCoPs in case they're already optimised by PPCGCodeGeneration @@ -1352,6 +1433,9 @@ if (!Domain) return false; + ScopsProcessed++; + walkScheduleTreeForStatistics(S.getScheduleTree(), 0); + isl::union_map Validity = give(D.getDependences(ValidityKinds)); isl::union_map Proximity = give(D.getDependences(ProximityKinds)); @@ -1432,11 +1516,15 @@ auto Schedule = SC.compute_schedule(); isl_options_set_on_error(Ctx, OnErrorStatus); + walkScheduleTreeForStatistics(Schedule, 1); + // In cases the scheduler is not able to optimize the code, we just do not // touch the schedule. if (!Schedule) return false; + ScopsRescheduled++; + DEBUG({ auto *P = isl_printer_to_str(Ctx); P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); @@ -1451,10 +1539,16 @@ auto *TTI = &getAnalysis().getTTI(F); const OptimizerAdditionalInfoTy OAI = {TTI, const_cast(&D)}; auto NewSchedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); + walkScheduleTreeForStatistics(NewSchedule, 1); if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) return false; + auto ScopStats = S.getStatistics(); + ScopsOptimized++; + NumAffineLoopsOptimized += ScopStats.NumAffineLoops; + NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops; + S.setScheduleTree(NewSchedule.release()); S.markAsOptimized(); Index: polly/trunk/lib/Transform/Simplify.cpp =================================================================== --- polly/trunk/lib/Transform/Simplify.cpp +++ polly/trunk/lib/Transform/Simplify.cpp @@ -27,25 +27,44 @@ namespace { +#define TWO_STATISTICS(VARNAME, DESC) \ + static llvm::Statistic VARNAME[2] = { \ + {DEBUG_TYPE, #VARNAME "0", DESC " (first)", {0}, false}, \ + {DEBUG_TYPE, #VARNAME "1", DESC " (second)", {0}, false}} + /// Number of max disjuncts we allow in removeOverwrites(). This is to avoid /// that the analysis of accesses in a statement is becoming too complex. Chosen /// to be relatively small because all the common cases should access only few /// array elements per statement. static int const SimplifyMaxDisjuncts = 4; -STATISTIC(ScopsProcessed, "Number of SCoPs processed"); -STATISTIC(ScopsModified, "Number of SCoPs simplified"); +TWO_STATISTICS(ScopsProcessed, "Number of SCoPs processed"); +TWO_STATISTICS(ScopsModified, "Number of SCoPs simplified"); -STATISTIC(TotalOverwritesRemoved, "Number of removed overwritten writes"); -STATISTIC(TotalWritesCoalesced, "Number of writes coalesced with another"); -STATISTIC(TotalRedundantWritesRemoved, - "Number of writes of same value removed in any SCoP"); -STATISTIC(TotalEmptyPartialAccessesRemoved, - "Number of empty partial accesses removed"); -STATISTIC(TotalDeadAccessesRemoved, "Number of dead accesses removed"); -STATISTIC(TotalDeadInstructionsRemoved, - "Number of unused instructions removed"); -STATISTIC(TotalStmtsRemoved, "Number of statements removed in any SCoP"); +TWO_STATISTICS(TotalOverwritesRemoved, "Number of removed overwritten writes"); +TWO_STATISTICS(TotalWritesCoalesced, "Number of writes coalesced with another"); +TWO_STATISTICS(TotalRedundantWritesRemoved, + "Number of writes of same value removed in any SCoP"); +TWO_STATISTICS(TotalEmptyPartialAccessesRemoved, + "Number of empty partial accesses removed"); +TWO_STATISTICS(TotalDeadAccessesRemoved, "Number of dead accesses removed"); +TWO_STATISTICS(TotalDeadInstructionsRemoved, + "Number of unused instructions removed"); +TWO_STATISTICS(TotalStmtsRemoved, "Number of statements removed in any SCoP"); + +TWO_STATISTICS(NumValueWrites, "Number of scalar value writes after Simplify"); +TWO_STATISTICS( + NumValueWritesInLoops, + "Number of scalar value writes nested in affine loops after Simplify"); +TWO_STATISTICS(NumPHIWrites, + "Number of scalar phi writes after the first simplification"); +TWO_STATISTICS( + NumPHIWritesInLoops, + "Number of scalar phi writes nested in affine loops after Simplify"); +TWO_STATISTICS(NumSingletonWrites, "Number of singleton writes after Simplify"); +TWO_STATISTICS( + NumSingletonWritesInLoops, + "Number of singleton writes nested in affine loops after Simplify"); static bool isImplicitRead(MemoryAccess *MA) { return MA->isRead() && MA->isOriginalScalarKind(); @@ -100,6 +119,10 @@ class Simplify : public ScopPass { private: + /// The invocation id (if there are multiple instances in the pass manager's + /// pipeline) to determine which statistics to update. + int CallNo; + /// The last/current SCoP that is/has been processed. Scop *S; @@ -176,7 +199,7 @@ Stmt.removeSingleMemoryAccess(MA); OverwritesRemoved++; - TotalOverwritesRemoved++; + TotalOverwritesRemoved[CallNo]++; } // Unconditional writes overwrite other values. @@ -315,7 +338,7 @@ // We removed MA, OtherMA takes its role. MA = OtherMA; - TotalWritesCoalesced++; + TotalWritesCoalesced[CallNo]++; WritesCoalesced++; // Don't look for more candidates. @@ -437,7 +460,7 @@ Stmt.removeSingleMemoryAccess(MA); RedundantWritesRemoved++; - TotalRedundantWritesRemoved++; + TotalRedundantWritesRemoved[CallNo]++; } } } @@ -476,7 +499,7 @@ StmtsRemoved = NumStmtsBefore - S->getSize(); DEBUG(dbgs() << "Removed " << StmtsRemoved << " (of " << NumStmtsBefore << ") statements\n"); - TotalStmtsRemoved += StmtsRemoved; + TotalStmtsRemoved[CallNo] += StmtsRemoved; } /// Remove accesses that have an empty domain. @@ -501,7 +524,7 @@ for (MemoryAccess *MA : DeferredRemove) { Stmt.removeSingleMemoryAccess(MA); EmptyPartialAccessesRemoved++; - TotalEmptyPartialAccessesRemoved++; + TotalEmptyPartialAccessesRemoved[CallNo]++; } } } @@ -530,7 +553,7 @@ Stmt->removeSingleMemoryAccess(MA); DeadAccessesRemoved++; - TotalDeadAccessesRemoved++; + TotalDeadAccessesRemoved[CallNo]++; } // Remove all non-reachable instructions. @@ -548,7 +571,7 @@ DEBUG(dbgs() << "Removing "; Inst->print(dbgs()); dbgs() << " because it is not used\n"); DeadInstructionsRemoved++; - TotalDeadInstructionsRemoved++; + TotalDeadInstructionsRemoved[CallNo]++; continue; } @@ -595,7 +618,7 @@ public: static char ID; - explicit Simplify() : ScopPass(ID) {} + explicit Simplify(int CallNo = 0) : ScopPass(ID), CallNo(CallNo) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredTransitive(); @@ -610,7 +633,7 @@ // Prepare processing of this SCoP. this->S = &S; - ScopsProcessed++; + ScopsProcessed[CallNo]++; DEBUG(dbgs() << "Removing partial writes that never happen...\n"); removeEmptyPartialAccesses(); @@ -632,10 +655,18 @@ removeUnnecessaryStmts(); if (isModified()) - ScopsModified++; + ScopsModified[CallNo]++; DEBUG(dbgs() << "\nFinal Scop:\n"); DEBUG(dbgs() << S); + auto ScopStats = S.getStatistics(); + NumValueWrites[CallNo] += ScopStats.NumValueWrites; + NumValueWritesInLoops[CallNo] += ScopStats.NumValueWritesInLoops; + NumPHIWrites[CallNo] += ScopStats.NumPHIWrites; + NumPHIWritesInLoops[CallNo] += ScopStats.NumPHIWritesInLoops; + NumSingletonWrites[CallNo] += ScopStats.NumSingletonWrites; + NumSingletonWritesInLoops[CallNo] += ScopStats.NumSingletonWritesInLoops; + return false; } @@ -688,7 +719,7 @@ } } // namespace polly -Pass *polly::createSimplifyPass() { return new Simplify(); } +Pass *polly::createSimplifyPass(int CallNo) { return new Simplify(CallNo); } INITIALIZE_PASS_BEGIN(Simplify, "polly-simplify", "Polly - Simplify", false, false)