diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -572,7 +572,7 @@ PMBuilder.Inliner = createFunctionInliningPass( CodeGenOpts.OptimizationLevel, CodeGenOpts.OptimizeSize, (!CodeGenOpts.SampleProfileFile.empty() && - CodeGenOpts.PrepareForThinLTO)); + (CodeGenOpts.PrepareForThinLTO || CodeGenOpts.PrepareForLTO))); } PMBuilder.OptLevel = CodeGenOpts.OptimizationLevel; diff --git a/clang/test/CodeGen/pgo-sample-thinlto-summary.c b/clang/test/CodeGen/pgo-sample-thinlto-summary.c --- a/clang/test/CodeGen/pgo-sample-thinlto-summary.c +++ b/clang/test/CodeGen/pgo-sample-thinlto-summary.c @@ -1,9 +1,12 @@ +// Tests to ensure that *LTO pre-link compiles don't perform optimizations +// that can lead to subpar SamplePGO matching in the LTO backends. + // RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=SAMPLEPGO -// RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto=thin -o - 2>&1 | FileCheck %s -check-prefix=THINLTO +// RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto=thin -o - 2>&1 | FileCheck %s -check-prefix=LTO +// RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto -o - 2>&1 | FileCheck %s -check-prefix=LTO // RUN: %clang_cc1 -O2 -fexperimental-new-pass-manager -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=SAMPLEPGO -// RUN: %clang_cc1 -O2 -fexperimental-new-pass-manager -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto=thin -o - 2>&1 | FileCheck %s -check-prefix=THINLTO -// Checks if hot call is inlined by normal compile, but not inlined by -// thinlto compile. +// RUN: %clang_cc1 -O2 -fexperimental-new-pass-manager -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto=thin -o - 2>&1 | FileCheck %s -check-prefix=LTO +// RUN: %clang_cc1 -O2 -fexperimental-new-pass-manager -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto -o - 2>&1 | FileCheck %s -check-prefix=LTO int baz(int); int g; @@ -13,34 +16,38 @@ g += baz(i); } +// Checks if hot call is inlined by normal compile, but not inlined by +// (thin)lto pre-link compile. // SAMPLEPGO-LABEL: define {{(dso_local )?}}void @bar -// THINLTO-LABEL: define {{(dso_local )?}}void @bar +// LTO-LABEL: define {{(dso_local )?}}void @bar // SAMPLEPGO-NOT: call{{.*}}foo -// THINLTO: call{{.*}}foo +// LTO: call{{.*}}foo void bar(int n) { for (int i = 0; i < n; i++) foo(i); } -// Checks if loop unroll is invoked by normal compile, but not thinlto compile. +// Checks if loop unroll is invoked by normal compile, but not (thin)lto +// pre-link compile. // SAMPLEPGO-LABEL: define {{(dso_local )?}}void @unroll -// THINLTO-LABEL: define {{(dso_local )?}}void @unroll +// LTO-LABEL: define {{(dso_local )?}}void @unroll // SAMPLEPGO: call{{.*}}baz // SAMPLEPGO: call{{.*}}baz -// THINLTO: call{{.*}}baz -// THINLTO-NOT: call{{.*}}baz +// LTO: call{{.*}}baz +// LTO-NOT: call{{.*}}baz void unroll() { for (int i = 0; i < 2; i++) baz(i); } -// Checks that icp is not invoked for ThinLTO, but invoked for normal samplepgo. +// Checks that icp is not invoked for (Thin)LTO pre-link compile, but invoked +// for normal samplepgo. // SAMPLEPGO-LABEL: define {{(dso_local )?}}void @icp -// THINLTO-LABEL: define {{(dso_local )?}}void @icp +// LTO-LABEL: define {{(dso_local )?}}void @icp // SAMPLEPGO: if.true.direct_targ // FIXME: the following condition needs to be reversed once // LTOPreLinkDefaultPipeline is customized. -// THINLTO-NOT: if.true.direct_targ +// LTO-NOT: if.true.direct_targ void icp(void (*p)()) { p(); } diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -127,15 +127,15 @@ std::vector InnerPipeline; }; - /// ThinLTO phase. + /// LTO phase. /// - /// This enumerates the LLVM ThinLTO optimization phases. - enum class ThinLTOPhase { - /// No ThinLTO behavior needed. + /// This enumerates the LLVM (Thin)LTO optimization phases. + enum class LTOPhase { + /// No LTO behavior needed. None, - /// ThinLTO prelink (summary) phase. + /// LTO prelink phase. PreLink, - /// ThinLTO postlink (backend compile) phase. + /// LTO postlink (backend compile) phase. PostLink }; @@ -284,10 +284,9 @@ /// require some transformations for semantic reasons, they should explicitly /// build them. /// - /// \p Phase indicates the current ThinLTO phase. + /// \p Phase indicates the current LTO phase. FunctionPassManager - buildFunctionSimplificationPipeline(OptimizationLevel Level, - ThinLTOPhase Phase, + buildFunctionSimplificationPipeline(OptimizationLevel Level, LTOPhase Phase, bool DebugLogging = false); /// Construct the core LLVM module canonicalization and simplification @@ -304,10 +303,9 @@ /// require some transformations for semantic reasons, they should explicitly /// build them. /// - /// \p Phase indicates the current ThinLTO phase. + /// \p Phase indicates the current LTO phase. ModulePassManager - buildModuleSimplificationPipeline(OptimizationLevel Level, - ThinLTOPhase Phase, + buildModuleSimplificationPipeline(OptimizationLevel Level, LTOPhase Phase, bool DebugLogging = false); /// Construct the core LLVM module optimization pipeline. @@ -338,9 +336,10 @@ /// only intended for use when attempting to optimize code. If frontends /// require some transformations for semantic reasons, they should explicitly /// build them. - ModulePassManager buildPerModuleDefaultPipeline(OptimizationLevel Level, - bool DebugLogging = false, - bool LTOPreLink = false); + ModulePassManager + buildPerModuleDefaultPipeline(OptimizationLevel Level, + bool DebugLogging = false, + LTOPhase Phase = LTOPhase::None); /// Build a pre-link, ThinLTO-targeting default optimization pipeline to /// a pass manager. diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -383,10 +383,8 @@ C(LAM); } -FunctionPassManager -PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, - ThinLTOPhase Phase, - bool DebugLogging) { +FunctionPassManager PassBuilder::buildFunctionSimplificationPipeline( + OptimizationLevel Level, LTOPhase Phase, bool DebugLogging) { assert(Level != O0 && "Must request optimizations!"); FunctionPassManager FPM(DebugLogging); @@ -465,10 +463,10 @@ C(LPM2, Level); LPM2.addPass(LoopDeletionPass()); - // Do not enable unrolling in PreLinkThinLTO phase during sample PGO + // Do not enable unrolling in PreLink LTO phase during sample PGO // because it changes IR to makes profile annotation in back compile // inaccurate. - if ((Phase != ThinLTOPhase::PreLink || !PGOOpt || + if ((Phase != LTOPhase::PreLink || !PGOOpt || PGOOpt->Action != PGOOptions::SampleUse) && PTO.LoopUnrolling) LPM2.addPass(LoopFullUnrollPass(Level, /*OnlyWhenForced=*/false, @@ -648,10 +646,8 @@ return getInlineParams(OptLevel, SizeLevel); } -ModulePassManager -PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, - ThinLTOPhase Phase, - bool DebugLogging) { +ModulePassManager PassBuilder::buildModuleSimplificationPipeline( + OptimizationLevel Level, LTOPhase Phase, bool DebugLogging) { ModulePassManager MPM(DebugLogging); bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); @@ -659,9 +655,8 @@ // In ThinLTO mode, when flattened profile is used, all the available // profile information will be annotated in PreLink phase so there is // no need to load the profile again in PostLink. - bool LoadSampleProfile = - HasSampleProfile && - !(FlattenedProfileUsed && Phase == ThinLTOPhase::PostLink); + bool LoadSampleProfile = HasSampleProfile && !(FlattenedProfileUsed && + Phase == LTOPhase::PostLink); // During the ThinLTO backend phase we perform early indirect call promotion // here, before globalopt. Otherwise imported available_externally functions @@ -677,7 +672,7 @@ // command line. E.g. for flattened profiles where we will not be reloading // the sample profile in the ThinLTO backend, we ideally shouldn't have to // provide the sample profile file. - if (Phase == ThinLTOPhase::PostLink && !LoadSampleProfile) + if (Phase == LTOPhase::PostLink && !LoadSampleProfile) MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); // Do basic inference of function attributes from known properties of system @@ -709,18 +704,18 @@ // the debug info. MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, - Phase == ThinLTOPhase::PreLink)); + Phase == LTOPhase::PreLink)); // Cache ProfileSummaryAnalysis once to avoid the potential need to insert // RequireAnalysisPass for PSI before subsequent non-module passes. MPM.addPass(RequireAnalysisPass()); - // Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard - // for the profile annotation to be accurate in the ThinLTO backend. - if (Phase != ThinLTOPhase::PreLink) + // Do not invoke ICP in the LTOPrelink phase as it makes it hard + // for the profile annotation to be accurate in the LTO backend. + if (Phase != LTOPhase::PreLink) // We perform early indirect call promotion here, before globalopt. // This is important for the ThinLTO backend phase because otherwise // imported available_externally functions look unreferenced and are // removed. - MPM.addPass(PGOIndirectCallPromotion(Phase == ThinLTOPhase::PostLink, + MPM.addPass(PGOIndirectCallPromotion(Phase == LTOPhase::PostLink, true /* SamplePGO */)); } @@ -758,7 +753,7 @@ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM))); // Add all the requested passes for instrumentation PGO, if requested. - if (PGOOpt && Phase != ThinLTOPhase::PostLink && + if (PGOOpt && Phase != LTOPhase::PostLink && (PGOOpt->Action == PGOOptions::IRInstr || PGOOpt->Action == PGOOptions::IRUse)) { addPGOInstrPasses(MPM, DebugLogging, Level, @@ -767,7 +762,7 @@ PGOOpt->ProfileRemappingFile); MPM.addPass(PGOIndirectCallPromotion(false, false)); } - if (PGOOpt && Phase != ThinLTOPhase::PostLink && + if (PGOOpt && Phase != LTOPhase::PostLink && PGOOpt->CSAction == PGOOptions::CSIRInstr) MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile)); @@ -800,7 +795,7 @@ // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO // because it makes profile annotation in the backend inaccurate. InlineParams IP = getInlineParamsFromOptLevel(Level); - if (Phase == ThinLTOPhase::PreLink && PGOOpt && + if (Phase == LTOPhase::PreLink && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) IP.HotCallSiteThreshold = 0; MainCGPipeline.addPass(InlinerPass(IP)); @@ -946,6 +941,13 @@ OptimizePM.addPass(InstCombinePass()); + // Do not enable unrolling in PreLink LTO phase during sample PGO + // because it changes IR to makes profile annotation in back compile + // inaccurate. + bool DoLoopUnrolling = + (!LTOPreLink || !PGOOpt || PGOOpt->Action != PGOOptions::SampleUse) && + PTO.LoopUnrolling; + // Unroll small loops to hide loop backedge latency and saturate any parallel // execution resources of an out-of-order processor. We also then need to // clean up redundancies and loop invariant code. @@ -953,12 +955,12 @@ // combiner for cleanup here so that the unrolling and LICM can be pipelined // across the loop nests. // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && PTO.LoopUnrolling) { + if (EnableUnrollAndJam && DoLoopUnrolling) { OptimizePM.addPass( createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level))); } OptimizePM.addPass(LoopUnrollPass( - LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling, + LoopUnrollOptions(Level, /*OnlyWhenForced=*/!DoLoopUnrolling, PTO.ForgetAllSCEVInLoopUnroll))); OptimizePM.addPass(WarnMissedTransformationsPass()); OptimizePM.addPass(InstCombinePass()); @@ -1020,7 +1022,7 @@ ModulePassManager PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, - bool DebugLogging, bool LTOPreLink) { + bool DebugLogging, LTOPhase Phase) { assert(Level != O0 && "Must request optimizations for the default pipeline!"); ModulePassManager MPM(DebugLogging); @@ -1036,11 +1038,11 @@ MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); // Add the core simplification pipeline. - MPM.addPass(buildModuleSimplificationPipeline(Level, ThinLTOPhase::None, - DebugLogging)); + MPM.addPass(buildModuleSimplificationPipeline(Level, Phase, DebugLogging)); // Now add the optimization pipeline. - MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging, LTOPreLink)); + MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging, + Phase == LTOPhase::PreLink)); return MPM; } @@ -1065,7 +1067,7 @@ // If we are planning to perform ThinLTO later, we don't bloat the code with // unrolling/vectorization/... now. Just simplify the module as much as we // can. - MPM.addPass(buildModuleSimplificationPipeline(Level, ThinLTOPhase::PreLink, + MPM.addPass(buildModuleSimplificationPipeline(Level, LTOPhase::PreLink, DebugLogging)); // Run partial inlining pass to partially inline functions that have @@ -1116,7 +1118,7 @@ MPM.addPass(ForceFunctionAttrsPass()); // Add the core simplification pipeline. - MPM.addPass(buildModuleSimplificationPipeline(Level, ThinLTOPhase::PostLink, + MPM.addPass(buildModuleSimplificationPipeline(Level, LTOPhase::PostLink, DebugLogging)); // Now add the optimization pipeline. @@ -1130,8 +1132,7 @@ bool DebugLogging) { assert(Level != O0 && "Must request optimizations for the default pipeline!"); // FIXME: We should use a customized pre-link pipeline! - return buildPerModuleDefaultPipeline(Level, DebugLogging, - /* LTOPreLink */true); + return buildPerModuleDefaultPipeline(Level, DebugLogging, LTOPhase::PreLink); } ModulePassManager @@ -1151,7 +1152,7 @@ // Load sample profile before running the LTO optimization pipeline. MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, - false /* ThinLTOPhase::PreLink */)); + false /* LTOPhase::PreLink */)); // Cache ProfileSummaryAnalysis once to avoid the potential need to insert // RequireAnalysisPass for PSI before subsequent non-module passes. MPM.addPass(RequireAnalysisPass()); diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -498,12 +498,12 @@ MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true, !PGOSampleUse.empty())); - // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops + // For SamplePGO in the *LTO compile phase, we do not want to unroll loops // as it will change the CFG too much to make the 2nd profile annotation // in backend more difficult. - bool PrepareForThinLTOUsingPGOSampleProfile = - PrepareForThinLTO && !PGOSampleUse.empty(); - if (PrepareForThinLTOUsingPGOSampleProfile) + bool PrepareForLTOUsingPGOSampleProfile = + (PrepareForThinLTO || PrepareForLTO) && !PGOSampleUse.empty(); + if (PrepareForLTOUsingPGOSampleProfile) DisableUnrollLoops = true; // Infer attributes about declarations if possible. @@ -530,12 +530,12 @@ addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE - // For SamplePGO in ThinLTO compile phase, we do not want to do indirect + // For SamplePGO in the *LTO compile phase, we do not want to do indirect // call promotion as it will change the CFG too much to make the 2nd // profile annotation in backend more difficult. - // PGO instrumentation is added during the compile phase for ThinLTO, do + // PGO instrumentation is added during the compile phase for *LTO, do // not run it a second time - if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile) + if (DefaultOrPreLinkPipeline && !PrepareForLTOUsingPGOSampleProfile) addPGOInstrPasses(MPM); // Create profile COMDAT variables. Lld linker wants to see all variables