Index: llvm/trunk/include/llvm/Passes/PassBuilder.h =================================================================== --- llvm/trunk/include/llvm/Passes/PassBuilder.h +++ llvm/trunk/include/llvm/Passes/PassBuilder.h @@ -274,7 +274,8 @@ /// require some transformations for semantic reasons, they should explicitly /// build them. ModulePassManager buildModuleOptimizationPipeline(OptimizationLevel Level, - bool DebugLogging = false); + bool DebugLogging = false, + bool LTOPreLink = false); /// Build a per-module default optimization pipeline. /// @@ -288,7 +289,8 @@ /// require some transformations for semantic reasons, they should explicitly /// build them. ModulePassManager buildPerModuleDefaultPipeline(OptimizationLevel Level, - bool DebugLogging = false); + bool DebugLogging = false, + bool LTOPreLink = false); /// Build a pre-link, ThinLTO-targeting default optimization pipeline to /// a pass manager. Index: llvm/trunk/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/trunk/lib/Passes/PassBuilder.cpp +++ llvm/trunk/lib/Passes/PassBuilder.cpp @@ -703,14 +703,6 @@ if (EnableSyntheticCounts && !PGOOpt) MPM.addPass(SyntheticCountsPropagation()); - // Split out cold code. Splitting is done before inlining because 1) the most - // common kinds of cold regions can (a) be found before inlining and (b) do - // not grow after inlining, and 2) inhibiting inlining of cold code improves - // code size & compile time. Split after Mem2Reg to make code model estimates - // more accurate, but before InstCombine to allow it to clean things up. - if (EnableHotColdSplit && Phase != ThinLTOPhase::PostLink) - MPM.addPass(HotColdSplittingPass()); - // Require the GlobalsAA analysis for the module so we can query it within // the CGSCC pipeline. 
MPM.addPass(RequireAnalysisPass()); @@ -769,9 +761,8 @@ return MPM; } -ModulePassManager -PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, - bool DebugLogging) { +ModulePassManager PassBuilder::buildModuleOptimizationPipeline( + OptimizationLevel Level, bool DebugLogging, bool LTOPreLink) { ModulePassManager MPM(DebugLogging); // Optimize globals now that the module is fully simplified. @@ -880,6 +871,12 @@ // alignment information, try to re-derive it here. OptimizePM.addPass(AlignmentFromAssumptionsPass()); + // Split out cold code. Splitting is done late to avoid hiding context from + // other optimizations and inadvertently regressing performance. The tradeoff + // is that this has a higher code size cost than splitting early. + if (EnableHotColdSplit && !LTOPreLink) + MPM.addPass(HotColdSplittingPass()); + // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -923,7 +920,7 @@ ModulePassManager PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, - bool DebugLogging) { + bool DebugLogging, bool LTOPreLink) { assert(Level != O0 && "Must request optimizations for the default pipeline!"); ModulePassManager MPM(DebugLogging); @@ -943,7 +940,7 @@ DebugLogging)); // Now add the optimization pipeline. - MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging)); + MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging, LTOPreLink)); return MPM; } @@ -1027,7 +1024,8 @@ bool DebugLogging) { assert(Level != O0 && "Must request optimizations for the default pipeline!"); // FIXME: We should use a customized pre-link pipeline! - return buildPerModuleDefaultPipeline(Level, DebugLogging); + return buildPerModuleDefaultPipeline(Level, DebugLogging, + /*LTOPreLink=*/true); } ModulePassManager @@ -1208,6 +1206,11 @@ // CFI is disabled. 
MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); + // Enable splitting late in the FullLTO post-link pipeline. This is done in + // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses). + if (EnableHotColdSplit) + MPM.addPass(HotColdSplittingPass()); + // Add late LTO optimization passes. // Delete basic blocks, which optimization passes may have killed. MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass())); Index: llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -529,11 +529,6 @@ if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile) addPGOInstrPasses(MPM); - // Split out cold code before inlining. See comment in the new PM - // (\ref buildModuleSimplificationPipeline). - if (EnableHotColdSplit && DefaultOrPreLinkPipeline) - MPM.add(createHotColdSplittingPass()); - // We add a module alias analysis pass here. In part due to bugs in the // analysis infrastructure this "works" in that the analysis stays alive // for the entire SCC pass run below. @@ -730,6 +725,11 @@ MPM.add(createConstantMergePass()); // Merge dup global constants } + // See comment in the new PM for justification of scheduling splitting at + // this stage (\ref buildModuleOptimizationPipeline). + if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO)) + MPM.add(createHotColdSplittingPass()); + if (MergeFunctions) MPM.add(createMergeFunctionsPass()); @@ -918,6 +918,11 @@ void PassManagerBuilder::addLateLTOOptimizationPasses( legacy::PassManagerBase &PM) { + // See comment in the new PM for justification of scheduling splitting at + // this stage (\ref buildLTODefaultPipeline). + if (EnableHotColdSplit) + PM.add(createHotColdSplittingPass()); + // Delete basic blocks, which optimization passes may have killed. 
PM.add(createCFGSimplificationPass()); Index: llvm/trunk/test/Other/X86/lto-hot-cold-split.ll =================================================================== --- llvm/trunk/test/Other/X86/lto-hot-cold-split.ll +++ llvm/trunk/test/Other/X86/lto-hot-cold-split.ll @@ -1,10 +1,10 @@ ; RUN: opt -module-summary %s -o %t.bc -; RUN: llvm-lto -hot-cold-split=true -thinlto-action=run %t.bc -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=OLDPM-THINLTO-POSTLINK-Os +; RUN: llvm-lto -hot-cold-split=true -thinlto-action=run %t.bc -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=OLDPM-ANYLTO-POSTLINK-Os +; RUN: llvm-lto -hot-cold-split=true %t.bc -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=OLDPM-ANYLTO-POSTLINK-Os ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; OLDPM-THINLTO-POSTLINK-Os-LABEL: Pass Arguments -; OLDPM-THINLTO-POSTLINK-Os-NOT: Hot Cold Splitting +; OLDPM-ANYLTO-POSTLINK-Os: Hot Cold Splitting Index: llvm/trunk/test/Other/new-pm-pgo.ll =================================================================== --- llvm/trunk/test/Other/new-pm-pgo.ll +++ llvm/trunk/test/Other/new-pm-pgo.ll @@ -13,7 +13,6 @@ ; GEN: Running pass: PGOInstrumentationGen ; USE: Running pass: PGOInstrumentationUse ; USE: Running pass: PGOIndirectCallPromotion -; SPLIT: Running pass: HotColdSplittingPass ; USE: Running pass: PGOMemOPSizeOpt ; SAMPLE_USE_O: Running pass: ModuleToFunctionPassAdaptor<{{.*}}AddDiscriminatorsPass{{.*}}> ; SAMPLE_USE_PRE_LINK: Running pass: ModuleToFunctionPassAdaptor<{{.*}}AddDiscriminatorsPass{{.*}}> @@ -27,6 +26,7 @@ ; SAMPLE_USE_POST_LINK-NOT: Running pass: GlobalOptPass ; SAMPLE_USE_POST_LINK: Running pass: PGOIndirectCallPromotion ; SAMPLE_GEN: Running pass: ModuleToFunctionPassAdaptor<{{.*}}AddDiscriminatorsPass{{.*}}> +; SPLIT: Running pass: HotColdSplittingPass define void @foo() { ret void Index: llvm/trunk/test/Other/opt-hot-cold-split.ll 
=================================================================== --- llvm/trunk/test/Other/opt-hot-cold-split.ll +++ llvm/trunk/test/Other/opt-hot-cold-split.ll @@ -1,22 +1,19 @@ ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=DEFAULT-Os ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='lto-pre-link' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=LTO-PRELINK-Os ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='thinlto-pre-link' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=THINLTO-PRELINK-Os +; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='lto' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=LTO-POSTLINK-Os ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='thinlto' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=THINLTO-POSTLINK-Os ; REQUIRES: asserts -; Splitting should occur after Mem2Reg and should be followed by InstCombine. +; Splitting should occur late. -; DEFAULT-Os: Promote Memory to Register ; DEFAULT-Os: Hot Cold Splitting -; DEFAULT-Os: Combine redundant instructions +; DEFAULT-Os: Simplify the CFG -; LTO-PRELINK-Os-LABEL: Starting llvm::Module pass manager run. 
-; LTO-PRELINK-Os: Running pass: {{.*}}PromotePass -; LTO-PRELINK-Os: Running pass: HotColdSplittingPass +; LTO-PRELINK-Os-NOT: pass: HotColdSplittingPass -; THINLTO-PRELINK-Os-LABEL: Running analysis: PassInstrumentationAnalysis -; THINLTO-PRELINK-Os: Running pass: {{.*}}PromotePass -; THINLTO-PRELINK-Os: Running pass: HotColdSplittingPass +; THINLTO-PRELINK-Os-NOT: Running pass: HotColdSplittingPass -; THINLTO-POSTLINK-Os-NOT: HotColdSplitting +; LTO-POSTLINK-Os: HotColdSplitting +; THINLTO-POSTLINK-Os: HotColdSplitting Index: llvm/trunk/test/Other/pass-pipelines.ll =================================================================== --- llvm/trunk/test/Other/pass-pipelines.ll +++ llvm/trunk/test/Other/pass-pipelines.ll @@ -41,7 +41,6 @@ ; PGOUSE: Function Integration/Inlining ; PGOUSE: PGOInstrumentationUsePass ; PGOUSE: PGOIndirectCallPromotion -; SPLIT: Hot Cold Splitting ; PGOUSE: CallGraph Construction ; CHECK-O2-NEXT: Globals Alias Analysis ; CHECK-O2-NEXT: Call Graph SCC Pass Manager @@ -100,6 +99,7 @@ ; the runtime unrolling though. ; CHECK-O2: Loop Pass Manager ; CHECK-O2-NEXT: Loop Invariant Code Motion +; SPLIT: Hot Cold Splitting ; CHECK-O2: FunctionPass Manager ; CHECK-O2: Loop Pass Manager ; CHECK-O2-NEXT: Loop Sink