Index: llvm/trunk/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Scalar.h +++ llvm/trunk/include/llvm/Transforms/Scalar.h @@ -267,7 +267,7 @@ // FunctionPass *createCFGSimplificationPass( unsigned Threshold = 1, bool ForwardSwitchCond = false, - bool ConvertSwitch = false, bool KeepLoops = true, + bool ConvertSwitch = false, bool KeepLoops = true, bool SinkCommon = false, std::function<bool(const Function &)> Ftor = nullptr); //===----------------------------------------------------------------------===// Index: llvm/trunk/include/llvm/Transforms/Scalar/SimplifyCFG.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Scalar/SimplifyCFG.h +++ llvm/trunk/include/llvm/Transforms/Scalar/SimplifyCFG.h @@ -39,7 +39,8 @@ : SimplifyCFGPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(false) .convertSwitchToLookupTable(false) - .needCanonicalLoops(true)) {} + .needCanonicalLoops(true) + .sinkCommonInsts(false)) {} /// Construct a pass with optional optimizations. 
Index: llvm/trunk/include/llvm/Transforms/Utils/Local.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Utils/Local.h +++ llvm/trunk/include/llvm/Transforms/Utils/Local.h @@ -63,16 +63,20 @@ bool ForwardSwitchCondToPhi; bool ConvertSwitchToLookupTable; bool NeedCanonicalLoop; + bool SinkCommonInsts; AssumptionCache *AC; SimplifyCFGOptions(unsigned BonusThreshold = 1, bool ForwardSwitchCond = false, bool SwitchToLookup = false, bool CanonicalLoops = true, + bool SinkCommon = false, AssumptionCache *AssumpCache = nullptr) : BonusInstThreshold(BonusThreshold), ForwardSwitchCondToPhi(ForwardSwitchCond), ConvertSwitchToLookupTable(SwitchToLookup), - NeedCanonicalLoop(CanonicalLoops), AC(AssumpCache) {} + NeedCanonicalLoop(CanonicalLoops), + SinkCommonInsts(SinkCommon), + AC(AssumpCache) {} // Support 'builder' pattern to set members by name at construction time. SimplifyCFGOptions &bonusInstThreshold(int I) { @@ -91,6 +95,10 @@ NeedCanonicalLoop = B; return *this; } + SimplifyCFGOptions &sinkCommonInsts(bool B) { + SinkCommonInsts = B; + return *this; + } SimplifyCFGOptions &setAssumptionCache(AssumptionCache *Cache) { AC = Cache; return *this; Index: llvm/trunk/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/trunk/lib/Passes/PassBuilder.cpp +++ llvm/trunk/lib/Passes/PassBuilder.cpp @@ -747,21 +747,24 @@ // Cleanup after the loop optimization passes. OptimizePM.addPass(InstCombinePass()); - // Now that we've formed fast to execute loop structures, we do further // optimizations. These are run afterward as they might block doing complex // analyses and transforms such as what are needed for loop vectorization. - // Optimize parallel scalar instruction chains into SIMD instructions. - OptimizePM.addPass(SLPVectorizerPass()); - - // Cleanup after all of the vectorizers. Simplification passes like CVP and + // Cleanup after loop vectorization, etc. 
Simplification passes like CVP and // GVN, loop transforms, and others have already run, so it's now better to // convert to more optimized IR using more aggressive simplify CFG options. + // The extra sinking transform can create larger basic blocks, so do this + // before SLP vectorization. OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions(). - forwardSwitchCondToPhi(true). - convertSwitchToLookupTable(true). - needCanonicalLoops(false))); + forwardSwitchCondToPhi(true). + convertSwitchToLookupTable(true). + needCanonicalLoops(false). + sinkCommonInsts(true))); + + // Optimize parallel scalar instruction chains into SIMD instructions. + OptimizePM.addPass(SLPVectorizerPass()); + OptimizePM.addPass(InstCombinePass()); // Unroll small loops to hide loop backedge latency and saturate any parallel Index: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -365,7 +365,7 @@ // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) - addPass(createCFGSimplificationPass(1, true, true, false)); + addPass(createCFGSimplificationPass(1, true, true, false, true)); // Run LoopDataPrefetch // Index: llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp +++ llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp @@ -385,7 +385,7 @@ // ldrex/strex loops to simplify this, but it needs tidying up. 
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass( - 1, false, false, true, [this](const Function &F) { + 1, false, false, true, true, [this](const Function &F) { const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F); return ST.hasAnyDataBarrier() && !ST.isThumb1Only(); })); Index: llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -630,6 +630,13 @@ addInstructionCombiningPass(MPM); } + // Cleanup after loop vectorization, etc. Simplification passes like CVP and + // GVN, loop transforms, and others have already run, so it's now better to + // convert to more optimized IR using more aggressive simplify CFG options. + // The extra sinking transform can create larger basic blocks, so do this + // before SLP vectorization. + MPM.add(createCFGSimplificationPass(1, true, true, false, true)); + if (RunSLPAfterLoopVectorization && SLPVectorize) { MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. if (OptLevel > 1 && ExtraVectorizerPasses) { @@ -638,9 +645,6 @@ } addExtensionsToPM(EP_Peephole, MPM); - // Switches to lookup tables and other transforms that may not be considered - // canonical by other IR passes. 
- MPM.add(createCFGSimplificationPass(1, true, true, false)); addInstructionCombiningPass(MPM); if (!DisableUnrollLoops) { Index: llvm/trunk/lib/Transforms/Scalar/SimplifyCFGPass.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ llvm/trunk/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -61,6 +61,11 @@ "forward-switch-cond", cl::Hidden, cl::init(false), cl::desc("Forward switch condition to phi ops (default = false)")); +static cl::opt<bool> UserSinkCommonInsts( + "sink-common-insts", cl::Hidden, cl::init(false), + cl::desc("Sink common instructions (default = false)")); + + STATISTIC(NumSimpl, "Number of blocks simplified"); /// If we have more than one empty (other than phi node) return blocks, @@ -205,6 +210,9 @@ Options.NeedCanonicalLoop = UserKeepLoops.getNumOccurrences() ? UserKeepLoops : Opts.NeedCanonicalLoop; + Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences() + ? UserSinkCommonInsts + : Opts.SinkCommonInsts; } PreservedAnalyses SimplifyCFGPass::run(Function &F, @@ -226,6 +234,7 @@ CFGSimplifyPass(unsigned Threshold = 1, bool ForwardSwitchCond = false, bool ConvertSwitch = false, bool KeepLoops = true, + bool SinkCommon = false, std::function<bool(const Function &)> Ftor = nullptr) : FunctionPass(ID), PredicateFtor(std::move(Ftor)) { @@ -246,6 +255,10 @@ Options.NeedCanonicalLoop = UserKeepLoops.getNumOccurrences() ? UserKeepLoops : KeepLoops; + + Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences() + ? 
UserSinkCommonInsts + : SinkCommon; } bool runOnFunction(Function &F) override { @@ -276,7 +289,8 @@ FunctionPass * llvm::createCFGSimplificationPass(unsigned Threshold, bool ForwardSwitchCond, bool ConvertSwitch, bool KeepLoops, + bool SinkCommon, std::function<bool(const Function &)> Ftor) { return new CFGSimplifyPass(Threshold, ForwardSwitchCond, ConvertSwitch, - KeepLoops, std::move(Ftor)); + KeepLoops, SinkCommon, std::move(Ftor)); } Index: llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp +++ llvm/trunk/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5728,7 +5728,7 @@ BasicBlock *BB = BI->getParent(); BasicBlock *Succ = BI->getSuccessor(0); - if (SinkCommon && SinkThenElseCodeToEnd(BI)) + if (SinkCommon && Options.SinkCommonInsts && SinkThenElseCodeToEnd(BI)) return true; // If the Terminator is the only non-phi instruction, simplify the block. Index: llvm/trunk/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll =================================================================== --- llvm/trunk/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll +++ llvm/trunk/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll @@ -1,4 +1,4 @@ -; RUN: opt -simplifycfg -S < %s | FileCheck %s +; RUN: opt -simplifycfg -sink-common-insts -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: llvm/trunk/test/Other/new-pm-defaults.ll =================================================================== --- llvm/trunk/test/Other/new-pm-defaults.ll +++ llvm/trunk/test/Other/new-pm-defaults.ll @@ -197,8 +197,8 @@ ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-O-NEXT: 
Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy Index: llvm/trunk/test/Other/new-pm-thinlto-defaults.ll =================================================================== --- llvm/trunk/test/Other/new-pm-thinlto-defaults.ll +++ llvm/trunk/test/Other/new-pm-thinlto-defaults.ll @@ -185,8 +185,8 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass -; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy Index: llvm/trunk/test/Transforms/PhaseOrdering/simplifycfg-options.ll =================================================================== --- llvm/trunk/test/Transforms/PhaseOrdering/simplifycfg-options.ll +++ llvm/trunk/test/Transforms/PhaseOrdering/simplifycfg-options.ll @@ -76,10 +76,8 @@ ; ALL-NEXT: [[XI:%.*]] = load double, double* [[XI_PTR]], align 8 ; ALL-NEXT: [[YI:%.*]] = load double, double* [[YI_PTR]], align 8 ; ALL-NEXT: [[CMP:%.*]] = fcmp ogt double [[XI]], [[YI]] -; ALL-NEXT: [[Y_SINK:%.*]] = select i1 [[CMP]], double* [[X]], double* [[Y]] -; ALL-NEXT: [[YI_PTR_AGAIN:%.*]] = getelementptr double, double* [[Y_SINK]], i64 [[I]] -; ALL-NEXT: [[YI_AGAIN:%.*]] = load double, double* [[YI_PTR_AGAIN]], align 8 -; ALL-NEXT: ret double [[YI_AGAIN]] +; ALL-NEXT: [[XI_YI:%.*]] = select i1 [[CMP]], double [[XI]], double [[YI]] +; ALL-NEXT: ret double [[XI_YI]] ; entry: %xi_ptr = getelementptr double, double* %x, i64 %i Index: llvm/trunk/test/Transforms/SimplifyCFG/no-md-sink.ll =================================================================== --- 
llvm/trunk/test/Transforms/SimplifyCFG/no-md-sink.ll +++ llvm/trunk/test/Transforms/SimplifyCFG/no-md-sink.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -simplifycfg -S | FileCheck %s +; RUN: opt < %s -simplifycfg -sink-common-insts -S | FileCheck %s define i1 @test1(i1 zeroext %flag, i8* %y) #0 { entry: Index: llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll =================================================================== --- llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll +++ llvm/trunk/test/Transforms/SimplifyCFG/sink-common-code.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -simplifycfg -S | FileCheck -enable-var-scope %s +; RUN: opt < %s -simplifycfg -sink-common-insts -S | FileCheck -enable-var-scope %s define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) { entry: