diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -80,6 +80,50 @@
 extern cl::opt<bool> EnableLoopInterleaving;
 extern cl::opt<bool> EnableLoopVectorization;
 
+/// A marker to determine if extra passes after loop vectorization should be
+/// run. It carries no data; ExtraVectorPassManager only checks whether a
+/// result is cached for the function.
+struct ShouldRunExtraVectorPasses
+    : public AnalysisInfoMixin<ShouldRunExtraVectorPasses> {
+  static AnalysisKey Key;
+  struct Result {};
+
+  Result run(Function &F, FunctionAnalysisManager &FAM) { return Result(); }
+};
+
+/// A pass manager to conditionally run a set of extra function passes after
+/// vectorization. The manager contains 2 sets of passes: the first set is run
+/// unconditionally and the second set is only run if the
+/// ShouldRunExtraVectorPasses analysis is cached (LoopVectorize sets it if
+/// extra simplifications could be beneficial).
+class ExtraVectorPassManager : public FunctionPassManager {
+  /// Set of passes to run conditionally.
+  FunctionPassManager ConditionalPasses;
+
+public:
+  /// Add \p P to the set of passes that are only run conditionally.
+  template <typename PassT> void addConditionalPass(PassT &&P) {
+    // P is a forwarding reference, so forward it instead of unconditionally
+    // moving from what may be an lvalue argument.
+    ConditionalPasses.addPass(std::forward<PassT>(P));
+  }
+
+  /// Run the unconditional passes on \p F, then the conditional passes if the
+  /// ShouldRunExtraVectorPasses result was cached on entry.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
+    // Check the marker up front, before any of the passes below run.
+    const bool RunExtraPasses =
+        AM.getCachedResult<ShouldRunExtraVectorPasses>(F) != nullptr;
+
+    auto PA = FunctionPassManager::run(F, AM);
+    if (RunExtraPasses)
+      PA.intersect(ConditionalPasses.run(F, AM));
+    // Never report the marker as preserved, whatever the passes returned.
+    PA.abandon<ShouldRunExtraVectorPasses>();
+    return PA;
+  }
+};
+
 struct LoopVectorizeOptions {
   /// If false, consider all loops for interleaving.
   /// If true, only loops that explicitly request interleaving are considered.
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -964,6 +964,12 @@
   FPM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
 
+  // Collect the follow-up passes in ExtraPasses only if it is added to FPM
+  // below; otherwise they would be silently dropped (e.g. at O1).
+  const bool UseExtraPasses =
+      Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses;
+  ExtraVectorPassManager ExtraPasses;
+  FunctionPassManager &NextPasses = UseExtraPasses ? ExtraPasses : FPM;
   if (IsFullLTO) {
     // The vectorizer may have significantly shortened a loop body; unroll
     // again. Unroll small loops to hide loop backedge latency and saturate any
@@ -974,21 +980,21 @@
     // across the loop nests.
     // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
     if (EnableUnrollAndJam && PTO.LoopUnrolling)
-      FPM.addPass(createFunctionToLoopPassAdaptor(
+      NextPasses.addPass(createFunctionToLoopPassAdaptor(
           LoopUnrollAndJamPass(Level.getSpeedupLevel())));
-    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+    NextPasses.addPass(LoopUnrollPass(LoopUnrollOptions(
         Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
         PTO.ForgetAllSCEVInLoopUnroll)));
-    FPM.addPass(WarnMissedTransformationsPass());
+    NextPasses.addPass(WarnMissedTransformationsPass());
   }
 
   if (!IsFullLTO) {
     // Eliminate loads by forwarding stores from the previous iteration to loads
     // of the current iteration.
-    FPM.addPass(LoopLoadEliminationPass());
+    NextPasses.addPass(LoopLoadEliminationPass());
   }
   // Cleanup after the loop optimization passes.
-  FPM.addPass(InstCombinePass());
+  NextPasses.addPass(InstCombinePass());
 
-  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+  if (UseExtraPasses) {
     // At higher optimization levels, try to clean up any runtime overlap and
@@ -997,20 +1003,21 @@
     // common computations, hoist loop-invariant aspects out of any outer loop,
     // and unswitch the runtime checks if possible. Once hoisted, we may have
     // dead (or speculatable) control flows or more combining opportunities.
-    FPM.addPass(EarlyCSEPass());
-    FPM.addPass(CorrelatedValuePropagationPass());
-    FPM.addPass(InstCombinePass());
+    ExtraPasses.addConditionalPass(EarlyCSEPass());
+    ExtraPasses.addConditionalPass(CorrelatedValuePropagationPass());
+    ExtraPasses.addConditionalPass(InstCombinePass());
     LoopPassManager LPM;
     LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
     LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
                                        OptimizationLevel::O3));
-    FPM.addPass(
+    ExtraPasses.addConditionalPass(
         RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
-    FPM.addPass(
+    ExtraPasses.addConditionalPass(
         createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
                                         /*UseBlockFrequencyInfo=*/true));
-    FPM.addPass(SimplifyCFGPass());
-    FPM.addPass(InstCombinePass());
+    ExtraPasses.addConditionalPass(SimplifyCFGPass());
+    ExtraPasses.addConditionalPass(InstCombinePass());
+    FPM.addPass(std::move(ExtraPasses));
   }
 
   // Now that we've formed fast to execute loop structures, we do further
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -202,6 +202,7 @@
 FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis())
 FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis())
 FUNCTION_ANALYSIS("should-not-run-function-passes", ShouldNotRunFunctionPassesAnalysis())
+FUNCTION_ANALYSIS("should-run-extra-vector-passes", ShouldRunExtraVectorPasses())
 FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis())
 FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
 FUNCTION_ANALYSIS("targetir",
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -428,6 +428,8 @@
 
 namespace llvm {
 
+AnalysisKey ShouldRunExtraVectorPasses::Key;
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -10747,8 +10749,15 @@
       PA.preserve<LoopAnalysis>();
       PA.preserve<DominatorTreeAnalysis>();
     }
-    if (!Result.MadeCFGChange)
+
+    if (Result.MadeCFGChange) {
+      // Making CFG changes likely means a loop got vectorized. Indicate that
+      // extra simplification passes should be run.
+      AM.getResult<ShouldRunExtraVectorPasses>(F);
+      PA.preserve<ShouldRunExtraVectorPasses>();
+    } else {
       PA.preserveSet<CFGAnalyses>();
+    }
     return PA;
 }
 
diff --git a/llvm/test/Other/opt-pipeline-vector-passes.ll b/llvm/test/Other/opt-pipeline-vector-passes.ll
--- a/llvm/test/Other/opt-pipeline-vector-passes.ll
+++ b/llvm/test/Other/opt-pipeline-vector-passes.ll
@@ -1,6 +1,7 @@
-; RUN: opt -disable-verify -debug-pass-manager -passes='default<O1>' -S %s 2>&1 | FileCheck %s --check-prefixes=O1
-; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -S %s 2>&1 | FileCheck %s --check-prefixes=O2
-; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=O2_EXTRA
+; RUN: opt -disable-verify -debug-pass-manager -passes='default<O1>' -force-vector-width=4 -S %s 2>&1 | FileCheck %s --check-prefixes=O1
+; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=4 -S %s 2>&1 | FileCheck %s --check-prefixes=O2
+; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=0 -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=O2
+; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=4 -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=O2_EXTRA
 
 ; REQUIRES: asserts
 
@@ -32,14 +33,20 @@
 ; O2_EXTRA: Running pass: EarlyCSEPass
 ; O2_EXTRA: Running pass: VectorCombinePass
 
-define i64 @f(i1 %cond) {
+define i64 @f(i1 %cond, i32* %src, i32* %dst) {
 entry:
   br label %loop
 
 loop:
   %i = phi i64 [ 0, %entry ], [ %inc, %loop ]
-  %inc = add i64 %i, 1
-  br i1 %cond, label %loop, label %exit
+  %src.i = getelementptr i32, i32* %src, i64 %i
+  %src.v = load i32, i32* %src.i
+  %add = add i32 %src.v, 10
+  %dst.i = getelementptr i32, i32* %dst, i64 %i
+  store i32 %add, i32* %dst.i
+  %inc = add nuw nsw i64 %i, 1
+  %ec = icmp ne i64 %inc, 1000
+  br i1 %ec, label %loop, label %exit
 
 exit:
   ret i64 %i