Index: include/llvm/Analysis/CGSCCPassManager.h =================================================================== --- include/llvm/Analysis/CGSCCPassManager.h +++ include/llvm/Analysis/CGSCCPassManager.h @@ -91,7 +91,10 @@ #include "llvm/ADT/PriorityWorklist.h" #include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ValueHandle.h" namespace llvm { @@ -607,6 +610,199 @@ return CGSCCToFunctionPassAdaptor(std::move(Pass), DebugLogging); } + +/// A helper that repeats an SCC pass each time an indirect call is refined to +/// a direct call by that pass. +/// +/// While the CGSCC pass manager works to re-visit SCCs and RefSCCs as they +/// change shape, we may also want to repeat an SCC pass if it simply refines +/// an indirect call to a direct call, even if doing so does not alter the +/// shape of the graph. Note that this only pertains to direct calls to +/// functions where IPO across the SCC may be able to compute more precise +/// results. For intrinsics, we assume scalar optimizations already can fully +/// reason about them. +/// +/// This repeatition has the potential to be very large however, as each one +/// might refine a single call site. As a consequence, in practice we use an +/// upper bound on the number of repetitions to limit things. +template +class DevirtSCCRepeatedPass + : public PassInfoMixin> { +public: + explicit DevirtSCCRepeatedPass(PassT Pass, int MaxRepetitions, + bool DebugLogging = false) + : Pass(std::move(Pass)), MaxRepetitions(MaxRepetitions), + DebugLogging(DebugLogging) {} + // We have to explicitly define all the special member functions because MSVC + // refuses to generate them. + DevirtSCCRepeatedPass(const DevirtSCCRepeatedPass &Arg) + : Pass(Arg.Pass), MaxRepetitions(Arg.MaxRepetitions), + DebugLogging(Arg.DebugLogging) {} + DevirtSCCRepeatedPass(DevirtSCCRepeatedPass &&Arg) + : Pass(std::move(Arg.Pass)), MaxRepetitions(Arg.MaxRepetitions), + DebugLogging(Arg.DebugLogging) {} + friend void swap(DevirtSCCRepeatedPass &LHS, + DevirtSCCRepeatedPass &RHS) { + using std::swap; + swap(LHS.Pass, RHS.Pass); + swap(LHS.MaxRepetitions, RHS.MaxRepetitions); + swap(LHS.DebugLogging, RHS.DebugLogging); + } + DevirtSCCRepeatedPass &operator=(DevirtSCCRepeatedPass RHS) { + swap(*this, RHS); + return *this; + } + + /// Runs the wrapped pass up to \c MaxRepetitions on the SCC, iterating + /// whenever an indirect call is refined. + PreservedAnalyses run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, + LazyCallGraph &CG, CGSCCUpdateResult &UR) { + PreservedAnalyses PA = PreservedAnalyses::all(); + + // The SCC may be refined while we are running passes over it, so set up + // a pointer that we can update. + LazyCallGraph::SCC *C = &InitialC; + + // Collect value handles for all of the indirect call sites. + SmallVector CallHandles; + + // Struct to track the counts of direct and indirect calls in each function + // of the SCC. + struct CallCount { + int Direct; + int Indirect; + }; + + // Put value handles on all of the indirect calls and return the number of + // direct calls for each function in the SCC. + auto ScanSCC = [](LazyCallGraph::SCC &C, + SmallVectorImpl &CallHandles) { + assert(CallHandles.empty() && "Must start with a clear set of handles."); + + SmallVector CallCounts; + for (LazyCallGraph::Node &N : C) { + CallCounts.push_back({0, 0}); + CallCount &Count = CallCounts.back(); + for (Instruction &I : instructions(N.getFunction())) + if (auto CS = CallSite(&I)) { + if (CS.getCalledFunction()) { + ++Count.Direct; + } else { + ++Count.Indirect; + CallHandles.push_back(WeakVH(&I)); + } + } + } + + return CallCounts; + }; + + // Populate the initial call handles and get the initial call counts. + auto CallCounts = ScanSCC(*C, CallHandles); + + for (int i = 0;; ++i) { + PreservedAnalyses PassPA = Pass.run(*C, AM, CG, UR); + + // If the SCC structure has changed, bail immediately and let the outer + // CGSCC layer handle any iteration to reflect the refined structure. + if (UR.UpdatedC && UR.UpdatedC != C) { + PA.intersect(std::move(PassPA)); + break; + } + + // Check that we didn't miss any update scenario. + assert(!UR.InvalidatedSCCs.count(C) && "Processing an invalid SCC!"); + assert(C->begin() != C->end() && "Cannot have an empty SCC!"); + assert((int)CallCounts.size() == C->size() && + "Cannot have changed the size of the SCC!"); + + // Check whether any of the handles were devirtualized. + auto IsDevirtualizedHandle = [](WeakVH &CallH) { + if (!CallH) + return false; + auto CS = CallSite(CallH); + if (!CS) + return false; + + // If the call is still indirect, leave it alone. + Function *F = CS.getCalledFunction(); + if (!F) + return false; + + // We now have a direct call where previously we had an indirect call, + // so iterate to process this devirtualization site. + return true; + }; + bool Devirt = any_of(CallHandles, IsDevirtualizedHandle); + + // Rescan to build up a new set of handles and count how many direct + // calls remain. If we decide to iterate, this also sets up the input to + // the next iteration. + CallHandles.clear(); + auto NewCallCounts = ScanSCC(*C, CallHandles); + + // If we haven't found an explicit devirtualization already see if we + // have decreased the number of indirect calls and increased the number + // of direct calls for any function in the SCC. This can be fooled by all + // manner of transformations such as DCE and other things, but seems to + // work well in practice. + if (!Devirt) + for (int i = 0, Size = C->size(); i < Size; ++i) + if (CallCounts[i].Indirect > NewCallCounts[i].Indirect && + CallCounts[i].Direct < NewCallCounts[i].Direct) { + Devirt = true; + break; + } + + if (!Devirt) { + PA.intersect(std::move(PassPA)); + break; + } + + // Otherwise, if we've already hit our max, we're done. + if (i >= MaxRepetitions) { + if (DebugLogging) + dbgs() << "Found another devirtualization after hitting the max " + "number of repetitions (" + << MaxRepetitions << ") on SCC: " << *C << "\n"; + PA.intersect(std::move(PassPA)); + break; + } + + if (DebugLogging) + dbgs() << "Repeating an SCC pass after finding a devirtualization in: " + << *C << "\n"; + + // Move over the new call counts in preparation for iterating. + CallCounts = std::move(NewCallCounts); + + // Update the analysis manager with each run and intersect the total set + // of preserved analyses so we're ready to iterate. + AM.invalidate(*C, PassPA); + PA.intersect(std::move(PassPA)); + } + + // Note that we don't add any preserved entries here unlike a more normal + // "pass manager" because we only handle invalidation *between* iterations, + // not after the last iteration. + return PA; + } + +private: + PassT Pass; + int MaxRepetitions; + bool DebugLogging; +}; + +/// \brief A function to deduce a function pass type and wrap it in the +/// templated adaptor. +template +DevirtSCCRepeatedPass +createDevirtSCCRepeatedPass(PassT Pass, int MaxRepetitions, + bool DebugLogging = false) { + return DevirtSCCRepeatedPass(std::move(Pass), MaxRepetitions, + DebugLogging); +} } #endif Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -582,6 +582,15 @@ return Count; } +static Optional parseDevirtPassName(StringRef Name) { + if (!Name.consume_front("devirt<") || !Name.consume_back(">")) + return None; + int Count; + if (Name.getAsInteger(0, Count) || Count <= 0) + return None; + return Count; +} + static bool isModulePassName(StringRef Name) { // Manually handle aliases for pre-configured pipeline fragments. if (Name.startswith("default") || Name.startswith("lto")) @@ -620,6 +629,8 @@ // Explicitly handle custom-parsed pass names. if (parseRepeatPassName(Name)) return true; + if (parseDevirtPassName(Name)) + return true; #define CGSCC_PASS(NAME, CREATE_PASS) \ if (Name == NAME) \ @@ -863,6 +874,15 @@ CGPM.addPass(createRepeatedPass(*Count, std::move(NestedCGPM))); return true; } + if (auto MaxRepetitions = parseDevirtPassName(Name)) { + CGSCCPassManager NestedCGPM(DebugLogging); + if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; + CGPM.addPass(createDevirtSCCRepeatedPass(std::move(NestedCGPM), + *MaxRepetitions, DebugLogging)); + return true; + } // Normal passes can't have pipelines. return false; } Index: test/Other/cgscc-devirt-iteration.ll =================================================================== --- /dev/null +++ test/Other/cgscc-devirt-iteration.ll @@ -0,0 +1,107 @@ +; The CGSCC pass manager includes an SCC iteration utility that tracks indirect +; calls that are turned into direct calls (devirtualization) and re-visits the +; SCC to expose those calls to the SCC-based IPO passes. We trigger +; devirtualization here with GVN which forwards a store through a load and to +; an indirect call. +; +; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(function-attrs,function(gvn,instcombine))' -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=BEFORE +; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(devirt<1>(function-attrs,function(gvn,instcombine)))' -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AFTER --check-prefix=AFTER1 +; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(devirt<2>(function-attrs,function(gvn,instcombine)))' -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AFTER --check-prefix=AFTER2 + +declare void @readnone() readnone +; CHECK: Function Attrs: readnone +; CHECK: declare void @readnone() + +declare void @unknown() +; CHECK-NOT: Function Attrs +; CHECK: declare void @unknown() + +; The @test1 function checks that when we refine an indirect call to a direct +; call we revisit the SCC passes to reflect the more precise information. This +; is the basic functionality. + +define void @test1() { +; BEFORE-NOT: Function Attrs +; AFTER: Function Attrs: readnone +; CHECK: define void @test1() +entry: + %fptr = alloca void ()* + store void ()* @readnone, void ()** %fptr + %f = load void ()*, void ()** %fptr + call void %f() + ret void +} + +; The @test2_* functions check that when we need multiple (in this case 2) +; repeatitions to compute some state that is incrementally exposed with each +; one, the limit on repeatitions is enforced. So we make progress with +; one repeatition but not as much as with three. +; +; This is somewhat awkward to test because we have to contrive to have a state +; repeatition triggered and observed with very few passes. The technique here +; is to have one indirect call that can only be resolved when the entire SCC is +; deduced as readonce, and mark that indirect call at the call site as readonce +; to make that possible. + +declare void @readnone_with_arg(void ()**) readnone +; CHECK: Function Attrs: readnone +; CHECK: declare void @readnone_with_arg(void ()**) + +define void @test2_a(void ()** %ignore) { +; BEFORE-NOT: Function Attrs +; AFTER1: Function Attrs: readonly +; AFTER2: Function Attrs: readnone +; BEFORE: define void @test2_a(void ()** %ignore) +; AFTER: define void @test2_a(void ()** readnone %ignore) +entry: + %f1ptr = alloca void (void ()**)* + store void (void ()**)* @readnone_with_arg, void (void ()**)** %f1ptr + %f1 = load void (void ()**)*, void (void ()**)** %f1ptr + call void %f1(void ()** %ignore) +; CHECK: call void @readnone_with_arg(void ()** %ignore) + + ; Bogus call to test2_b to make this a cycle. + call void @test2_b() + + ret void +} + +define void @test2_b() { +; BEFORE-NOT: Function Attrs +; AFTER1: Function Attrs: readonly +; AFTER2: Function Attrs: readnone +; CHECK: define void @test2_b() +entry: + %f2ptr = alloca void ()* + store void ()* @readnone, void ()** %f2ptr + ; Call the other function here to prevent forwarding until the SCC has had + ; function attrs deduced. + call void @test2_a(void ()** %f2ptr) + + %f2 = load void ()*, void ()** %f2ptr + call void %f2() readonly +; BEFORE: call void %f2() +; AFTER: call void @readnone() + + ret void +} + +declare i8* @memcpy(i8*, i8*, i64) +; CHECK-NOT: Function Attrs +; CHECK: declare i8* @memcpy(i8*, i8*, i64) + +; The @test3 function checks that when we refine an indirect call to an +; intrinsic we still revisit the SCC pass. This also covers cases where the +; value handle itself doesn't persist due to the nature of how instcombine +; creates the memcpy intrinsic call, and we rely on the count of indirect calls +; decreasing and the count of direct calls increasing. +define void @test3(i8* %src, i8* %dest, i64 %size) { +; CHECK-NOT: Function Attrs +; BEFORE: define void @test3(i8* %src, i8* %dest, i64 %size) +; AFTER: define void @test3(i8* nocapture readonly %src, i8* nocapture %dest, i64 %size) + %fptr = alloca i8* (i8*, i8*, i64)* + store i8* (i8*, i8*, i64)* @memcpy, i8* (i8*, i8*, i64)** %fptr + %f = load i8* (i8*, i8*, i64)*, i8* (i8*, i8*, i64)** %fptr + call i8* %f(i8* %dest, i8* %src, i64 %size) + ret void +} Index: test/Other/cgscc-observe-devirt.ll =================================================================== --- test/Other/cgscc-observe-devirt.ll +++ test/Other/cgscc-observe-devirt.ll @@ -1,130 +1,99 @@ +; Make sure that even without some external devirtualization iteration tool, +; the CGSCC pass manager correctly observes and re-visits SCCs that change +; structure due to devirtualization. We trigger devirtualization here with GVN +; which forwards a store through a load and to an indirect call. +; ; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(function-attrs)' -S < %s | FileCheck %s --check-prefix=BEFORE ; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(function-attrs,function(gvn))' -S < %s | FileCheck %s --check-prefix=AFTER -; -; Also check that adding an extra CGSCC pass after the function update but -; without requiring the outer manager to iterate doesn't break any invariant. -; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(function-attrs,function(gvn),function-attrs)' -S < %s | FileCheck %s --check-prefix=AFTER2 declare void @readnone() readnone declare void @unknown() -; The @test1_* functions check that when we refine an indirect call to a direct -; call, even if it doesn't change the call graph structure, we revisit the SCC -; passes to reflect the more precise information. -; FIXME: Currently, this isn't implemented in the new pass manager and so we -; only get this with AFTER2, not with AFTER. - -; BEFORE: define void @test1_a() { -; AFTER: define void @test1_a() { -; AFTER2: define void @test1_a() { -define void @test1_a() { - %fptr = alloca void()* - store void()* @unknown, void()** %fptr - %f = load void()*, void()** %fptr - call void %f() - ret void -} - -; BEFORE: define void @test1_b() { -; AFTER: define void @test1_b() { -; AFTER2: define void @test1_b() #0 { -define void @test1_b() { - %fptr = alloca void()* - store void()* @readnone, void()** %fptr - %f = load void()*, void()** %fptr - call void %f() - ret void -} - -; The @test2_* checks that if we refine an indirect call to a direct call and +; The @test1_* checks that if we refine an indirect call to a direct call and ; in the process change the very structure of the call graph we also revisit ; that component of the graph and do so in an up-to-date fashion. -; BEFORE: define void @test2_a1() { -; AFTER: define void @test2_a1() { -; AFTER2: define void @test2_a1() { -define void @test2_a1() { +; BEFORE: define void @test1_a1() { +; AFTER: define void @test1_a1() { +define void @test1_a1() { %fptr = alloca void()* - store void()* @test2_b2, void()** %fptr - store void()* @test2_b1, void()** %fptr + store void()* @test1_b2, void()** %fptr + store void()* @test1_b1, void()** %fptr %f = load void()*, void()** %fptr call void %f() ret void } -; BEFORE: define void @test2_b1() { -; AFTER: define void @test2_b1() { -; AFTER2: define void @test2_b1() { -define void @test2_b1() { +; BEFORE: define void @test1_b1() { +; AFTER: define void @test1_b1() { +define void @test1_b1() { call void @unknown() - call void @test2_a1() + call void @test1_a1() ret void } -; BEFORE: define void @test2_a2() { -; AFTER: define void @test2_a2() #0 { -; AFTER2: define void @test2_a2() #0 { -define void @test2_a2() { +; BEFORE: define void @test1_a2() { +; AFTER: define void @test1_a2() #0 { +define void @test1_a2() { %fptr = alloca void()* - store void()* @test2_b1, void()** %fptr - store void()* @test2_b2, void()** %fptr + store void()* @test1_b1, void()** %fptr + store void()* @test1_b2, void()** %fptr %f = load void()*, void()** %fptr call void %f() ret void } -; BEFORE: define void @test2_b2() { -; AFTER: define void @test2_b2() #0 { -; AFTER2: define void @test2_b2() #0 { -define void @test2_b2() { +; BEFORE: define void @test1_b2() { +; AFTER: define void @test1_b2() #0 { +define void @test1_b2() { call void @readnone() - call void @test2_a2() + call void @test1_a2() ret void } -; The @test3_* set of functions exercise a case where running function passes +; The @test2_* set of functions exercise a case where running function passes ; introduces a new post-order relationship that was not present originally and ; makes sure we walk across the SCCs in that order. -; CHECK: define void @test3_a() { -define void @test3_a() { - call void @test3_b1() - call void @test3_b2() - call void @test3_b3() +; CHECK: define void @test2_a() { +define void @test2_a() { + call void @test2_b1() + call void @test2_b2() + call void @test2_b3() call void @unknown() ret void } -; CHECK: define void @test3_b1() #0 { -define void @test3_b1() { +; CHECK: define void @test2_b1() #0 { +define void @test2_b1() { %fptr = alloca void()* - store void()* @test3_a, void()** %fptr + store void()* @test2_a, void()** %fptr store void()* @readnone, void()** %fptr %f = load void()*, void()** %fptr call void %f() ret void } -; CHECK: define void @test3_b2() #0 { -define void @test3_b2() { +; CHECK: define void @test2_b2() #0 { +define void @test2_b2() { %fptr = alloca void()* - store void()* @test3_a, void()** %fptr - store void()* @test3_b2, void()** %fptr - store void()* @test3_b3, void()** %fptr - store void()* @test3_b1, void()** %fptr + store void()* @test2_a, void()** %fptr + store void()* @test2_b2, void()** %fptr + store void()* @test2_b3, void()** %fptr + store void()* @test2_b1, void()** %fptr %f = load void()*, void()** %fptr call void %f() ret void } -; CHECK: define void @test3_b3() #0 { -define void @test3_b3() { +; CHECK: define void @test2_b3() #0 { +define void @test2_b3() { %fptr = alloca void()* - store void()* @test3_a, void()** %fptr - store void()* @test3_b2, void()** %fptr - store void()* @test3_b3, void()** %fptr - store void()* @test3_b1, void()** %fptr + store void()* @test2_a, void()** %fptr + store void()* @test2_b2, void()** %fptr + store void()* @test2_b3, void()** %fptr + store void()* @test2_b1, void()** %fptr %f = load void()*, void()** %fptr call void %f() ret void Index: test/Transforms/Inline/devirtualize-2.ll =================================================================== --- test/Transforms/Inline/devirtualize-2.ll +++ test/Transforms/Inline/devirtualize-2.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -inline -S | FileCheck %s +; RUN: opt < %s -passes='cgscc(devirt<4>(inline))' -S | FileCheck %s ; PR4834 define i32 @test1() {