Index: llvm/trunk/include/llvm/Analysis/TypeMetadataUtils.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TypeMetadataUtils.h
+++ llvm/trunk/include/llvm/Analysis/TypeMetadataUtils.h
@@ -20,6 +20,8 @@
 
 namespace llvm {
 
+class DominatorTree;
+
 /// The type of CFI jumptable needed for a function.
 enum CfiFunctionLinkage {
   CFL_Definition = 0,
@@ -39,7 +41,8 @@
 /// call sites based on the call and return them in DevirtCalls.
 void findDevirtualizableCallsForTypeTest(
     SmallVectorImpl<DevirtCallSite> &DevirtCalls,
-    SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI);
+    SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI,
+    DominatorTree &DT);
 
 /// Given a call to the intrinsic \@llvm.type.checked.load, find all
 /// devirtualizable call sites based on the call and return them in DevirtCalls.
@@ -47,7 +50,7 @@
     SmallVectorImpl<DevirtCallSite> &DevirtCalls,
     SmallVectorImpl<Instruction *> &LoadedPtrs,
     SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses,
-    const CallInst *CI);
+    const CallInst *CI, DominatorTree &DT);
 }
 
 #endif
Index: llvm/trunk/lib/Analysis/ModuleSummaryAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ llvm/trunk/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -147,7 +147,8 @@
     SetVector<FunctionSummary::VFuncId> &TypeTestAssumeVCalls,
     SetVector<FunctionSummary::VFuncId> &TypeCheckedLoadVCalls,
     SetVector<FunctionSummary::ConstVCall> &TypeTestAssumeConstVCalls,
-    SetVector<FunctionSummary::ConstVCall> &TypeCheckedLoadConstVCalls) {
+    SetVector<FunctionSummary::ConstVCall> &TypeCheckedLoadConstVCalls,
+    DominatorTree &DT) {
   switch (CI->getCalledFunction()->getIntrinsicID()) {
   case Intrinsic::type_test: {
     auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
@@ -172,7 +173,7 @@
 
     SmallVector<DevirtCallSite, 4> DevirtCalls;
     SmallVector<CallInst *, 4> Assumes;
-    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
+    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
     for (auto &Call : DevirtCalls)
       addVCallToSet(Call, Guid, TypeTestAssumeVCalls,
                     TypeTestAssumeConstVCalls);
@@ -192,7 +193,7 @@
     SmallVector<Instruction *, 4> Preds;
     bool HasNonCallUses = false;
     findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
-                                               HasNonCallUses, CI);
+                                               HasNonCallUses, CI, DT);
     // Any non-call uses of the result of llvm.type.checked.load will
     // prevent us from optimizing away the llvm.type.test.
     if (HasNonCallUses)
@@ -208,11 +209,10 @@
   }
 }
 
-static void
-computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
-                       const Function &F, BlockFrequencyInfo *BFI,
-                       ProfileSummaryInfo *PSI, bool HasLocalsInUsedOrAsm,
-                       DenseSet<GlobalValue::GUID> &CantBePromoted) {
+static void computeFunctionSummary(
+    ModuleSummaryIndex &Index, const Module &M, const Function &F,
+    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, DominatorTree &DT,
+    bool HasLocalsInUsedOrAsm, DenseSet<GlobalValue::GUID> &CantBePromoted) {
   // Summary not currently supported for anonymous functions, they should
   // have been named.
   assert(F.hasName());
@@ -273,7 +273,7 @@
         if (CI && CalledFunction->isIntrinsic()) {
           addIntrinsicToSummary(
               CI, TypeTests, TypeTestAssumeVCalls, TypeCheckedLoadVCalls,
-              TypeTestAssumeConstVCalls, TypeCheckedLoadConstVCalls);
+              TypeTestAssumeConstVCalls, TypeCheckedLoadConstVCalls, DT);
           continue;
         }
         // We should have named any anonymous globals
@@ -488,18 +488,19 @@
     if (F.isDeclaration())
       continue;
 
+    DominatorTree DT(const_cast<Function &>(F));
     BlockFrequencyInfo *BFI = nullptr;
     std::unique_ptr<BlockFrequencyInfo> BFIPtr;
     if (GetBFICallback)
       BFI = GetBFICallback(F);
     else if (F.hasProfileData()) {
-      LoopInfo LI{DominatorTree(const_cast<Function &>(F))};
+      LoopInfo LI{DT};
       BranchProbabilityInfo BPI{F, LI};
       BFIPtr = llvm::make_unique<BlockFrequencyInfo>(F, BPI, LI);
       BFI = BFIPtr.get();
     }
 
-    computeFunctionSummary(Index, M, F, BFI, PSI,
+    computeFunctionSummary(Index, M, F, BFI, PSI, DT,
                            !LocalsUsed.empty() || HasLocalInlineAsmSymbol,
                            CantBePromoted);
   }
Index: llvm/trunk/lib/Analysis/TypeMetadataUtils.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TypeMetadataUtils.cpp
+++ llvm/trunk/lib/Analysis/TypeMetadataUtils.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 
@@ -22,11 +23,21 @@
 // Search for virtual calls that call FPtr and add them to DevirtCalls.
 static void
 findCallsAtConstantOffset(SmallVectorImpl<DevirtCallSite> &DevirtCalls,
-                          bool *HasNonCallUses, Value *FPtr, uint64_t Offset) {
+                          bool *HasNonCallUses, Value *FPtr, uint64_t Offset,
+                          const CallInst *CI, DominatorTree &DT) {
   for (const Use &U : FPtr->uses()) {
-    Value *User = U.getUser();
+    Instruction *User = cast<Instruction>(U.getUser());
+    // Ignore this instruction if it is not dominated by the type intrinsic
+    // being analyzed. Otherwise we may transform a call sharing the same
+    // vtable pointer incorrectly. Specifically, this situation can arise
+    // after indirect call promotion and inlining, where we may have uses
+    // of the vtable pointer guarded by a function pointer check, and a fallback
+    // indirect call.
+    if (!DT.dominates(CI, User))
+      continue;
     if (isa<BitCastInst>(User)) {
-      findCallsAtConstantOffset(DevirtCalls, HasNonCallUses, User, Offset);
+      findCallsAtConstantOffset(DevirtCalls, HasNonCallUses, User, Offset, CI,
+                                DT);
     } else if (auto CI = dyn_cast<CallInst>(User)) {
       DevirtCalls.push_back({Offset, CI});
     } else if (auto II = dyn_cast<InvokeInst>(User)) {
@@ -38,23 +49,23 @@
 }
 
 // Search for virtual calls that load from VPtr and add them to DevirtCalls.
-static void
-findLoadCallsAtConstantOffset(const Module *M,
-                              SmallVectorImpl<DevirtCallSite> &DevirtCalls,
-                              Value *VPtr, int64_t Offset) {
+static void findLoadCallsAtConstantOffset(
+    const Module *M, SmallVectorImpl<DevirtCallSite> &DevirtCalls, Value *VPtr,
+    int64_t Offset, const CallInst *CI, DominatorTree &DT) {
   for (const Use &U : VPtr->uses()) {
     Value *User = U.getUser();
     if (isa<BitCastInst>(User)) {
-      findLoadCallsAtConstantOffset(M, DevirtCalls, User, Offset);
+      findLoadCallsAtConstantOffset(M, DevirtCalls, User, Offset, CI, DT);
    } else if (isa<LoadInst>(User)) {
-      findCallsAtConstantOffset(DevirtCalls, nullptr, User, Offset);
+      findCallsAtConstantOffset(DevirtCalls, nullptr, User, Offset, CI, DT);
     } else if (auto GEP = dyn_cast<GetElementPtrInst>(User)) {
       // Take into account the GEP offset.
       if (VPtr == GEP->getPointerOperand() && GEP->hasAllConstantIndices()) {
         SmallVector<Value *, 8> Indices(GEP->op_begin() + 1, GEP->op_end());
         int64_t GEPOffset = M->getDataLayout().getIndexedOffsetInType(
             GEP->getSourceElementType(), Indices);
-        findLoadCallsAtConstantOffset(M, DevirtCalls, User, Offset + GEPOffset);
+        findLoadCallsAtConstantOffset(M, DevirtCalls, User, Offset + GEPOffset,
+                                      CI, DT);
       }
     }
   }
 }
@@ -62,7 +73,8 @@
 
 void llvm::findDevirtualizableCallsForTypeTest(
     SmallVectorImpl<DevirtCallSite> &DevirtCalls,
-    SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI) {
+    SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI,
+    DominatorTree &DT) {
   assert(CI->getCalledFunction()->getIntrinsicID() == Intrinsic::type_test);
 
   const Module *M = CI->getParent()->getParent()->getParent();
@@ -79,15 +91,15 @@
   // If we found any, search for virtual calls based on %p and add them to
   // DevirtCalls.
   if (!Assumes.empty())
-    findLoadCallsAtConstantOffset(M, DevirtCalls,
-                                  CI->getArgOperand(0)->stripPointerCasts(), 0);
+    findLoadCallsAtConstantOffset(
+        M, DevirtCalls, CI->getArgOperand(0)->stripPointerCasts(), 0, CI, DT);
 }
 
 void llvm::findDevirtualizableCallsForTypeCheckedLoad(
     SmallVectorImpl<DevirtCallSite> &DevirtCalls,
     SmallVectorImpl<Instruction *> &LoadedPtrs,
     SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses,
-    const CallInst *CI) {
+    const CallInst *CI, DominatorTree &DT) {
   assert(CI->getCalledFunction()->getIntrinsicID() ==
          Intrinsic::type_checked_load);
 
@@ -114,5 +126,5 @@
 
   for (Value *LoadedPtr : LoadedPtrs)
     findCallsAtConstantOffset(DevirtCalls, &HasNonCallUses, LoadedPtr,
-                              Offset->getZExtValue());
+                              Offset->getZExtValue(), CI, DT);
 }
Index: llvm/trunk/lib/Transforms/IPO/WholeProgramDevirt.cpp
===================================================================
--- llvm/trunk/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ llvm/trunk/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -406,6 +407,7 @@
 struct DevirtModule {
   Module &M;
   function_ref<AAResults &(Function &)> AARGetter;
+  function_ref<DominatorTree &(Function &)> LookupDomTree;
 
   ModuleSummaryIndex *ExportSummary;
   const ModuleSummaryIndex *ImportSummary;
@@ -433,10 +435,12 @@
 
   DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
                function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+               function_ref<DominatorTree &(Function &)> LookupDomTree,
                ModuleSummaryIndex *ExportSummary,
                const ModuleSummaryIndex *ImportSummary)
-      : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary),
-        ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())),
+      : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree),
+        ExportSummary(ExportSummary), ImportSummary(ImportSummary),
+        Int8Ty(Type::getInt8Ty(M.getContext())),
         Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
         Int32Ty(Type::getInt32Ty(M.getContext())),
         Int64Ty(Type::getInt64Ty(M.getContext())),
@@ -533,9 +537,10 @@
 
   // Lower the module using the action and summary passed as command line
   // arguments. For testing purposes only.
-  static bool runForTesting(
-      Module &M, function_ref<AAResults &(Function &)> AARGetter,
-      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
+  static bool
+  runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+                function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+                function_ref<DominatorTree &(Function &)> LookupDomTree);
 };
 
 struct WholeProgramDevirt : public ModulePass {
@@ -572,17 +577,23 @@
       return *ORE;
     };
 
+    auto LookupDomTree = [this](Function &F) -> DominatorTree & {
+      return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+    };
+
     if (UseCommandLine)
-      return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter);
+      return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter,
+                                         LookupDomTree);
 
-    return DevirtModule(M, LegacyAARGetter(*this), OREGetter, ExportSummary,
-                        ImportSummary)
+    return DevirtModule(M, LegacyAARGetter(*this), OREGetter, LookupDomTree,
+                        ExportSummary, ImportSummary)
         .run();
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
   }
 };
 
@@ -592,6 +603,7 @@
                       "Whole program devirtualization", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
                    "Whole program devirtualization", false, false)
char WholeProgramDevirt::ID = 0;
@@ -611,7 +623,11 @@
   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
   };
-  if (!DevirtModule(M, AARGetter, OREGetter, ExportSummary, ImportSummary)
+  auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
+    return FAM.getResult<DominatorTreeAnalysis>(F);
+  };
+  if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary,
+                    ImportSummary)
           .run())
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
@@ -619,7 +635,8 @@
 
 bool DevirtModule::runForTesting(
     Module &M, function_ref<AAResults &(Function &)> AARGetter,
-    function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
+    function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+    function_ref<DominatorTree &(Function &)> LookupDomTree) {
   ModuleSummaryIndex Summary(/*HaveGVs=*/false);
 
   // Handle the command-line summary arguments. This code is for testing
@@ -637,7 +654,7 @@
 
   bool Changed =
       DevirtModule(
-          M, AARGetter, OREGetter,
+          M, AARGetter, OREGetter, LookupDomTree,
          ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
          ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
          .run();
@@ -1342,7 +1359,7 @@
   // points to a member of the type identifier %md. Group calls by (type ID,
   // offset) pair (effectively the identity of the virtual function) and store
   // to CallSlots.
-  DenseSet<Value *> SeenPtrs;
+  DenseSet<const CallInst *> SeenCallSites;
   for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
        I != E;) {
     auto CI = dyn_cast<CallInst>(I->getUser());
@@ -1353,19 +1370,22 @@
 
     // Search for virtual calls based on %p and add them to DevirtCalls.
     SmallVector<DevirtCallSite, 1> DevirtCalls;
     SmallVector<CallInst *, 1> Assumes;
-    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
+    auto &DT = LookupDomTree(*CI->getFunction());
+    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
 
-    // If we found any, add them to CallSlots. Only do this if we haven't seen
-    // the vtable pointer before, as it may have been CSE'd with pointers from
-    // other call sites, and we don't want to process call sites multiple times.
+    // If we found any, add them to CallSlots.
     if (!Assumes.empty()) {
       Metadata *TypeId =
           cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
       Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
-      if (SeenPtrs.insert(Ptr).second) {
-        for (DevirtCallSite Call : DevirtCalls) {
+      for (DevirtCallSite Call : DevirtCalls) {
+        // Only add this CallSite if we haven't seen it before. The vtable
+        // pointer may have been CSE'd with pointers from other call sites,
+        // and we don't want to process call sites multiple times. We can't
+        // just skip the vtable Ptr if it has been seen before, however, since
+        // it may be shared by type tests that dominate different calls.
+        if (SeenCallSites.insert(Call.CS).second)
           CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS, nullptr);
-        }
       }
     }
@@ -1399,8 +1419,9 @@
   SmallVector<Instruction *, 1> LoadedPtrs;
   SmallVector<Instruction *, 1> Preds;
   bool HasNonCallUses = false;
+  auto &DT = LookupDomTree(*CI->getFunction());
   findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
-                                             HasNonCallUses, CI);
+                                             HasNonCallUses, CI, DT);
 
   // Start by generating "pessimistic" code that explicitly loads the function
   // pointer from the vtable and performs the type check. If possible, we will
Index: llvm/trunk/test/ThinLTO/X86/devirt-after-icp.ll
===================================================================
--- llvm/trunk/test/ThinLTO/X86/devirt-after-icp.ll
+++ llvm/trunk/test/ThinLTO/X86/devirt-after-icp.ll
@@ -0,0 +1,148 @@
+; REQUIRES: x86-registered-target
+
+; Test devirtualization through the thin link and backend, ensuring that
+; it is only applied when the type test corresponding to a devirtualization
+; dominates an indirect call using the same vtable pointer. Indirect
+; call promotion and inlining may introduce a guarded indirect call
+; that can be promoted, which uses the same vtable address as the fallback
+; indirect call that cannot be devirtualized.
+
+; The code below illustrates the structure when we started with code like:
+;
+; class A {
+; public:
+;   virtual int foo() { return 1; }
+;   virtual int bar() { return 1; }
+; };
+; class B : public A {
+; public:
+;   virtual int foo();
+;   virtual int bar();
+; };
+;
+; int foo(A *a) {
+;   return a->foo(); // ICP profile says most calls are to B::foo()
+; }
+;
+; int B::foo() {
+;   return bar();
+; }
+;
+; After the compile step, which will perform ICP and a round of inlining, we
+; have something like:
+; int foo(A *a) {
+;   if (&a->foo() == B::foo())
+;     return ((B*)a)->bar(); // Inlined from promoted direct call to B::foo()
+;   else
+;     return a->foo();
+;
+; The inlined code sequence will have a type test against "_ZTS1B",
+; which will allow us to devirtualize indirect call ((B*)a)->bar() to B::bar();
+; Both that type test and the one for the fallback a->foo() indirect call
+; will use the same vtable pointer. Without a dominance check, we could
+; incorrectly devirtualize a->foo() to B::foo();
+
+; RUN: opt -thinlto-bc -o %t.o %s
+
+; Legacy PM
+; RUN: llvm-lto2 run %t.o -save-temps -pass-remarks=. \
+; RUN:   -o %t3 \
+; RUN:   -r=%t.o,_Z3bazP1A,px \
+; RUN:   -r=%t.o,_ZN1A3fooEv, \
+; RUN:   -r=%t.o,_ZN1A3barEv, \
+; RUN:   -r=%t.o,_ZN1B3fooEv, \
+; RUN:   -r=%t.o,_ZN1B3barEv, \
+; RUN:   -r=%t.o,_ZTV1A, \
+; RUN:   -r=%t.o,_ZTV1B, \
+; RUN:   -r=%t.o,_ZN1A3fooEv, \
+; RUN:   -r=%t.o,_ZN1A3barEv, \
+; RUN:   -r=%t.o,_ZN1B3fooEv, \
+; RUN:   -r=%t.o,_ZN1B3barEv, \
+; RUN:   -r=%t.o,_ZTV1A,px \
+; RUN:   -r=%t.o,_ZTV1B,px 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
+
+; New PM
+; RUN: llvm-lto2 run %t.o -save-temps -use-new-pm -pass-remarks=. \
+; RUN:   -o %t3 \
+; RUN:   -r=%t.o,_Z3bazP1A,px \
+; RUN:   -r=%t.o,_ZN1A3fooEv, \
+; RUN:   -r=%t.o,_ZN1A3barEv, \
+; RUN:   -r=%t.o,_ZN1B3fooEv, \
+; RUN:   -r=%t.o,_ZN1B3barEv, \
+; RUN:   -r=%t.o,_ZTV1A, \
+; RUN:   -r=%t.o,_ZTV1B, \
+; RUN:   -r=%t.o,_ZN1A3fooEv, \
+; RUN:   -r=%t.o,_ZN1A3barEv, \
+; RUN:   -r=%t.o,_ZN1B3fooEv, \
+; RUN:   -r=%t.o,_ZN1B3barEv, \
+; RUN:   -r=%t.o,_ZTV1A,px \
+; RUN:   -r=%t.o,_ZTV1B,px 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
+
+; We should only devirtualize the inlined call to bar().
+; REMARK-NOT: single-impl: devirtualized a call to _ZN1B3fooEv
+; REMARK: single-impl: devirtualized a call to _ZN1B3barEv
+; REMARK-NOT: single-impl: devirtualized a call to _ZN1B3fooEv
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+%class.A = type { i32 (...)** }
+%class.B = type { %class.A }
+
+@_ZTV1A = linkonce_odr hidden unnamed_addr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%class.A*)* @_ZN1A3fooEv to i8*), i8* bitcast (i32 (%class.A*)* @_ZN1A3barEv to i8*)] }, align 8, !type !0
+@_ZTV1B = hidden unnamed_addr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%class.B*)* @_ZN1B3fooEv to i8*), i8* bitcast (i32 (%class.B*)* @_ZN1B3barEv to i8*)] }, align 8, !type !0, !type !1
+
+define hidden i32 @_Z3bazP1A(%class.A* %a) local_unnamed_addr {
+entry:
+  %0 = bitcast %class.A* %a to i32 (%class.A*)***
+  %vtable = load i32 (%class.A*)**, i32 (%class.A*)*** %0, align 8
+  %1 = bitcast i32 (%class.A*)** %vtable to i8*
+  %2 = tail call i1 @llvm.type.test(i8* %1, metadata !"_ZTS1A")
+  tail call void @llvm.assume(i1 %2)
+  %3 = load i32 (%class.A*)*, i32 (%class.A*)** %vtable, align 8
+  ; This is the compare instruction inserted by ICP
+  %4 = icmp eq i32 (%class.A*)* %3, bitcast (i32 (%class.B*)* @_ZN1B3fooEv to i32 (%class.A*)*)
+  br i1 %4, label %if.true.direct_targ, label %if.false.orig_indirect
+
+; This block contains the promoted and inlined call to B::foo();
+; CHECK-IR: if.true.direct_targ:   ; preds = %entry
+if.true.direct_targ:                              ; preds = %entry
+  %5 = bitcast %class.A* %a to %class.B*
+  %6 = bitcast i32 (%class.A*)** %vtable to i8*
+  %7 = tail call i1 @llvm.type.test(i8* %6, metadata !"_ZTS1B")
+  tail call void @llvm.assume(i1 %7)
+  %vfn.i1 = getelementptr inbounds i32 (%class.A*)*, i32 (%class.A*)** %vtable, i64 1
+  %vfn.i = bitcast i32 (%class.A*)** %vfn.i1 to i32 (%class.B*)**
+  %8 = load i32 (%class.B*)*, i32 (%class.B*)** %vfn.i, align 8
+; Call to bar() can be devirtualized to call to B::bar(), since it was
+; inlined from B::foo() after ICP introduced the guarded promotion.
+; CHECK-IR: %call.i = tail call i32 @_ZN1B3barEv(%class.B* %3)
+  %call.i = tail call i32 %8(%class.B* %5)
+  br label %if.end.icp
+
+; This block contains the fallback indirect call a->foo()
+; CHECK-IR: if.false.orig_indirect:
+if.false.orig_indirect:                           ; preds = %entry
+; Fallback indirect call to foo() cannot be devirtualized.
+; CHECK-IR: %call = tail call i32 %
+  %call = tail call i32 %3(%class.A* nonnull %a)
+  br label %if.end.icp
+
+if.end.icp:                                       ; preds = %if.false.orig_indirect, %if.true.direct_targ
+  %9 = phi i32 [ %call, %if.false.orig_indirect ], [ %call.i, %if.true.direct_targ ]
+  ret i32 %9
+}
+
+declare i1 @llvm.type.test(i8*, metadata)
+
+declare void @llvm.assume(i1)
+
+declare dso_local i32 @_ZN1B3fooEv(%class.B* %this) unnamed_addr
+declare dso_local i32 @_ZN1B3barEv(%class.B*) unnamed_addr
+declare dso_local i32 @_ZN1A3barEv(%class.A* %this) unnamed_addr
+declare dso_local i32 @_ZN1A3fooEv(%class.A* %this) unnamed_addr
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTS1B"}
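
Note (not part of the patch): below is a minimal standalone sketch of the dominance filter this change introduces in findCallsAtConstantOffset. The helper name collectDominatedCallUsers and its signature are illustrative and not LLVM API; only DominatorTree::dominates, cast/dyn_cast, and the use iteration mirror the patch. It shows how uses of a possibly CSE'd vtable or function pointer are restricted to those the type intrinsic dominates before being considered for devirtualization.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Collect only those call users of FPtr that the type intrinsic CI dominates.
// Uses that CI does not dominate (for example, the fallback indirect call left
// behind by indirect call promotion) may share the vtable pointer but are not
// proven to be of the tested type, so they are skipped.
static void collectDominatedCallUsers(Value *FPtr, const CallInst *CI,
                                      DominatorTree &DT,
                                      SmallVectorImpl<CallInst *> &Calls) {
  for (const Use &U : FPtr->uses()) {
    auto *UserInst = cast<Instruction>(U.getUser());
    // Same check the patch adds: skip uses not dominated by the type test.
    if (!DT.dominates(CI, UserInst))
      continue;
    if (auto *Call = dyn_cast<CallInst>(UserInst))
      Calls.push_back(Call);
  }
}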