Index: include/llvm/IR/ModuleSummaryIndex.h =================================================================== --- include/llvm/IR/ModuleSummaryIndex.h +++ include/llvm/IR/ModuleSummaryIndex.h @@ -562,6 +562,8 @@ /// Return the list of pairs. ArrayRef calls() const { return CallGraphEdgeList; } + void addCall(EdgeTy E) { CallGraphEdgeList.push_back(E); } + /// Returns the list of type identifiers used by this function in /// llvm.type.test intrinsics other than by an llvm.assume intrinsic, /// represented as GUIDs. Index: include/llvm/Transforms/IPO/WholeProgramDevirt.h =================================================================== --- include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -229,6 +229,8 @@ PreservedAnalyses run(Module &M, ModuleAnalysisManager &); }; +bool runWholeProgramDevirtOnIndex(ModuleSummaryIndex *Summary); + } // end namespace llvm #endif // LLVM_TRANSFORMS_IPO_WHOLEPROGRAMDEVIRT_H Index: lib/LTO/LTO.cpp =================================================================== --- lib/LTO/LTO.cpp +++ lib/LTO/LTO.cpp @@ -42,6 +42,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/Utils/SplitModule.h" #include @@ -1177,6 +1178,11 @@ if (DumpThinCGSCCs) ThinLTO.CombinedIndex.dumpSCCs(outs()); + // Perform index-based WPD. This will return immediately if there are + // no index entries in the typeIdMetadata map (e.g. if we are instead + // performing IR-based WPD in hybrid regular/thin LTO mode). + runWholeProgramDevirtOnIndex(&ThinLTO.CombinedIndex); + if (Conf.OptLevel > 0) ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, ImportLists, ExportLists); Index: lib/Transforms/IPO/WholeProgramDevirt.cpp =================================================================== --- lib/Transforms/IPO/WholeProgramDevirt.cpp +++ lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -25,12 +25,14 @@ // returns 0, or a single vtable's function returns 1, replace each virtual // call with a comparison of the vptr against that vtable's address. // -// This pass is intended to be used during the regular and thin LTO pipelines. +// This pass is intended to be used during the regular and thin LTO pipelines: +// // During regular LTO, the pass determines the best optimization for each // virtual call and applies the resolutions directly to virtual calls that are // eligible for virtual call optimization (i.e. calls that use either of the -// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During -// ThinLTO, the pass operates in two phases: +// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). +// +// During hybrid Regular/ThinLTO, the pass operates in two phases: // - Export phase: this is run during the thin link over a single merged module // that contains all vtables with !type metadata that participate in the link. // The pass computes a resolution for each virtual call and stores it in the @@ -39,6 +41,14 @@ // modules. The pass applies the resolutions previously computed during the // import phase to each eligible virtual call. // +// During ThinLTO, the pass operates in two phases: +// - Export phase: this is run during the thin link over the index which +// contains a summary of all vtables with !type metadata that participate in +// the link. It computes a resolution for each virtual call and stores it in +// the type identifier summary. Only single implementation devirtualization +// is supported. +// - Import phase: (same as with hybrid case above). +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -118,6 +128,11 @@ cl::desc("Maximum number of call targets per " "call site to enable branch funnels")); +static cl::opt + PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden, + cl::init(false), cl::ZeroOrMore, + cl::desc("Print index-based devirtualization messages")); + // Find the minimum offset that we may store a value of size Size bits at. If // IsAfter is set, look for an offset before the object, otherwise look for an // offset after the object. @@ -243,6 +258,11 @@ uint64_t ByteOffset; }; +struct VTableSlotSummary { + StringRef TypeID; + uint64_t ByteOffset; +}; + } // end anonymous namespace namespace llvm { @@ -266,6 +286,25 @@ } }; +template <> struct DenseMapInfo { + static VTableSlotSummary getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey()}; + } + static VTableSlotSummary getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey()}; + } + static unsigned getHashValue(const VTableSlotSummary &I) { + return DenseMapInfo::getHashValue(I.TypeID) ^ + DenseMapInfo::getHashValue(I.ByteOffset); + } + static bool isEqual(const VTableSlotSummary &LHS, + const VTableSlotSummary &RHS) { + return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset; + } +}; + } // end namespace llvm namespace { @@ -343,6 +382,7 @@ /// pass the vector is non-empty, we will need to add a use of llvm.type.test /// to each of the function summaries in the vector. std::vector SummaryTypeCheckedLoadUsers; + std::vector SummaryTypeTestAssumeUsers; bool isExported() const { return SummaryHasTypeTestAssumeUsers || @@ -359,6 +399,11 @@ AllCallSitesDevirted = false; } + void addSummaryTypeTestAssumeUser(FunctionSummary *FS) { + SummaryTypeTestAssumeUsers.push_back(FS); + markSummaryHasTypeTestAssumeUsers(); + } + void markDevirt() { AllCallSitesDevirted = true; @@ -543,6 +588,26 @@ function_ref LookupDomTree); }; +struct DevirtIndex { + ModuleSummaryIndex *ExportSummary; + + MapVector CallSlots; + + DevirtIndex(ModuleSummaryIndex *ExportSummary) + : ExportSummary(ExportSummary) {} + + bool tryFindVirtualCallTargets(std::vector &TargetsForSlot, + const TypeIdGVInfo TypeIdGVInfo, + uint64_t ByteOffset); + + bool trySingleImplDevirt(MutableArrayRef TargetsForSlot, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set &DevirtTargets); + + bool run(); +}; + struct WholeProgramDevirt : public ModulePass { static char ID; @@ -633,6 +698,12 @@ return PreservedAnalyses::none(); } +namespace llvm { +bool runWholeProgramDevirtOnIndex(ModuleSummaryIndex *Summary) { + return DevirtIndex(Summary).run(); +} +} // end namespace llvm + bool DevirtModule::runForTesting( Module &M, function_ref AARGetter, function_ref OREGetter, @@ -767,6 +838,28 @@ return !TargetsForSlot.empty(); } +bool DevirtIndex::tryFindVirtualCallTargets( + std::vector &TargetsForSlot, const TypeIdGVInfo TypeIdGVInfo, + uint64_t ByteOffset) { + for (const TypeIdOffsetGVPair P : TypeIdGVInfo) { + // VTable initializer should have only one summary (i.e. not be + // linkonce/weak). + assert(P.second.getSummaryList().size() == 1); + const auto *VS = cast(P.second.getSummaryList()[0].get()); + if (!P.second.getSummaryList()[0]->isLive()) + continue; + for (auto VTP : VS->vTableFuncs()) { + if (VTP.second != P.first + ByteOffset) + continue; + + TargetsForSlot.push_back(VTP.first); + } + } + + // Give up if we couldn't find any targets. + return !TargetsForSlot.empty(); +} + void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported) { auto Apply = [&](CallSiteInfo &CSInfo) { @@ -838,6 +931,46 @@ return true; } +bool DevirtIndex::trySingleImplDevirt(MutableArrayRef TargetsForSlot, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set &DevirtTargets) { + // See if the program contains a single implementation of this virtual + // function. + auto TheFn = TargetsForSlot[0]; + for (auto &&Target : TargetsForSlot) + if (TheFn != Target) + return false; + + // Collect functions devirtualized at least for one call site for stats. + if (PrintSummaryDevirt) + DevirtTargets.insert(TheFn); + + // Record in summary for use in devirtualization during the ThinLTO import + // step. + Res->TheKind = WholeProgramDevirtResolution::SingleImpl; + Res->SingleImplName = TheFn.name(); + // Name will be empty if this thin link driven off of serialized combined + // index (e.g. llvm-lto). However, WPD is not supported/invoked for the + // legacy LTO API anyway. + assert(!Res->SingleImplName.empty()); + + // FIXME: Annotate type tests with hotness. For now, mark these as hot + // to better ensure we have the opportunity to inline them. + CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0); + auto AddCalls = [&](CallSiteInfo &CSInfo) { + for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) + FS->addCall({TheFn, CI}); + for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) + FS->addCall({TheFn, CI}); + }; + AddCalls(SlotInfo.CSInfo); + for (auto &P : SlotInfo.ConstCSInfo) + AddCalls(P.second); + + return true; +} + void DevirtModule::tryICallBranchFunnel( MutableArrayRef TargetsForSlot, VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res, VTableSlot Slot) { @@ -1480,8 +1613,11 @@ } void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { + auto *TypeId = dyn_cast(Slot.TypeID); + if (!TypeId) + return; const TypeIdSummary *TidSummary = - ImportSummary->getTypeIdSummary(cast(Slot.TypeID)->getString()); + ImportSummary->getTypeIdSummary(TypeId->getString()); if (!TidSummary) return; auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset); @@ -1490,6 +1626,7 @@ const WholeProgramDevirtResolution &Res = ResI->second; if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) { + assert(!Res.SingleImplName.empty()); // The type of the function in the declaration is irrelevant because every // call site will cast it to the correct type. auto *SingleImpl = M.getOrInsertFunction( @@ -1693,7 +1830,7 @@ using namespace ore; OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) << "devirtualized " - << NV("FunctionName", F->getName())); + << NV("FunctionName", DT.first)); } } @@ -1707,3 +1844,77 @@ return true; } + +bool DevirtIndex::run() { + if (ExportSummary->typeIdMetadataMap().empty()) + return true; + + DenseMap> NameByGUID; + for (auto &P : ExportSummary->typeIdMetadataMap()) { + NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first); + } + + // Collect information from summary about which calls to try to devirtualize. + for (auto &P : *ExportSummary) { + for (auto &S : P.second.SummaryList) { + auto *FS = dyn_cast(S.get()); + if (!FS) + continue; + // FIXME: Only add live functions. + for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS); + } + } + for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_test_assume_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeTestAssumeUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_checked_load_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeCheckedLoadUser(FS); + } + } + } + } + + std::set DevirtTargets; + // For each (type, offset) pair: + for (auto &S : CallSlots) { + // Search each of the members of the type identifier for the virtual + // function implementation at offset S.first.ByteOffset, and add to + // TargetsForSlot. + std::vector TargetsForSlot; + auto *TidSummary = ExportSummary->getTypeIdMetadataSummary(S.first.TypeID); + assert(TidSummary); + if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary, + S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = + &ExportSummary->getOrInsertTypeIdSummary(S.first.TypeID) + .WPDRes[S.first.ByteOffset]; + + if (!trySingleImplDevirt(TargetsForSlot, S.second, Res, DevirtTargets)) + continue; + } + } + + // Optionally have the thin link print message for each devirtualized + // function. + if (PrintSummaryDevirt) + for (const auto &DT : DevirtTargets) + errs() << "Devirtualized call to " << DT << "\n"; + + return true; +} Index: test/ThinLTO/X86/cfi-devirt.ll =================================================================== --- test/ThinLTO/X86/cfi-devirt.ll +++ test/ThinLTO/X86/cfi-devirt.ll @@ -25,6 +25,20 @@ ; NOENABLESPLITFLAG-DAG: typeidMetadata: (name: "_ZTS1B", summary: ((offset: 16, [[B]]))) ; NOENABLESPLITFLAG-DAG: typeidMetadata: (name: "_ZTS1C", summary: ((offset: 16, [[C]]))) +; Legacy PM, Index based WPD +; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436. +; RUN: llvm-lto2 run %t2.o -save-temps -pass-remarks=. -print-after-all \ +; RUN: -verify-machineinstrs=0 \ +; RUN: -o %t3 \ +; RUN: -r=%t2.o,test,px \ +; RUN: -r=%t2.o,_ZN1A1nEi,p \ +; RUN: -r=%t2.o,_ZN1B1fEi,p \ +; RUN: -r=%t2.o,_ZN1C1fEi,p \ +; RUN: -r=%t2.o,empty,p \ +; RUN: -r=%t2.o,_ZTV1B,px \ +; RUN: -r=%t2.o,_ZTV1C,px 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR + ; Legacy PM ; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436. ; RUN: llvm-lto2 run %t.o -save-temps -pass-remarks=. \ Index: test/ThinLTO/X86/devirt.ll =================================================================== --- test/ThinLTO/X86/devirt.ll +++ test/ThinLTO/X86/devirt.ll @@ -33,7 +33,31 @@ ; Type Id on _ZTV1D should have been promoted ; NOENABLESPLITFLAG-DAG: typeidMetadata: (name: "1${{.*}}", summary: ((offset: 16, [[D]]))) -; TODO: Test index-based WPD one %t2.o once implemented. +; Legacy PM, Index based WPD +; RUN: llvm-lto2 run %t2.o -save-temps -pass-remarks=. \ +; RUN: -o %t3 \ +; RUN: -r=%t2.o,test,px \ +; RUN: -r=%t2.o,_ZN1A1nEi,p \ +; RUN: -r=%t2.o,_ZN1B1fEi,p \ +; RUN: -r=%t2.o,_ZN1C1fEi,p \ +; RUN: -r=%t2.o,_ZN1D1mEi,p \ +; RUN: -r=%t2.o,_ZTV1B,px \ +; RUN: -r=%t2.o,_ZTV1C,px \ +; RUN: -r=%t2.o,_ZTV1D,px 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR + +; New PM, Index based WPD +; RUN: llvm-lto2 run %t2.o -save-temps -use-new-pm -pass-remarks=. \ +; RUN: -o %t3 \ +; RUN: -r=%t2.o,test,px \ +; RUN: -r=%t2.o,_ZN1A1nEi,p \ +; RUN: -r=%t2.o,_ZN1B1fEi,p \ +; RUN: -r=%t2.o,_ZN1C1fEi,p \ +; RUN: -r=%t2.o,_ZN1D1mEi,p \ +; RUN: -r=%t2.o,_ZTV1B,px \ +; RUN: -r=%t2.o,_ZTV1C,px \ +; RUN: -r=%t2.o,_ZTV1D,px 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR ; Legacy PM ; RUN: llvm-lto2 run %t.o -save-temps -pass-remarks=. \ Index: test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll =================================================================== --- /dev/null +++ test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll @@ -0,0 +1,66 @@ +; REQUIRES: x86-registered-target + +; Test that index-only devirtualization handles and ignores any +; type metadata that could not be summarized (because it was internal +; and could not be promoted due to the fact that the module has +; no external symbols and therefore could not be assigned a unique +; identifier). In this case we should simply not get the type +; metadata summary entries, and no promotion will occur. + +; Generate unsplit module with summary for ThinLTO index-based WPD. +; RUN: opt -thinlto-bc -thinlto-split-lto-unit=false -o %t2.o %s + +; Check that we don't have module flag when splitting not enabled for ThinLTO, +; and that we generate summary information needed for index-based WPD. +; RUN: llvm-dis -o - %t2.o | FileCheck %s --check-prefix=DIS +; DIS-NOT: typeIdInfo +; DIS-NOT: typeidMetadata + +; Legacy PM, Index based WPD +; RUN: llvm-lto2 run %t2.o -save-temps -pass-remarks=. \ +; RUN: -o %t3 \ +; RUN: -r=%t2.o,test,plx \ +; RUN: -r=%t2.o,_ZN1D1mEi, +; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR + +; New PM, Index based WPD +; RUN: llvm-lto2 run %t2.o -save-temps -use-new-pm -pass-remarks=. \ +; RUN: -o %t3 \ +; RUN: -r=%t2.o,test,plx \ +; RUN: -r=%t2.o,_ZN1D1mEi, +; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +%struct.D = type { i32 (...)** } + +@_ZTV1D = internal constant { [3 x i8*] } { [3 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%struct.D*, i32)* @_ZN1D1mEi to i8*)] }, !type !3 + +; CHECK-IR-LABEL: define weak_odr dso_local i32 @test +define weak_odr i32 @test(%struct.D* %obj2, i32 %a) { +entry: + %0 = bitcast %struct.D* %obj2 to i8*** + %vtable2 = load i8**, i8*** %0 + %1 = bitcast i8** %vtable2 to i8* + %p2 = call i1 @llvm.type.test(i8* %1, metadata !4) + call void @llvm.assume(i1 %p2) + + %2 = bitcast i8** %vtable2 to i32 (%struct.D*, i32)** + %fptr33 = load i32 (%struct.D*, i32)*, i32 (%struct.D*, i32)** %2, align 8 + + ; Check that the call was not devirtualized. + ; CHECK-IR: %call4 = tail call i32 %fptr33 + %call4 = tail call i32 %fptr33(%struct.D* nonnull %obj2, i32 0) + ret i32 %call4 +} +; CHECK-IR-LABEL: ret i32 +; CHECK-IR-LABEL: } + +declare i1 @llvm.type.test(i8*, metadata) +declare void @llvm.assume(i1) + +declare i32 @_ZN1D1mEi(%struct.D* %this, i32 %a) + +!3 = !{i64 16, !4} +!4 = distinct !{}