diff --git a/llvm/docs/CommandGuide/llvm-profgen.rst b/llvm/docs/CommandGuide/llvm-profgen.rst --- a/llvm/docs/CommandGuide/llvm-profgen.rst +++ b/llvm/docs/CommandGuide/llvm-profgen.rst @@ -36,6 +36,12 @@ ------- :program:`llvm-profgen` supports the following options: +.. option:: --format=[text|binary|extbinary|compbinary|gcc] + + Specify the format of the generated profile. Supported are `text`, + `binary`, `extbinary`, `compbinary`, `gcc`, see `llvm-profdata` for more + descriptions of the format. + .. option:: --show-mmap-events Print mmap events. diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -246,6 +246,10 @@ return LineOffset == O.LineOffset && Discriminator == O.Discriminator; } + bool operator!=(const LineLocation &O) const { + return LineOffset != O.LineOffset || Discriminator != O.Discriminator; + } + uint32_t LineOffset; uint32_t Discriminator; }; @@ -585,6 +589,11 @@ /// Return the sample count of the first instruction of the function. /// The function can be either a standalone symbol or an inlined function. uint64_t getEntrySamples() const { + if (FunctionSamples::ProfileIsCS && getHeadSamples()) { + // For CS profile, if we already have more accurate head samples + // counted by branch sample from caller, use them as entry samples. + return getHeadSamples(); + } uint64_t Count = 0; // Use either BodySamples or CallsiteSamples which ever has the smaller // lineno. @@ -680,19 +689,28 @@ /// Return the function name. StringRef getName() const { return Name; } + /// Return function name with context. + StringRef getNameWithContext() const { + return FunctionSamples::ProfileIsCS ? Context.getNameWithContext() : Name; + } + /// Return the original function name. StringRef getFuncName() const { return getFuncName(Name); } /// Return the canonical name for a function, taking into account /// suffix elision policy attributes. static StringRef getCanonicalFnName(const Function &F) { - static const char *knownSuffixes[] = { ".llvm.", ".part." }; auto AttrName = "sample-profile-suffix-elision-policy"; auto Attr = F.getFnAttribute(AttrName).getValueAsString(); + return getCanonicalFnName(F.getName(), Attr); + } + + static StringRef getCanonicalFnName(StringRef FnName, StringRef Attr = "") { + static const char *knownSuffixes[] = { ".llvm.", ".part." }; if (Attr == "" || Attr == "all") { - return F.getName().split('.').first; + return FnName.split('.').first; } else if (Attr == "selected") { - StringRef Cand(F.getName()); + StringRef Cand(FnName); for (const auto &Suf : knownSuffixes) { StringRef Suffix(Suf); auto It = Cand.rfind(Suffix); @@ -704,11 +722,11 @@ } return Cand; } else if (Attr == "none") { - return F.getName(); + return FnName; } else { assert(false && "internal error: unknown suffix elision policy"); } - return F.getName(); + return FnName; } /// Translate \p Name into its original name. diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -276,7 +276,10 @@ /// it needs to be parsed by the SampleProfileReaderText class. 
std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { auto &OS = *OutputStream; - OS << S.getName() << ":" << S.getTotalSamples(); + if (FunctionSamples::ProfileIsCS) + OS << "[" << S.getNameWithContext() << "]:" << S.getTotalSamples(); + else + OS << S.getName() << ":" << S.getTotalSamples(); if (Indent == 0) OS << ":" << S.getHeadSamples(); OS << "\n"; diff --git a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfbin b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfbin new file mode 100755 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ start addr of bar +// 400684 -> address in main +// LBR Entry: | Source | Target +// 0x40062f/0x4005b0/P/-/-/0 | callq -132 | start addr of bar +// 0x400645/0x4005ff/P/-/-/0 | jmp -75 | movl -8(%rbp), %eax +// 0x400637/0x400645/P/-/-/0 | jmp 9 | jmp -75 +// 0x4005e9/0x400634/P/-/-/0 | (bar)retq | next addr of [callq -132 ] +// 0x4005d7/0x4005e5/P/-/-/0 | jmp 9 | movl -4(%rbp), %eax diff --git a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test @@ -0,0 +1,47 @@ +; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER +; RUN: FileCheck %s --input-file %t + +; CHECK:[main:1 @ foo]:44:0 +; CHECK: 2.2: 14 +; CHECK: 3: 15 +; CHECK: 3.2: 14 bar:14 +; CHECK: 3.4: 1 +; CHECK:[main:1 @ foo:3.2 @ bar]:14:0 +; CHECK: 1: 14 + +; CHECK-UNWINDER: Binary(inline-cs-noprobe.perfbin)'s Range Counter: +; CHECK-UNWINDER: main:1 @ foo:3.2 @ bar +; CHECK-UNWINDER: (6af, 6bb): 14 +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (670, 6ad): 1 +; CHECK-UNWINDER: (67e, 69b): 1 +; CHECK-UNWINDER: (67e, 6ad): 13 +; CHECK-UNWINDER: (6bd, 6c8): 14 + +; CHECK-UNWINDER: Binary(inline-cs-noprobe.perfbin)'s Branch Counter: +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (69b, 670): 1 +; CHECK-UNWINDER: (6c8, 67e): 15 + +; original code: +; clang -O3 -g test.c -o a.out +#include + +int bar(int x, int y) { + if (x % 3) { + return x - y; + } + return x + y; +} + +void foo() { + int s, i = 0; + while (i++ < 4000 * 4000) + if (i % 91) s = bar(i, s); else s += 30; + printf("sum is %d\n", s); +} + +int main() { + foo(); + return 0; +} diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test @@ -0,0 +1,60 @@ +; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-noprobe.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER +; RUN: FileCheck %s --input-file %t + +; CHECK:[main:1 @ foo:3 @ bar]:12:3 +; CHECK: 0: 3 +; CHECK: 1: 3 +; CHECK: 2: 2 +; CHECK: 4: 1 +; CHECK: 5: 3 +; CHECK:[main:1 @ foo]:9:0 +; CHECK: 2: 3 +; CHECK: 3: 3 bar:3 + +; CHECK-UNWINDER: Binary(noinline-cs-noprobe.perfbin)'s Range Counter: +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (5ff, 62f): 3 +; CHECK-UNWINDER: (634, 637): 3 +; CHECK-UNWINDER: (645, 645): 3 +; CHECK-UNWINDER: main:1 @ foo:3 @ bar +; CHECK-UNWINDER: (5b0, 5c8): 1 +; CHECK-UNWINDER: (5b0, 5d7): 2 +; CHECK-UNWINDER: (5dc, 5e9): 1 +; CHECK-UNWINDER: (5e5, 5e9): 2 + +; CHECK-UNWINDER: 
Binary(noinline-cs-noprobe.perfbin)'s Branch Counter: +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (62f, 5b0): 3 +; CHECK-UNWINDER: (637, 645): 3 +; CHECK-UNWINDER: (645, 5ff): 3 +; CHECK-UNWINDER: main:1 @ foo:3 @ bar +; CHECK-UNWINDER: (5c8, 5dc): 2 +; CHECK-UNWINDER: (5d7, 5e5): 2 +; CHECK-UNWINDER: (5e9, 634): 3 + + + + + +; original code: +; clang -O0 -g test.c -o a.out +#include + +int bar(int x, int y) { + if (x % 3) { + return x - y; + } + return x + y; +} + +void foo() { + int s, i = 0; + while (i++ < 4000 * 4000) + if (i % 91) s = bar(i, s); else s += 30; + printf("sum is %d\n", s); +} + +int main() { + foo(); + return 0; +} diff --git a/llvm/tools/llvm-profgen/CMakeLists.txt b/llvm/tools/llvm-profgen/CMakeLists.txt --- a/llvm/tools/llvm-profgen/CMakeLists.txt +++ b/llvm/tools/llvm-profgen/CMakeLists.txt @@ -7,6 +7,7 @@ MC MCDisassembler Object + ProfileData Support Symbolize ) @@ -15,4 +16,5 @@ llvm-profgen.cpp PerfReader.cpp ProfiledBinary.cpp + ProfileGenerator.cpp ) diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -56,16 +56,238 @@ } }; +// The type of perfscript +enum PerfScriptType { + PERF_INVILID = 0, + PERF_LBR = 1, // Only LBR sample + PERF_LBR_STACK = 2, // Hybrid sample including call stack and LBR stack. +}; + +// The parsed LBR sample entry. +struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + // An artificial branch stands for a series of consecutive branches starting + // from the current binary with a transition through external code and + // eventually landing back in the current binary. + bool IsArtificial = false; + LBREntry(uint64_t S, uint64_t T, bool I) + : Source(S), Target(T), IsArtificial(I) {} +}; + +// The parsed hybrid sample including call stack and LBR stack. 
+struct HybridSample { + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Call stack recorded in FILO(leaf to root) order + std::list CallStack; + // LBR stack recorded in FIFO order + SmallVector LBRStack; + + // Used for sample aggregation + bool operator==(const HybridSample &Other) const { + if (Other.Binary != Binary) + return false; + const std::list &OtherCallStack = Other.CallStack; + const SmallVector &OtherLBRStack = Other.LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + auto Iter = CallStack.begin(); + for (auto Address : OtherCallStack) { + if (Address != *Iter++) + return false; + } + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return false; + } + return true; + } +}; + +// The state for the unwinder, it doesn't hold the data but only keep the +// pointer/index of the data, While unwinding, the CallStack is changed +// dynamicially and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // TODO: switch to use trie for call stack + std::list CallStack; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to HybridSample.LBRStack + const SmallVector &LBRStack; + // Used to iterate the address range + InstructionPointer InstPtr; + UnwindState(const HybridSample &Sample) + : Binary(Sample.Binary), CallStack(Sample.CallStack), + LBRStack(Sample.LBRStack), + InstPtr(Sample.Binary, Sample.CallStack.front()) {} + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t StackLeaf = CallStack.front(); + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. + if (StackLeaf < LBRLeaf || StackLeaf >= LBRLeaf + 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", StackLeaf) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CallStack.front() && + "IP should align with context leaf"); + } + + std::string getExpandedContextStr() const { + return Binary->getExpandedContextStr(CallStack); + } + const ProfiledBinary *getBinary() const { return Binary; } + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + void advanceLBR() { LBRIndex++; } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. +using BranchSample = std::map, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. 
+using RangeSample = std::map, uint64_t>; +// Range sample counters indexed by the context string +using ContextRangeCounter = std::unordered_map; +// Branch sample counters indexed by the context string +using ContextBranchCounter = std::unordered_map; + +// For Hybrid sample counters +struct ContextSampleCounters { + ContextRangeCounter RangeCounter; + ContextBranchCounter BranchCounter; + + void recordRangeCount(std::string &ContextId, uint64_t Start, uint64_t End, + uint64_t Repeat) { + RangeCounter[ContextId][{Start, End}] += Repeat; + } + void recordBranchCount(std::string &ContextId, uint64_t Source, + uint64_t Target, uint64_t Repeat) { + BranchCounter[ContextId][{Source, Target}] += Repeat; + } +}; + +struct HybridSampleHash { + uint64_t hashCombine(uint64_t Hash, uint64_t Value) const { + // Simple DJB2 hash + return ((Hash << 5) + Hash) + Value; + } + + uint64_t operator()(const HybridSample &Sample) const { + uint64_t Hash = 5381; + Hash = hashCombine(Hash, reinterpret_cast(Sample.Binary)); + for (const auto &Value : Sample.CallStack) { + Hash = hashCombine(Hash, Value); + } + for (const auto &Entry : Sample.LBRStack) { + Hash = hashCombine(Hash, Entry.Source); + Hash = hashCombine(Hash, Entry.Target); + } + return Hash; + } +}; + +// After parsing the sample, we record the samples by aggregating them +// into this structure and the value is the sample counter. +using AggregationCounter = + std::unordered_map; + +/* +Since a hybrid sample contains a group of LBRs and the most recently sampled call +stack, we can walk through those LBRs to infer more call stacks which will be +used as contexts for the profile. VirtualUnwinder is the class that does the call stack +unwinding based on LBR state. Two types of unwinding are processed here: +1) LBR unwinding and 2) linear range unwinding. +Specifically, for each LBR entry (which can be classified as a call, return or regular +branch), LBR unwinding replays the operation by pushing, popping or +switching the leaf frame of the call stack. Since the initial call stack +is the most recently sampled one, the replay is done in anti-execution order, i.e. in +the regular case, pop the call stack when the LBR is a call and push a frame on the call stack +when the LBR is a return. After each LBR is processed, it also needs to align with the +next LBR by going through the instructions from the previous LBR's target to the current +LBR's source, which is the linear unwinding. As instructions in a linear range +can come from different functions due to inlining, linear unwinding does the range +splitting and records counters by the range with the same inline context. Over this +unwinding process we record each call stack as a context id and each LBR/linear +range as a sample counter for further CS profile generation. +*/ +class VirtualUnwinder { +public: + VirtualUnwinder(ContextSampleCounters *Counters) : SampleCounters(Counters) {} + + bool isCallState(UnwindState &State) const { + // The tail call frame is always missing here in the stack sample; we will + // use a specific tail call tracker to infer it. + return State.getBinary()->addressIsCall(State.getCurrentLBRSource()); + } + + bool isReturnState(UnwindState &State) const { + // Simply check addressIsReturn, as ret is always reliable, both for + // regular call and tail call.
+ return State.getBinary()->addressIsReturn(State.getCurrentLBRSource()); + } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranchWithinFrame(UnwindState &State); + bool unwind(const HybridSample &Sample, uint64_t Repeat); + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + +private: + ContextSampleCounters *SampleCounters; +}; + // Filename to binary map using BinaryMap = StringMap; // Address to binary map for fast look-up using AddressBinaryMap = std::map; +// Binary to ContextSampleCounters Map to support multiple binaries. We may have +// the same binary loaded at different addresses; they should share the same sample +// counter +using BinarySampleCounterMap = + std::unordered_map; // Load binaries and read perf trace to parse the events and samples class PerfReader { - BinaryMap BinaryTable; - AddressBinaryMap AddrToBinaryMap; // Used by address-based lookup. +public: + PerfReader(cl::list &BinaryFilenames); + + // Hybrid sample (call stack + LBRs) profile traces are separated by a double line + // break; search for that within the first 4k characters to avoid going + // through the whole file. + static bool isHybridPerfScript(StringRef FileName) { + auto BufOrError = MemoryBuffer::getFileOrSTDIN(FileName, 4000); + if (!BufOrError) + exitWithError(BufOrError.getError(), FileName); + auto Buffer = std::move(BufOrError.get()); + if (Buffer->getBuffer().find("\n\n") == StringRef::npos) + return false; + return true; + } // The parsed MMap event struct MMapEvent { @@ -82,18 +304,48 @@ ProfiledBinary &loadBinary(const StringRef BinaryPath, bool AllowNameConflict = true); void updateBinaryAddress(const MMapEvent &Event); + PerfScriptType getPerfScriptType() const { return PerfType; } + // Entry of the reader to parse multiple perf traces + void parsePerfTraces(cl::list &PerfTraceFilenames); + const BinarySampleCounterMap &getBinarySampleCounters() const { + return BinarySampleCounters; + } -public: - PerfReader(cl::list &BinaryFilenames); - +private: /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a /// mapping between the binary name and its memory layout. /// void parseMMap2Event(TraceStream &TraceIt); - void parseEvent(TraceStream &TraceIt); - // Parse perf events and samples - void parseTrace(StringRef Filename); - void parsePerfTraces(cl::list &PerfTraceFilenames); + // Parse perf events/samples and do aggregation + void parseAndAggregateTrace(StringRef Filename); + // Parse either an MMAP event or a perf sample + void parseEventOrSample(TraceStream &TraceIt); + // Parse the hybrid sample including the call and LBR line + void parseHybridSample(TraceStream &TraceIt); + // Extract call stack from the perf trace lines + bool extractCallstack(TraceStream &TraceIt, std::list &CallStack); + // Extract LBR stack from one perf trace line + bool extractLBRStack(TraceStream &TraceIt, + SmallVector &LBRStack, + ProfiledBinary *Binary); + void checkAndSetPerfType(cl::list &PerfTraceFilenames); + // Post-process the profile after trace aggregation; we will do simple range + // overlap computation for AutoFDO, or unwinding for CSSPGO (hybrid sample).
+ void generateRawProfile(); + // Unwind the hybrid samples after aggregration + void unwindSamples(); + void printUnwinderOutput(); + // Helper function for looking up binary in AddressBinaryMap + ProfiledBinary *getBinary(uint64_t Address); + + BinaryMap BinaryTable; + AddressBinaryMap AddrToBinaryMap; // Used by address-based lookup. + +private: + BinarySampleCounterMap BinarySampleCounters; + // Samples with the repeating time generated by the perf reader + AggregationCounter AggregatedSamples; + PerfScriptType PerfType; }; } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -11,9 +11,136 @@ cl::init(false), cl::ZeroOrMore, cl::desc("Print binary load events.")); +static cl::opt ShowUnwinderOutput("show-unwinder-output", + cl::ReallyHidden, cl::init(false), + cl::ZeroOrMore, + cl::desc("Print unwinder output")); + namespace llvm { namespace sampleprof { +void VirtualUnwinder::unwindCall(UnwindState &State) { + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. + // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. + uint64_t Source = State.getCurrentLBRSource(); + auto Iter = State.CallStack.begin(); + if (State.CallStack.size() == 1 || *(++Iter) != Source) { + State.CallStack.front() = Source; + } else { + State.CallStack.pop_front(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + // Unwind linear execution part + while (IP.Address >= Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = + State.getBinary()->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee || PrevIP == Target) { + recordRangeCount(PrevIP, End, State, Repeat); + End = IP.Address; + } + State.CallStack.front() = IP.Address; + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target); + State.CallStack.front() = CallAddr; + State.CallStack.push_front(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
+ uint64_t Source = State.getCurrentLBRSource(); + State.CallStack.front() = Source; + State.InstPtr.update(Source); +} + +void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End, + UnwindState &State, uint64_t Repeat) { + std::string &&ContextId = State.getExpandedContextStr(); + uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start); + uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End); + SampleCounters->recordRangeCount(ContextId, StartOffset, EndOffset, Repeat); +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.IsArtificial) + return; + std::string &&ContextId = State.getExpandedContextStr(); + uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source); + uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target); + SampleCounters->recordBranchCount(ContextId, SourceOffset, TargetOffset, + Repeat); +} + +bool VirtualUnwinder::unwind(const HybridSample &Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. + UnwindState State(Sample); + + // Sanity check - making sure the leaf of the LBR aligns with the leaf of the stack sample. + // Stack samples can sometimes be unreliable, so filter out bogus ones. + if (!State.validateInitialState()) + return false; + + // Also do not attempt linear unwind for the leaf range as it's incomplete. + bool IsLeaf = true; + + // Now process the LBR samples in parallel with the stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries. + while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Unwind implicit calls/returns from inlining, along the linear path, + // breaking it into smaller sub-sections each with its own calling context. + if (!IsLeaf) { + unwindLinear(State, Repeat); + } + IsLeaf = false; + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + + if (isCallState(State)) { + // Unwind calls - we know we encountered a call if the LBR overlaps with the + // transition between the leaf and the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return instruction + unwindReturn(State); + } else { + // Unwind branches - for regular intra-function branches, we only + // need to record the branch with its context. + unwindBranchWithinFrame(State); + } + State.advanceLBR(); + // Record `branch` with calling context after unwinding. + recordBranchCount(Branch, State, Repeat); + } + + return true; +} + PerfReader::PerfReader(cl::list &BinaryFilenames) { // Load the binaries.
for (auto Filename : BinaryFilenames) @@ -61,6 +188,208 @@ Binary.setBaseAddress(Event.BaseAddress); } +ProfiledBinary *PerfReader::getBinary(uint64_t Address) { + auto Iter = AddrToBinaryMap.lower_bound(Address); + if (Iter == AddrToBinaryMap.end() || Iter->first != Address) { + if (Iter == AddrToBinaryMap.begin()) + return nullptr; + Iter--; + } + return Iter->second; +} + +static void printSampleCounter(ContextRangeCounter &Counter) { + for (auto Range : Counter) { + outs() << Range.first << "\n"; + for (auto I : Range.second) { + outs() << " (" << format("%" PRIx64, I.first.first) << ", " + << format("%" PRIx64, I.first.second) << "): " << I.second << "\n"; + } + } +} + +void PerfReader::printUnwinderOutput() { + for (auto I : BinarySampleCounters) { + const ProfiledBinary *Binary = I.first; + outs() << "Binary(" << Binary->getName().str() << ")'s Range Counter:\n"; + printSampleCounter(I.second.RangeCounter); + outs() << "\nBinary(" << Binary->getName().str() << ")'s Branch Counter:\n"; + printSampleCounter(I.second.BranchCounter); + } +} + +void PerfReader::unwindSamples() { + for (const auto &Item : AggregatedSamples) { + const HybridSample &Sample = Item.first; + VirtualUnwinder Unwinder(&BinarySampleCounters[Sample.Binary]); + Unwinder.unwind(Sample, Item.second); + } + + if (ShowUnwinderOutput) + printUnwinderOutput(); +} + +bool PerfReader::extractLBRStack(TraceStream &TraceIt, + SmallVector &LBRStack, + ProfiledBinary *Binary) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and seperated by whitespace. + SmallVector Records; + TraceIt.getCurrentLine().split(Records, " "); + + // Extract leading instruction pointer if present, use single + // list to pass out as reference. + size_t Index = 0; + if (!Records.empty() && Records[0].find('/') == StringRef::npos) { + Index = 1; + } + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + uint64_t PrevTrDst = 0; + + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + Addresses[0].substr(2).getAsInteger(16, Src); + Addresses[1].substr(2).getAsInteger(16, Dst); + + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + bool IsArtificial = false; + // Ignore branches outside the current binary. + if (!SrcIsInternal && !DstIsInternal) + continue; + if (!SrcIsInternal && DstIsInternal) { + // For transition from external code (such as dynamic libraries) to + // the current binary, keep track of the branch target which will be + // grouped with the Source of the last transition from the current + // binary. + PrevTrDst = Dst; + continue; + } + if (SrcIsInternal && !DstIsInternal) { + // For transition to external code, group the Source with the next + // availabe transition target. 
+ if (!PrevTrDst) + continue; + Dst = PrevTrDst; + PrevTrDst = 0; + IsArtificial = true; + } + // TODO: filter out buggy duplicate branches on Skylake + + LBRStack.emplace_back(LBREntry(Src, Dst, IsArtificial)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfReader::extractCallstack(TraceStream &TraceIt, + std::list &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. + + // Extract stack frames from sample + ProfiledBinary *Binary = nullptr; + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + // We might get an empty line or comments at the beginning; skip them + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + TraceIt.advance(); + break; + } + TraceIt.advance(); + if (!Binary) { + Binary = getBinary(FrameAddr); + // We might have an address that doesn't match any MMAP event; skip it + if (!Binary) { + if (AddrToBinaryMap.size() == 0) + WithColor::warning() << "No MMAP event in the perfscript, create it " + "with '--show-mmap-events'\n"; + break; + } + } + // Currently intermixing frames from different binaries is not supported. + // Ignore bottom frames not from the binary of interest. + if (!Binary->addressIsCode(FrameAddr)) + break; + + // We need to translate the return address to the call address + // for non-leaf frames + if (!CallStack.empty()) { + FrameAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + } + + CallStack.emplace_back(FrameAddr); + } + + if (CallStack.empty()) + return false; + // Skip other unrelated lines and find the next valid LBR line + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack samples. We may not have complete frame info + // if the sample ends up in the prolog/epilog; the result is a dangling context not + // connected to the entry point. This should be relatively rare and thus has little + // impact on overall profile quality. However we do want to filter such samples + // out to reduce the number of different calling contexts. One instance + // of such a case - when the sample lands in the prolog/epilog, stack + // walking can break in an unexpected way such that higher frames are + // missing. + return !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfReader::parseHybridSample(TraceStream &TraceIt) { + // The raw hybrid sample starts with the call stack in FILO order, followed + // immediately by the LBR sample + // e.g. + // 4005dc # call stack leaf + // 400634 + // 400684 # call stack root + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ...
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries + // + HybridSample Sample; + + // Parsing call stack and populate into HybridSample.CallStack + if (!extractCallstack(TraceIt, Sample.CallStack)) { + // Skip the next LBR line matched current call stack + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) + TraceIt.advance(); + return; + } + // Set the binary current sample belongs to + Sample.Binary = getBinary(Sample.CallStack.front()); + + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) { + // Parsing LBR stack and populate into HybridSample.LBRStack + if (extractLBRStack(TraceIt, Sample.LBRStack, Sample.Binary)) { + // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR + // ranges + Sample.CallStack.front() = Sample.LBRStack[0].Target; + // Record samples by aggregation + AggregatedSamples[Sample]++; + } + } else { + // LBR sample is encoded in single line after stack sample + exitWithError("'Hybrid perf sample is corrupted, No LBR sample line"); + } +} + void PerfReader::parseMMap2Event(TraceStream &TraceIt) { // Parse a line like: // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 @@ -104,27 +433,74 @@ outs() << "Mmap: Binary " << Event.BinaryPath << " loaded at " << format("0x%" PRIx64 ":", Event.BaseAddress) << " \n"; } + TraceIt.advance(); } -void PerfReader::parseEvent(TraceStream &TraceIt) { +void PerfReader::parseEventOrSample(TraceStream &TraceIt) { if (TraceIt.getCurrentLine().startswith("PERF_RECORD_MMAP2")) parseMMap2Event(TraceIt); - - TraceIt.advance(); + else if (getPerfScriptType() == PERF_LBR_STACK) + parseHybridSample(TraceIt); + else { + // TODO: parse other type sample + TraceIt.advance(); + } } -void PerfReader::parseTrace(StringRef Filename) { +void PerfReader::parseAndAggregateTrace(StringRef Filename) { // Trace line iterator TraceStream TraceIt(Filename); - while (!TraceIt.isAtEoF()) { - parseEvent(TraceIt); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +void PerfReader::checkAndSetPerfType( + cl::list &PerfTraceFilenames) { + bool HasHybridPerf = true; + for (auto FileName : PerfTraceFilenames) { + if (!isHybridPerfScript(FileName)) { + HasHybridPerf = false; + break; + } + } + + if (HasHybridPerf) { + // Set up ProfileIsCS to enable context-sensitive functionalities + // in SampleProf + FunctionSamples::ProfileIsCS = true; + PerfType = PERF_LBR_STACK; + + } else { + // TODO: Support other type of perf script + PerfType = PERF_INVILID; + } + + if (BinaryTable.size() > 1) { + // TODO: remove this if everything is ready to support multiple binaries. + exitWithError("Currently only support one input binary, multiple binaries' " + "profile will be merged in one profile and make profile " + "summary info inaccurate. Please use `perfdata` to merge " + "profiles from multiple binaries."); + } +} + +void PerfReader::generateRawProfile() { + if (getPerfScriptType() == PERF_LBR_STACK) { + // Unwind samples if it's hybird sample + unwindSamples(); + } else if (getPerfScriptType() == PERF_LBR) { + // TODO: range overlap computation for regular AutoFDO } } void PerfReader::parsePerfTraces(cl::list &PerfTraceFilenames) { - // Parse perf traces. + // Check and set current perfscript type + checkAndSetPerfType(PerfTraceFilenames); + // Parse perf traces and do aggregation. 
for (auto Filename : PerfTraceFilenames) - parseTrace(Filename); + parseAndAggregateTrace(Filename); + + generateRawProfile(); } } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -0,0 +1,96 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/ProfileData/SampleProfWriter.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +class ProfileGenerator { + +public: + ProfileGenerator(){}; + virtual ~ProfileGenerator() = default; + static std::unique_ptr + create(const BinarySampleCounterMap &SampleCounters, + enum PerfScriptType SampleType); + virtual void generateProfile() = 0; + + // Use SampleProfileWriter to serialize profile map + void write(); + +protected: + /* + For each region boundary point, mark if it is begin or end (or both) of + the region. Boundary points are inclusive. Log the sample count as well + so we can use it when we compute the sample count of each disjoint region + later. Note that there might be multiple ranges with different sample + count that share same begin/end point. We need to accumulate the sample + count for the boundary point for such case, because for the example + below, + + |<--100-->| + |<------200------>| + A B C + + sample count for disjoint region [A,B] would be 300. + */ + void findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges); + + // Used by SampleProfileWriter + StringMap ProfileMap; +}; + +class CSProfileGenerator : public ProfileGenerator { + const BinarySampleCounterMap &BinarySampleCounters; + +public: + CSProfileGenerator(const BinarySampleCounterMap &Counters) + : BinarySampleCounters(Counters){}; + +public: + void generateProfile() override { + // Fill in function body samples + populateFunctionBodySamples(); + + // Fill in boundary sample counts as well as call site samples for calls + populateFunctionBoundarySamples(); + + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. 
+ populateInferredFunctionSamples(); + } + +private: + // Helper function for updating the body sample for a leaf location in + // FunctionProfile + void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, + const FrameLocation &LeafLoc, + uint64_t Count); + // Lookup or create FunctionSamples for the context + FunctionSamples &getFunctionProfileForContext(StringRef ContextId); + void populateFunctionBodySamples(); + void populateFunctionBoundarySamples(); + void populateInferredFunctionSamples(); +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -0,0 +1,329 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfileGenerator.h" + +static cl::opt OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); + +static cl::opt OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Text), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding"), + clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), + clEnumValN(SPF_Text, "text", "Text encoding (default)"), + clEnumValN(SPF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +std::unique_ptr +ProfileGenerator::create(const BinarySampleCounterMap &BinarySampleCounters, + enum PerfScriptType SampleType) { + std::unique_ptr ProfileGenerator; + + if (SampleType == PERF_LBR_STACK) { + ProfileGenerator.reset(new CSProfileGenerator(BinarySampleCounters)); + } else { + // TODO: + llvm_unreachable("Unsupported perfscript!"); + } + + return ProfileGenerator; +} + +void ProfileGenerator::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + auto Writer = std::move(WriterOrErr.get()); + Writer->write(ProfileMap); +} + +void ProfileGenerator::findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges) { + + /* + Regions may overlap with each other. Using the boundary info, find all + disjoint ranges and their sample count. BoundaryPoint contains the count + of the multiple samples that begin/end at the same point. + + |<--100-->| Sample1 + |<------200------>| Sample2 + A B C + + In the example above, + Sample1 begins at A, ends at B, its value is 100. + Sample2 begins at A, ends at C, its value is 200. + For A, BeginCount is the sum of the samples that begin at A, which is 300, and no + samples end at A, so EndCount is 0.
+ Then boundary points A, B, and C with begin/end counts are: + A: (300, 0) + B: (0, 100) + C: (0, 200) + */ + struct BoundaryPoint { + // Sum of sample counts beginning at this point + uint64_t BeginCount; + // Sum of sample counts ending at this point + uint64_t EndCount; + + BoundaryPoint() : BeginCount(0), EndCount(0){}; + + void addBeginCount(uint64_t Count) { BeginCount += Count; } + + void addEndCount(uint64_t Count) { EndCount += Count; } + }; + + /* + For the above example, with those boundary points, the following logic finds two + disjoint regions: + + [A,B]: 300 + [B+1,C]: 200 + + If there is a boundary point that is both a begin and an end, the point itself + becomes a separate disjoint region. For example, if we have original + ranges of + + |<--- 100 --->| + |<--- 200 --->| + A B C + + there are three boundary points with their begin/end counts of + + A: (100, 0) + B: (200, 100) + C: (0, 200) + + the disjoint ranges would be + + [A, B-1]: 100 + [B, B]: 300 + [B+1, C]: 200. + */ + std::map Boundaries; + + for (auto Item : Ranges) { + uint64_t Begin = Item.first.first; + uint64_t End = Item.first.second; + uint64_t Count = Item.second; + if (Boundaries.find(Begin) == Boundaries.end()) + Boundaries[Begin] = BoundaryPoint(); + Boundaries[Begin].addBeginCount(Count); + + if (Boundaries.find(End) == Boundaries.end()) + Boundaries[End] = BoundaryPoint(); + Boundaries[End].addEndCount(Count); + } + + uint64_t BeginAddress = 0; + int Count = 0; + for (auto Item : Boundaries) { + uint64_t Address = Item.first; + BoundaryPoint &Point = Item.second; + if (Point.BeginCount) { + if (BeginAddress) + DisjointRanges[{BeginAddress, Address - 1}] = Count; + Count += Point.BeginCount; + BeginAddress = Address; + } + if (Point.EndCount) { + assert(BeginAddress && "First boundary point cannot be 'end' point"); + DisjointRanges[{BeginAddress, Address}] = Count; + Count -= Point.EndCount; + BeginAddress = Address + 1; + } + } +} + +FunctionSamples & +CSProfileGenerator::getFunctionProfileForContext(StringRef ContextStr) { + auto Ret = ProfileMap.try_emplace(ContextStr, FunctionSamples()); + if (Ret.second) { + SampleContext FContext(Ret.first->first(), RawContext); + FunctionSamples &FProfile = Ret.first->second; + FProfile.setName(FContext.getName()); + FProfile.setContext(FContext); + } + return Ret.first->second; +} + +void CSProfileGenerator::updateBodySamplesforFunctionProfile( + FunctionSamples &FunctionProfile, const FrameLocation &LeafLoc, + uint64_t Count) { + // Filter out invalid negative (int type) lineOffset + if (LeafLoc.second.LineOffset & 0x80000000) + return; + // Use the maximum count of samples with the same line location + ErrorOr R = FunctionProfile.findSamplesAt( + LeafLoc.second.LineOffset, LeafLoc.second.Discriminator); + uint64_t PreviousCount = R ? R.get() : 0; + if (PreviousCount < Count) { + FunctionProfile.addBodySamples(LeafLoc.second.LineOffset, + LeafLoc.second.Discriminator, + Count - PreviousCount); + FunctionProfile.addTotalSamples(Count - PreviousCount); + } +} + +void CSProfileGenerator::populateFunctionBodySamples() { + for (const auto &BI : BinarySampleCounters) { + ProfiledBinary *Binary = BI.first; + for (const auto &CI : BI.second.RangeCounter) { + StringRef ContextId(CI.first); + // Get or create function profile for the range + FunctionSamples &FunctionProfile = + getFunctionProfileForContext(ContextId); + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location.
+ RangeSample Ranges; + findDisjointRanges(Ranges, CI.second); + + for (auto Range : Ranges) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. + if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + while (IP.Address <= RangeEnd) { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(Offset); + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc, Count); + // Move to next IP within the range + IP.advance(); + } + } + } + } +} + +void CSProfileGenerator::populateFunctionBoundarySamples() { + for (const auto &BI : BinarySampleCounters) { + ProfiledBinary *Binary = BI.first; + for (const auto &CI : BI.second.BranchCounter) { + StringRef ContextId(CI.first); + // Get or create function profile for branch Source + FunctionSamples &FunctionProfile = + getFunctionProfileForContext(ContextId); + + for (auto Entry : CI.second) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + // Get the callee name by branch target if it's a call branch + StringRef CalleeName = FunctionSamples::getCanonicalFnName( + Binary->getFuncFromStartOffset(TargetOffset)); + if (CalleeName.size() == 0) + continue; + + // Record called target sample and its count + const FrameLocation &LeafLoc = + Binary->getInlineLeafFrameLoc(SourceOffset); + + FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset, + LeafLoc.second.Discriminator, + CalleeName, Count); + FunctionProfile.addTotalSamples(Count); + + // Record head sample for called target(callee) + // TODO: Cleanup ' @ ' + std::string CalleeContextId = + getCallSite(LeafLoc) + " @ " + CalleeName.str(); + if (ContextId.find(" @ ") != StringRef::npos) { + CalleeContextId = + ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId; + } + + if (ProfileMap.find(CalleeContextId) != ProfileMap.end()) { + FunctionSamples &CalleeProfile = ProfileMap[CalleeContextId]; + assert(Count != 0 && "Unexpected zero weight branch"); + if (CalleeProfile.getName().size()) { + CalleeProfile.addHeadSamples(Count); + } + } + } + } + } +} + +static FrameLocation getCallerContext(StringRef CalleeContext, + StringRef &CallerNameWithContext) { + StringRef CallerContext = CalleeContext.rsplit(" @ ").first; + CallerNameWithContext = CallerContext.rsplit(':').first; + auto ContextSplit = CallerContext.rsplit(" @ "); + FrameLocation LeafFrameLoc = {"", {0, 0}}; + StringRef Funcname; + SampleContext::decodeContextString(ContextSplit.second, Funcname, + LeafFrameLoc.second); + LeafFrameLoc.first = Funcname.str(); + return LeafFrameLoc; +} + +void CSProfileGenerator::populateInferredFunctionSamples() { + for (const auto &Item : ProfileMap) { + const StringRef CalleeContext = Item.first(); + const FunctionSamples &CalleeProfile = Item.second; + + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. 
+ if (CalleeProfile.getHeadSamples()) + continue; + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CalleeContext.find(" @ ") == StringRef::npos) + continue; + + // Infer Caller's frame loc and context ID through string splitting + StringRef CallerContextId; + FrameLocation &&CallerLeafFrameLoc = + getCallerContext(CalleeContext, CallerContextId); + + // It's possible that we haven't seen any sample directly in the caller, + // in which case CallerProfile will not exist. But we can't modify + // ProfileMap while iterating it. + // TODO: created function profile for those callers too + if (ProfileMap.find(CallerContextId) == ProfileMap.end()) + continue; + FunctionSamples &CallerProfile = ProfileMap[CallerContextId]; + + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile.getEntrySamples(); + // If we don't have samples with location, use 1 to indicate live. + if (!EstimatedCallCount && !CalleeProfile.getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples( + CallerLeafFrameLoc.second.LineOffset, + CallerLeafFrameLoc.second.Discriminator, CalleeProfile.getName(), + EstimatedCallCount); + updateBodySamplesforFunctionProfile(CallerProfile, CallerLeafFrameLoc, + EstimatedCallCount); + } +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -24,13 +24,18 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/Path.h" +#include #include +#include #include #include #include #include +using namespace llvm; +using namespace sampleprof; using namespace llvm::object; namespace llvm { @@ -40,14 +45,50 @@ struct InstructionPointer { ProfiledBinary *Binary; - // Offset to the base address of the executable segment of the binary. - uint64_t Offset; + union { + // Offset of the executable segment of the binary. + uint64_t Offset = 0; + // Also used as address in unwinder + uint64_t Address; + }; // Index to the sorted code address array of the binary. - uint64_t Index; + uint64_t Index = 0; + InstructionPointer(ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + void advance(); + void backward(); + void update(uint64_t Addr); +}; + +// PrologEpilog offset tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog offsets. Used by virtual unwinding. 
+ std::unordered_set PrologEpilogSet; + ProfiledBinary *Binary; + PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){}; + + // Take the two addresses from the start of function as prolog + void inferPrologOffsets( + std::unordered_map &FuncStartAddrMap) { + for (auto I : FuncStartAddrMap) { + PrologEpilogSet.insert(I.first); + InstructionPointer IP(Binary, I.first); + IP.advance(); + PrologEpilogSet.insert(IP.Offset); + } + } - InstructionPointer(ProfiledBinary *Binary, uint64_t Offset) - : Binary(Binary), Offset(Offset) { - Index = 0; + // Take the last two addresses before the return address as epilog + void inferEpilogOffsets(std::unordered_set &RetAddrs) { + for (auto Addr : RetAddrs) { + PrologEpilogSet.insert(Addr); + InstructionPointer IP(Binary, Addr); + IP.backward(); + PrologEpilogSet.insert(IP.Offset); + } } }; @@ -67,12 +108,14 @@ std::unique_ptr MII; std::unique_ptr DisAsm; std::unique_ptr MIA; - std::unique_ptr IP; + std::unique_ptr IPrinter; // A list of text sections sorted by start RVA and size. Used to check // if a given RVA is a valid code address. std::set> TextSections; // Function offset to name mapping. std::unordered_map FuncStartAddrMap; + // Offset to context location map. Used to expand the context. + std::unordered_map Offset2LocStackMap; // An array of offsets of all instructions sorted in increasing order. The // sorting is needed to fast advance to the next forward/backward instruction. std::vector CodeAddrs; @@ -81,9 +124,10 @@ // A set of return instruction offsets. Used by virtual unwinding. std::unordered_set RetAddrs; + PrologEpilogTracker ProEpilogTracker; + // The symbolizer used to get inline context for an instruction. std::unique_ptr Symbolizer; - void setPreferredBaseAddress(const ELFObjectFileBase *O); // Set up disassembler and related components. @@ -97,7 +141,8 @@ bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes, SectionSymbolsTy &Symbols, const SectionRef &Section); /// Symbolize a given instruction pointer and return a full call context. - FrameLocationStack symbolize(const InstructionPointer &I); + FrameLocationStack symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName = false); /// Decode the interesting parts of the binary and build internal data /// structures. On high level, the parts of interest are: @@ -107,17 +152,81 @@ /// 3. Pseudo probe related sections, used by probe-based profile /// generation. 
void load(); + const FrameLocationStack &getFrameLocationStack(uint64_t Offset) const { + auto I = Offset2LocStackMap.find(Offset); + assert(I != Offset2LocStackMap.end() && + "Can't find location for offset in the binary"); + return I->second; + } public: - ProfiledBinary(StringRef Path) : Path(Path) { + ProfiledBinary(StringRef Path) : Path(Path), ProEpilogTracker(this) { setupSymbolizer(); load(); } - + uint64_t virtualAddrToOffset(uint64_t VitualAddress) const { + return VitualAddress - BaseAddress; + } + uint64_t offsetToVirtualAddr(uint64_t Offset) const { + return Offset + BaseAddress; + } const StringRef getPath() const { return Path; } const StringRef getName() const { return llvm::sys::path::filename(Path); } uint64_t getBaseAddress() const { return BaseAddress; } void setBaseAddress(uint64_t Address) { BaseAddress = Address; } + uint64_t getPreferredBaseAddress() const { return PreferredBaseAddress; } + + bool addressIsCode(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return Offset2LocStackMap.find(Offset) != Offset2LocStackMap.end(); + } + bool addressIsCall(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return CallAddrs.count(Offset); + } + bool addressIsReturn(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return RetAddrs.count(Offset); + } + bool addressInPrologEpilog(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return ProEpilogTracker.PrologEpilogSet.count(Offset); + } + + uint64_t getAddressforIndex(uint64_t Index) const { + return offsetToVirtualAddr(CodeAddrs[Index]); + } + + // Get the index in CodeAddrs for the address + // As we might get an address which is not the code + // here it would round to the next valid code address by + // using lower bound operation + uint32_t getIndexForAddr(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + auto Low = std::lower_bound(CodeAddrs.begin(), CodeAddrs.end(), Offset); + return Low - CodeAddrs.begin(); + } + + uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + return getAddressforIndex(getIndexForAddr(FrameAddr) - 1); + } + + StringRef getFuncFromStartOffset(uint64_t Offset) { + return FuncStartAddrMap[Offset]; + } + + const FrameLocation &getInlineLeafFrameLoc(uint64_t Offset, + bool NameOnly = false) { + return getFrameLocationStack(Offset).back(); + } + + // Compare two addresses' inline context + bool inlineContextEqual(uint64_t Add1, uint64_t Add2) const; + + // Get the context string of the current stack with inline context filled in. + // It will search the disassembling info stored in Offset2LocStackMap. This is + // used as the key of function sample map + std::string getExpandedContextStr(const std::list &stack) const; }; } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -18,6 +18,7 @@ #define DEBUG_TYPE "load-binary" using namespace llvm; +using namespace sampleprof; static cl::opt ShowDisassembly("show-disassembly", cl::ReallyHidden, cl::init(false), cl::ZeroOrMore, @@ -95,11 +96,63 @@ // Disassemble the text sections. disassemble(Obj); + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologOffsets(FuncStartAddrMap); + ProEpilogTracker.inferEpilogOffsets(RetAddrs); + // TODO: decode other sections. 
return; } +bool ProfiledBinary::inlineContextEqual(uint64_t Address1, + uint64_t Address2) const { + uint64_t Offset1 = virtualAddrToOffset(Address1); + uint64_t Offset2 = virtualAddrToOffset(Address2); + const FrameLocationStack &Context1 = getFrameLocationStack(Offset1); + const FrameLocationStack &Context2 = getFrameLocationStack(Offset2); + if (Context1.size() != Context2.size()) + return false; + + // The leaf frame contains location within the leaf, and it + // needs to be remove that as it's not part of the calling context + return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1, + Context2.begin(), Context2.begin() + Context2.size() - 1); +} + +std::string +ProfiledBinary::getExpandedContextStr(const std::list &Stack) const { + std::string ContextStr; + SmallVector ContextVec; + // Process from frame root to leaf + for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) { + uint64_t Offset = virtualAddrToOffset(*Iter); + const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset); + for (const auto &Loc : ExpandedContext) { + ContextVec.push_back(getCallSite(Loc)); + } + } + + assert(ContextVec.size() && "Context length should be at least 1"); + + std::ostringstream OContextStr; + for (uint32_t I = 0; I < (uint32_t)ContextVec.size(); I++) { + if (OContextStr.str().size()) { + OContextStr << " @ "; + } + + if (I == ContextVec.size() - 1) { + // Only keep the function name for the leaf frame + StringRef Ref(ContextVec[I]); + OContextStr << Ref.split(":").first.str(); + } else { + OContextStr << ContextVec[I]; + } + } + + return OContextStr.str(); +} + void ProfiledBinary::setPreferredBaseAddress(const ELFObjectFileBase *Obj) { for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); SI != SE; ++SI) { @@ -142,7 +195,7 @@ if (ShowDisassembly) { outs() << format("%8" PRIx64 ":", Offset); size_t Start = outs().tell(); - IP->printInst(&Inst, Offset + Size, "", *STI.get(), outs()); + IPrinter->printInst(&Inst, Offset + Size, "", *STI.get(), outs()); if (ShowSourceLocations) { unsigned Cur = outs().tell() - Start; if (Cur < 40) @@ -155,6 +208,10 @@ const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + // Populate a vector of the symbolized callsite at this location + InstructionPointer IP(this, Offset); + Offset2LocStackMap[Offset] = symbolize(IP, true); + // Populate address maps. 
CodeAddrs.push_back(Offset); if (MCDesc.isCall()) @@ -206,9 +263,9 @@ MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); - IP.reset(TheTarget->createMCInstPrinter(Triple(TripleName), AsmPrinterVariant, - *AsmInfo, *MII, *MRI)); - IP->setPrintBranchImmAsAddress(true); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); } void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) { @@ -283,7 +340,8 @@ Symbolizer = std::make_unique(SymbolizerOpts); } -FrameLocationStack ProfiledBinary::symbolize(const InstructionPointer &IP) { +FrameLocationStack ProfiledBinary::symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName) { assert(this == IP.Binary && "Binary should only symbolize its own instruction"); auto Addr = object::SectionedAddress{IP.Offset + PreferredBaseAddress, @@ -297,14 +355,43 @@ const auto &CallerFrame = InlineStack.getFrame(I); if (CallerFrame.FunctionName == "") break; + StringRef FunctionName(CallerFrame.FunctionName); + if (UseCanonicalFnName) + FunctionName = FunctionSamples::getCanonicalFnName(FunctionName); LineLocation Line(CallerFrame.Line - CallerFrame.StartLine, CallerFrame.Discriminator); - FrameLocation Callsite(CallerFrame.FunctionName, Line); + FrameLocation Callsite(FunctionName.str(), Line); CallStack.push_back(Callsite); } return CallStack; } +InstructionPointer::InstructionPointer(ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + // we might get address which is not the code + // it should round to the next valid address + this->Address = Binary->getAddressforIndex(Index); + } +} + +void InstructionPointer::advance() { + Index++; + Address = Binary->getAddressforIndex(Index); +} + +void InstructionPointer::backward() { + Index--; + Address = Binary->getAddressforIndex(Index); +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + } // end namespace sampleprof } // end namespace llvm diff --git a/llvm/tools/llvm-profgen/llvm-profgen.cpp b/llvm/tools/llvm-profgen/llvm-profgen.cpp --- a/llvm/tools/llvm-profgen/llvm-profgen.cpp +++ b/llvm/tools/llvm-profgen/llvm-profgen.cpp @@ -1,4 +1,4 @@ -//===- llvm-profgen.cpp - LLVM SPGO profile generation tool ---------------===// +//===- llvm-profgen.cpp - LLVM SPGO profile generation tool -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,6 +12,7 @@ #include "ErrorHandling.h" #include "PerfReader.h" +#include "ProfileGenerator.h" #include "ProfiledBinary.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -28,10 +29,6 @@ llvm::cl::MiscFlags::CommaSeparated, cl::desc("Path of profiled binary files")); -static cl::opt OutputFilename("output", cl::value_desc("output"), - cl::Required, - cl::desc("Output profile file")); - using namespace llvm; using namespace sampleprof; @@ -49,5 +46,10 @@ PerfReader Reader(BinaryFilenames); Reader.parsePerfTraces(PerfTraceFilenames); + std::unique_ptr Generator = ProfileGenerator::create( + Reader.getBinarySampleCounters(), Reader.getPerfScriptType()); + Generator->generateProfile(); + Generator->write(); + return EXIT_SUCCESS; }
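As a standalone illustration of the disjoint-range computation documented in ProfileGenerator::findDisjointRanges above, the following minimal sketch (plain std types, hypothetical addresses and counts; not code from this patch) reproduces the worked example from the comment: two overlapping ranges with counts 100 and 200 collapse into the disjoint ranges [A,B]: 300 and [B+1,C]: 200.

// Standalone sketch of the boundary-point algorithm described in
// ProfileGenerator::findDisjointRanges. Simplified types and illustrative
// values only; the tool itself works on binary offsets and context strings.
#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

using Range = std::pair<uint64_t, uint64_t>; // [begin, end], inclusive
using RangeCounts = std::map<Range, uint64_t>;

static RangeCounts findDisjointRanges(const RangeCounts &Ranges) {
  struct Boundary { uint64_t Begin = 0, End = 0; };
  // Accumulate, per address, the counts of samples beginning and ending there.
  std::map<uint64_t, Boundary> Boundaries;
  for (const auto &R : Ranges) {
    Boundaries[R.first.first].Begin += R.second;
    Boundaries[R.first.second].End += R.second;
  }

  // Sweep the boundary points in address order, keeping a running count of
  // the currently open ranges and emitting a disjoint range at each boundary.
  RangeCounts Disjoint;
  uint64_t BeginAddress = 0, Count = 0;
  for (const auto &Item : Boundaries) {
    uint64_t Address = Item.first;
    const Boundary &Point = Item.second;
    if (Point.Begin) {
      if (BeginAddress)
        Disjoint[{BeginAddress, Address - 1}] = Count;
      Count += Point.Begin;
      BeginAddress = Address;
    }
    if (Point.End) {
      Disjoint[{BeginAddress, Address}] = Count;
      Count -= Point.End;
      BeginAddress = Address + 1;
    }
  }
  return Disjoint;
}

int main() {
  // |<--100-->|            Sample1: [0x10, 0x20] seen 100 times
  // |<------200------>|    Sample2: [0x10, 0x30] seen 200 times
  RangeCounts Ranges = {{{0x10, 0x20}, 100}, {{0x10, 0x30}, 200}};
  RangeCounts Disjoint = findDisjointRanges(Ranges);
  assert((Disjoint[{0x10, 0x20}] == 300)); // [A, B] is covered by both samples
  assert((Disjoint[{0x21, 0x30}] == 200)); // [B+1, C] only by Sample2
  return 0;
}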