diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -820,6 +820,50 @@ mutable unsigned LastFn = 0; mutable unsigned Counter = ~0U; + // ASM printing is a function pass. So we won't have at any point in time + // all the functions available to extract size expressions, so we need to + // wait to doFinalize, but at that point, IR is missing. So we extract what + // we need in the function pass part. + // To that end, BlockDesc captures necessary basic block info: the expression + // evaluating its size; a measure of its latency; the list of statically-known + // calls to module-internal functions (identified by name); and the frequency + // relative to the function entry block. + struct BlockDesc { + BlockDesc(const MCExpr *SizeExp, size_t Latency, + std::vector &&InternalCalls, float Freq) + : SizeExp(SizeExp), Latency(Latency), InternalCalls(InternalCalls), + Freq(Freq) {} + + const MCExpr *const SizeExp; + const size_t Latency; + const std::vector InternalCalls; + const float Freq; + }; + + /// The reward has 2 components: IWS - a measure of icache pressure; and + /// latency. + struct Reward { + Reward() = default; + Reward(const MCExpr *IWS, float Latency) : IWS(IWS), Latency(Latency) {} + + const MCExpr *IWS = nullptr; + float Latency = 0.0f; + }; + + // BlockDescriptors stores, for each function (identified by name) the + // BlockDescs for its basic blocks. + // Since BlockDescs point to called functions by name, BlockDescriptors + // captures the static call graph (within a module). + std::map> BlockDescriptors; + // This is the list of functions that may be called from outside the module. + // We want to compute rewards only relative to these. + std::vector Entrypoints; + + const Reward getFunctionReward(const std::string &FName); + const Reward getCGReward(const std::string &FName); + + std::map CalculatedRewards; + /// This method emits the header for the current function. virtual void emitFunctionHeader(); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -127,6 +128,9 @@ #define DEBUG_TYPE "asm-printer" +// Calculating reward (iCache pressure & Latency) for MLGO +static cl::opt CalculateReward("calc-reward", cl::init(false)); + const char DWARFGroupName[] = "dwarf"; const char DWARFGroupDescription[] = "DWARF Emission"; const char DbgTimerName[] = "emit"; @@ -426,6 +430,7 @@ MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); + AU.addRequired(); } bool AsmPrinter::doInitialization(Module &M) { @@ -1548,6 +1553,11 @@ /// EmitFunctionBody - This method emits the body and trailer for a /// function. void AsmPrinter::emitFunctionBody() { + std::string FName = MF->getFunction().getName().str(); + std::vector &BlockDescs = BlockDescriptors[FName]; + if (!MF->getFunction().hasLocalLinkage()) + Entrypoints.push_back(FName); + emitFunctionHeader(); // Emit target-specific gunk before the function body. @@ -1681,10 +1691,30 @@ // We must emit temporary symbol for the end of this basic block, if either // we have BBLabels enabled or if this basic blocks marks the end of a // section. - if (MF->hasBBLabels() || + if (CalculateReward || MF->hasBBLabels() || (MAI->hasDotTypeDotSizeDirective() && MBB.isEndSection())) OutStreamer->emitLabel(MBB.getEndSymbol()); + if (CalculateReward) { + const MCExpr *Size = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB.getEndSymbol(), OutContext), + MCSymbolRefExpr::create(MBB.getSymbol(), OutContext), OutContext); + std::vector Calls; + if (const auto *BB = MBB.getBasicBlock()) { + for (const auto &I : *BB) + if (const auto *CB = dyn_cast(&I)) + if (const auto *CF = CB->getCalledFunction()) + if (!CF->isDeclaration() && !CF->isIntrinsic()) + Calls.push_back(CF->getName().str()); + + // FIXME: alternative way to estimate latency. + size_t BlockLatency = MBB.size(); + BlockDescs.emplace_back(Size, BlockLatency, std::move(Calls), + getAnalysis() + .getBlockFreqRelativeToEntryBlock(&MBB)); + } + } + if (MBB.isEndSection()) { // The size directive for the section containing the entry block is // handled separately by the function section. @@ -2294,6 +2324,24 @@ MMI = nullptr; AddrLabelSymbols = nullptr; + if (CalculateReward) { + // We output a succession of: function name, followed by a comma, followed + // by the reward, followed by a comma, and then the next function, etc. + auto *DS = OutStreamer->getContext().getELFNamedSection( + ".llvm_block_data", "", ELF::SHT_NOTE, ELF::SHF_STRINGS); + OutStreamer->switchSection(DS); + bool OldUseForParsing = OutStreamer->getUseAssemblerInfoForParsing(); + OutStreamer->setUseAssemblerInfoForParsing(true); + for (const auto &E : Entrypoints) { + const auto R = getCGReward(E); + OutStreamer->emitBytes(E); + OutStreamer->emitBytes(","); + OutStreamer->emitULEB128Value(R.IWS); + OutStreamer->emitULEB128IntValue(static_cast(R.Latency)); + } + OutStreamer->setUseAssemblerInfoForParsing(OldUseForParsing); + } + OutStreamer->finish(); OutStreamer->reset(); OwnedMLI.reset(); @@ -2302,6 +2350,100 @@ return false; } +// Outputting float MCExpr values is not supported, so, instead, we convert +// float probabilities to an integer. +int64_t getIntegralProbability(float P) { + return static_cast(100.0 * P); +} + +// For a call graph starting at FName, for each function in the transitive +// closure, we add its IWS weighed by the probability of its execution. +// The entrypoint function (FName) has probability 1 (by convention). +// Other functions have their probability as the sum of probabilities of paths +// reaching them. +// The probability along a path is calculated by multiplying the probabilities +// of call sites. +const AsmPrinter::Reward AsmPrinter::getCGReward(const std::string &FName) { + std::map FunctionFrequencies; + + // We perform a depth-first traversal and accummulate a function's probability + // to be executed. + std::set CurrentPath; + struct Workitem { + // Name of function to work on. + std::string Name; + // The frequency, on the current path, of executing the function. + + float Freq = 0.0; + // Indication that we explored the current path below here, and we should + // be able to reconsider this function if we arrive to it via some other + // path. + bool RemoveFromPath = false; + }; + + std::stack Work; + Work.push({FName, 1.0, false}); + + while (!Work.empty()) { + auto &WI = Work.top(); + if (WI.RemoveFromPath) { + CurrentPath.erase(WI.Name); + Work.pop(); + continue; + } + WI.RemoveFromPath = true; + CurrentPath.insert(WI.Name); + FunctionFrequencies[WI.Name] += WI.Freq; + for (auto &B : BlockDescriptors[WI.Name]) + for (auto &C : B.InternalCalls) + if (CurrentPath.find(C) == CurrentPath.end()) + Work.push({C, B.Freq * WI.Freq, false}); + } + // Now aggregate the CG IWS and latency. + const MCExpr *RetIWS = nullptr; + float Latency = 0.0; + for (const auto &P : FunctionFrequencies) { + if (P.second == 0) + continue; + auto FR = getFunctionReward(P.first); + int64_t IntegralProb = getIntegralProbability(std::min(1.0f, P.second)); + const MCExpr *WFIWS = MCBinaryExpr::createMul( + FR.IWS, MCConstantExpr::create(IntegralProb, OutContext), OutContext); + Latency += FR.Latency * P.second; + if (!RetIWS) + RetIWS = WFIWS; + else + RetIWS = MCBinaryExpr::createAdd(RetIWS, WFIWS, OutContext); + } + return Reward(RetIWS, Latency); +} + +// For a function, we just traverse the BBs and add their size weighed by their +// probability to be executed. +const AsmPrinter::Reward +AsmPrinter::getFunctionReward(const std::string &FName) { + auto I = CalculatedRewards.insert(std::make_pair(FName, Reward())); + if (!I.second) + return I.first->second; + + float Latency = 0.0; + const MCExpr *Prob = nullptr; + for (const auto &BD : BlockDescriptors[FName]) { + float BlockLoadProb = std::min(BD.Freq, 1.0f); + int64_t IntegralProb = getIntegralProbability(BlockLoadProb); + const auto *BP = MCBinaryExpr::createMul( + BD.SizeExp, MCConstantExpr::create(IntegralProb, OutContext), + OutContext); + Latency += BD.Latency * BD.Freq; + if (!Prob) + Prob = BP; + else + Prob = MCBinaryExpr::createAdd(BP, Prob, OutContext); + } + I.first->second = AsmPrinter::Reward(Prob, Latency); + return I.first->second; +} + MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) { auto Res = MBBSectionExceptionSyms.try_emplace(MBB.getSectionIDNum()); if (Res.second) @@ -3757,6 +3899,10 @@ bool AsmPrinter::shouldEmitLabelForBasicBlock( const MachineBasicBlock &MBB) const { + // If we calculate the reward, we want labels for each basic block, + // so we may evaluate the basic blocks' size. + if (CalculateReward) + return true; // With `-fbasic-block-sections=`, a label is needed for every non-entry block // in the labels mode (option `=labels`) and every section beginning in the // sections mode (`=all` and `=list=`). diff --git a/llvm/test/Transforms/Inline/ML/Inputs/parse_reward.py b/llvm/test/Transforms/Inline/ML/Inputs/parse_reward.py new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/ML/Inputs/parse_reward.py @@ -0,0 +1,34 @@ +import sys + +def decode_uleb(bits): + result = 0 + shift = 0 + while True: + byte = bits.pop(0) + result |= (byte & 0x7f) << shift + shift += 7 + if byte & 0x80 == 0: + break + return result + +def get_name(int_arr): + ret = '' + for ival in int_arr: + ret += chr(ival) + return ret + +def parse_one_function(data): + term_char = ord(',') + sep = data.index(term_char) + name = get_name(data[:sep]) + data = data[sep+1:] + iws = decode_uleb(data) + latency = decode_uleb(data) + print('{},{},{}'.format(name, iws, latency)) + return data + + +with open(sys.argv[1], 'rb') as f: + data = [ord(x) if int(sys.version[0]) < 3 else x for x in f.read()] + while len(data) > 0: + data = parse_one_function(data) diff --git a/llvm/test/Transforms/Inline/ML/reward-o3.ll b/llvm/test/Transforms/Inline/ML/reward-o3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/ML/reward-o3.ll @@ -0,0 +1,151 @@ +; RUN: llc -calc-reward --filetype=asm < %s | FileCheck -check-prefix=EXPR %s + +; RUN: llc -calc-reward --filetype=obj -o %t.o %s +; RUN: llvm-objcopy --dump-section=.llvm_block_data.=%t.data %t.o /dev/null +; RUN: llvm-objdump -d %t.o | FileCheck -check-prefix=DUMP %s +; RUN: %python %p/Inputs/parse_reward.py %t.data | FileCheck -check-prefix=OBJ %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +declare void @ext(); + +define i32 @f0(i32 %a1) { + %add = add i32 %a1, 1 + %cond = icmp sle i32 %add, 1 + br i1 %cond, label %yes, label %no, !prof !1 +yes: + %a2 = call i32 @f1(i32 %add) + br label %exit +no: + %a3 = call i32 @f2(i32 %add) + br label %exit +exit: + %ret = phi i32 [%a2, %yes], [%a3, %no] + ret i32 %ret +} + +define internal i32 @f1(i32 %c1) { + call void @ext() + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false, !prof !2 + +cond_false: + br label %exit + +cond_true: + %c11 = call i32 @f2(i32 %c1) + br label %exit +exit: + %c12 = phi i32 [ 0, %cond_false], [ %c11, %cond_true ] + ret i32 %c12 +} + +define i32 @f2(i32 %c1) { + call void @ext() + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false, !prof !3 + +cond_false: + %ret = call i32 @f1(i32 2) + ret i32 %ret + + cond_true: + ret i32 0 +} + +define i32 @f3() { + ret i32 1 +} + +define i32 @f4() { + %x = call i32 @f3() + br label %exit +exit: + ret i32 %x +} + +define i32 @f5(i32 %i) { + %cond = icmp sle i32 %i, 1 + br i1 %cond, label %cond_true, label %cond_false, !prof !4 +cond_true: + %ret = call i32 @f3() + ret i32 %ret +cond_false: + ret i32 0 +} + +!1 = !{!"branch_weights", i32 1, i32 3} +!2 = !{!"branch_weights", i32 2, i32 1} +!3 = !{!"branch_weights", i32 5, i32 1} +!4 = !{!"branch_weights", i32 2, i32 3} + +; the labels' first index matches the function, and the second index the block. +; for example, .LBB0_1 is the second block in f0 ("yes") +; EXPR: .ascii "f0" +; EXPR: .uleb128 (((((.LBB_END0_1-.LBB0_1)*25)+(((.LBB_END0_2-.LBB0_2)*74)+((.LBB_END0_0-.LBB0_0)*100)))*100)+((((.LBB_END1_2-.LBB1_2)*65)+((.LBB_END1_0-.LBB1_0)*100))*38))+((((.LBB_END2_1-.LBB2_1)*16)+(((.LBB_END2_2-.LBB2_2)*83)+((.LBB_END2_0-.LBB2_0)*100)))*91) +; EXPR: .ascii "f2" +; EXPR: .uleb128 ((((.LBB_END1_2-.LBB1_2)*65)+((.LBB_END1_0-.LBB1_0)*100))*16)+((((.LBB_END2_1-.LBB2_1)*16)+(((.LBB_END2_2-.LBB2_2)*83)+((.LBB_END2_0-.LBB2_0)*100)))*100) +; EXPR: .ascii "f3" +; EXPR: .uleb128 ((.LBB_END3_0-.LBB3_0)*100)*100 +; EXPR: .ascii "f4" +; EXPR: .uleb128 (((.LBB_END3_0-.LBB3_0)*100)*100)+(((.LBB_END4_0-.LBB4_0)*100)*100) +; EXPR: .ascii "f5" +; EXPR: .uleb128 (((.LBB_END3_0-.LBB3_0)*100)*40)+((((.LBB_END5_1-.LBB5_1)*40)+(((.LBB_END5_2-.LBB5_2)*60)+((.LBB_END5_0-.LBB5_0)*100)))*100) + +; Note: because we multiply by 100 both when computing internal IWS and call +; graph IWS, the values below should be seen as inflated by 10000 +; OBJ: f0,386914,23 +; OBJ: f2,212560,13 +; OBJ: f3,60000,2 +; OBJ: f4,140000,8 +; OBJ: f5,124000,6 + +; For reference, we expect the output to look like this - which should allow +; tracking the blocks and their size, to cross-check the OBJ labels. +; DUMP: : +; DUMP-NEXT: 0: +; DUMP: 6: {{.*}} jle +; DUMP-NEXT: 8: +; DUMP: e: {{.*}} retq +; DUMP-NEXT: f: +; DUMP: 15: {{.*}} retq +; DUMP-NEXT: 16: + +; DUMP: : +; DUMP-NEXT: 20: +; DUMP: 2b: {{.*}} jg +; DUMP-NEXT: 2d: +; DUMP: 35: {{.*}} retq +; DUMP-NEXT: 36: +; DUMP: 39: {{.*}} retq +; DUMP-NEXT: 3a: + +; DUMP: : +; DUMP-NEXT: 40: +; DUMP: 4b: {{.*}} jge +; DUMP-NEXT: 4d: +; DUMP: 50: {{.*}} retq +; DUMP-NEXT: 51: +; DUMP: 5c:{{.*}} retq +; DUMP-NEXT: 5d: + +; DUMP: : +; DUMP-NEXT: 60: +; DUMP-NEXT: 65: {{.*}} retq +; DUMP-NEXT: 66: + +; DUMP: : +; DUMP-NEXT: 70: +; DUMP-NEXT: 71: {{.*}} callq +; DUMP-NEXT: 76: {{.*}} popq +; DUMP-NEXT: 77: {{.*}} retq +; DUMP-NEXT: 78: + +; DUMP: : +; DUMP-NEXT: 80: +; DUMP: 83: {{.*}} jle +; DUMP-NEXT: 85: +; DUMP: 87: {{.*}} retq +; DUMP-NEXT: 88: +; DUMP: 8f: {{.*}} retq