diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -820,6 +820,50 @@
   mutable unsigned LastFn = 0;
   mutable unsigned Counter = ~0U;
 
+  // ASM printing is a function pass. So we won't have at any point in time
+  // all the functions available to extract size expressions, so we need to
+  // wait to doFinalize, but at that point, IR is missing. So we extract what
+  // we need in the function pass part.
+  // To that end, BlockDesc captures necessary basic block info: the expression
+  // evaluating its size; a measure of its latency; the list of statically-known
+  // calls to module-internal functions (identified by name); and the frequency
+  // relative to the function entry block.
+  struct BlockDesc {
+    BlockDesc(const MCExpr *SizeExp, size_t Latency,
+              std::vector<std::string> &&InternalCalls, float Freq)
+        : SizeExp(SizeExp), Latency(Latency), InternalCalls(InternalCalls),
+          Freq(Freq) {}
+
+    const MCExpr *const SizeExp;
+    const size_t Latency;
+    const std::vector<std::string> InternalCalls;
+    const float Freq;
+  };
+
+  /// The reward has 2 components: IWS - a measure of icache pressure; and
+  /// latency.
+  struct Reward {
+    Reward() = default;
+    Reward(const MCExpr *IWS, float Latency) : IWS(IWS), Latency(Latency) {}
+
+    const MCExpr *IWS = nullptr;
+    float Latency = 0.0f;
+  };
+
+  // BlockDescriptors stores, for each function (identified by name) the
+  // BlockDescs for its basic blocks.
+  // Since BlockDescs point to called functions by name, BlockDescriptors
+  // captures the static call graph (within a module).
+  std::map<std::string, std::vector<BlockDesc>> BlockDescriptors;
+  // This is the list of functions that may be called from outside the module.
+  // We want to compute rewards only relative to these.
+  std::vector<std::string> Entrypoints;
+
+  const Reward getFunctionReward(const std::string &FName);
+  const Reward getCGReward(const std::string &FName);
+
+  std::map<std::string, Reward> CalculatedRewards;
+
   /// This method emits the header for the current function.
   virtual void emitFunctionHeader();
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -40,6 +40,7 @@
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -127,6 +128,9 @@
 
 #define DEBUG_TYPE "asm-printer"
 
+// Calculating reward (iCache pressure & Latency) for MLGO
+static cl::opt<bool> CalculateReward("calc-reward", cl::init(false));
+
 const char DWARFGroupName[] = "dwarf";
 const char DWARFGroupDescription[] = "DWARF Emission";
 const char DbgTimerName[] = "emit";
@@ -426,6 +430,7 @@
   MachineFunctionPass::getAnalysisUsage(AU);
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   AU.addRequired<GCModuleInfo>();
+  AU.addRequired<MachineBlockFrequencyInfo>();
 }
 
 bool AsmPrinter::doInitialization(Module &M) {
@@ -1548,6 +1553,11 @@
 /// EmitFunctionBody - This method emits the body and trailer for a
 /// function.
 void AsmPrinter::emitFunctionBody() {
+  std::string FName = MF->getFunction().getName().str();
+  std::vector<BlockDesc> &BlockDescs = BlockDescriptors[FName];
+  if (!MF->getFunction().hasLocalLinkage())
+    Entrypoints.push_back(FName);
+
   emitFunctionHeader();
 
   // Emit target-specific gunk before the function body.
@@ -1681,10 +1691,30 @@
     // We must emit temporary symbol for the end of this basic block, if either
     // we have BBLabels enabled or if this basic blocks marks the end of a
     // section.
-    if (MF->hasBBLabels() ||
+    if (CalculateReward || MF->hasBBLabels() ||
         (MAI->hasDotTypeDotSizeDirective() && MBB.isEndSection()))
       OutStreamer->emitLabel(MBB.getEndSymbol());
 
+    if (CalculateReward) {
+      const MCExpr *Size = MCBinaryExpr::createSub(
+          MCSymbolRefExpr::create(MBB.getEndSymbol(), OutContext),
+          MCSymbolRefExpr::create(MBB.getSymbol(), OutContext), OutContext);
+      std::vector<std::string> Calls;
+      if (const auto *BB = MBB.getBasicBlock()) {
+        for (const auto &I : *BB)
+          if (const auto *CB = dyn_cast<CallBase>(&I))
+            if (const auto *CF = CB->getCalledFunction())
+              if (!CF->isDeclaration() && !CF->isIntrinsic())
+                Calls.push_back(CF->getName().str());
+
+        // FIXME: alternative way to estimate latency.
+        size_t BlockLatency = MBB.size();
+        BlockDescs.emplace_back(Size, BlockLatency, std::move(Calls),
+                                getAnalysis<MachineBlockFrequencyInfo>()
+                                    .getBlockFreqRelativeToEntryBlock(&MBB));
+      }
+    }
+
     if (MBB.isEndSection()) {
       // The size directive for the section containing the entry block is
       // handled separately by the function section.
@@ -2294,6 +2324,24 @@
   MMI = nullptr;
   AddrLabelSymbols = nullptr;
 
+  if (CalculateReward) {
+    // We output a succession of: function name, followed by a comma, followed
+    // by the reward, followed by a comma, and then the next function, etc.
+    auto *DS = OutStreamer->getContext().getELFNamedSection(
+        ".llvm_block_data", "", ELF::SHT_NOTE, ELF::SHF_STRINGS);
+    OutStreamer->switchSection(DS);
+    bool OldUseForParsing = OutStreamer->getUseAssemblerInfoForParsing();
+    OutStreamer->setUseAssemblerInfoForParsing(true);
+    for (const auto &E : Entrypoints) {
+      const auto R = getCGReward(E);
+      OutStreamer->emitBytes(E);
+      OutStreamer->emitBytes(",");
+      OutStreamer->emitULEB128Value(R.IWS);
+      OutStreamer->emitULEB128IntValue(static_cast<int64_t>(R.Latency));
+    }
+    OutStreamer->setUseAssemblerInfoForParsing(OldUseForParsing);
+  }
+
   OutStreamer->finish();
   OutStreamer->reset();
   OwnedMLI.reset();
@@ -2302,6 +2350,100 @@
   return false;
 }
 
+// Outputting float MCExpr values is not supported, so, instead, we convert
+// float probabilities to an integer.
+int64_t getIntegralProbability(float P) {
+  return static_cast<int64_t>(100.0 * P);
+}
+
+// For a call graph starting at FName, for each function in the transitive
+// closure, we add its IWS weighed by the probability of its execution.
+// The entrypoint function (FName) has probability 1 (by convention).
+// Other functions have their probability as the sum of probabilities of paths
+// reaching them.
+// The probability along a path is calculated by multiplying the probabilities
+// of call sites.
+const AsmPrinter::Reward AsmPrinter::getCGReward(const std::string &FName) {
+  std::map<std::string, float> FunctionFrequencies;
+
+  // We perform a depth-first traversal and accummulate a function's probability
+  // to be executed.
+  std::set<std::string> CurrentPath;
+  struct Workitem {
+    // Name of function to work on.
+    std::string Name;
+    // The frequency, on the current path, of executing the function.
+
+    float Freq = 0.0;
+    // Indication that we explored the current path below here, and we should
+    // be able to reconsider this function if we arrive to it via some other
+    // path.
+    bool RemoveFromPath = false;
+  };
+
+  std::stack<Workitem> Work;
+  Work.push({FName, 1.0, false});
+
+  while (!Work.empty()) {
+    auto &WI = Work.top();
+    if (WI.RemoveFromPath) {
+      CurrentPath.erase(WI.Name);
+      Work.pop();
+      continue;
+    }
+    WI.RemoveFromPath = true;
+    CurrentPath.insert(WI.Name);
+    FunctionFrequencies[WI.Name] += WI.Freq;
+    for (auto &B : BlockDescriptors[WI.Name])
+      for (auto &C : B.InternalCalls)
+        if (CurrentPath.find(C) == CurrentPath.end())
+          Work.push({C, B.Freq * WI.Freq, false});
+  }
+  // Now aggregate the CG IWS and latency.
+  const MCExpr *RetIWS = nullptr;
+  float Latency = 0.0;
+  for (const auto &P : FunctionFrequencies) {
+    if (P.second == 0)
+      continue;
+    auto FR = getFunctionReward(P.first);
+    int64_t IntegralProb = getIntegralProbability(std::min(1.0f, P.second));
+    const MCExpr *WFIWS = MCBinaryExpr::createMul(
+        FR.IWS, MCConstantExpr::create(IntegralProb, OutContext), OutContext);
+    Latency += FR.Latency * P.second;
+    if (!RetIWS)
+      RetIWS = WFIWS;
+    else
+      RetIWS = MCBinaryExpr::createAdd(RetIWS, WFIWS, OutContext);
+  }
+  return Reward(RetIWS, Latency);
+}
+
+// For a function, we just traverse the BBs and add their size weighed by their
+// probability to be executed.
+const AsmPrinter::Reward
+AsmPrinter::getFunctionReward(const std::string &FName) {
+  auto I = CalculatedRewards.insert(std::make_pair(FName, Reward()));
+  if (!I.second)
+    return I.first->second;
+
+  float Latency = 0.0;
+  const MCExpr *Prob = nullptr;
+  for (const auto &BD : BlockDescriptors[FName]) {
+    float BlockLoadProb = std::min(BD.Freq, 1.0f);
+    int64_t IntegralProb = getIntegralProbability(BlockLoadProb);
+    const auto *BP = MCBinaryExpr::createMul(
+        BD.SizeExp, MCConstantExpr::create(IntegralProb, OutContext),
+        OutContext);
+    Latency += BD.Latency * BD.Freq;
+    if (!Prob)
+      Prob = BP;
+    else
+      Prob = MCBinaryExpr::createAdd(BP, Prob, OutContext);
+  }
+  I.first->second = AsmPrinter::Reward(Prob, Latency);
+  return I.first->second;
+}
+
 MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) {
   auto Res = MBBSectionExceptionSyms.try_emplace(MBB.getSectionIDNum());
   if (Res.second)
@@ -3757,6 +3899,10 @@
 
 bool AsmPrinter::shouldEmitLabelForBasicBlock(
     const MachineBasicBlock &MBB) const {
+  // If we calculate the reward, we want labels for each basic block,
+  // so we may evaluate the basic blocks' size.
+  if (CalculateReward)
+    return true;
   // With `-fbasic-block-sections=`, a label is needed for every non-entry block
   // in the labels mode (option `=labels`) and every section beginning in the
   // sections mode (`=all` and `=list=`).
diff --git a/llvm/test/Transforms/Inline/ML/Inputs/parse_reward.py b/llvm/test/Transforms/Inline/ML/Inputs/parse_reward.py
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/Inputs/parse_reward.py
@@ -0,0 +1,34 @@
+import sys
+
+def decode_uleb(bits):
+  result = 0
+  shift = 0
+  while True:
+    byte = bits.pop(0)
+    result |= (byte & 0x7f) << shift
+    shift += 7
+    if byte & 0x80 == 0:
+      break
+  return result
+
+def get_name(int_arr):
+  ret = ''
+  for ival in int_arr:
+    ret += chr(ival)
+  return ret
+
+def parse_one_function(data):
+  term_char = ord(',')
+  sep = data.index(term_char)
+  name = get_name(data[:sep])
+  data = data[sep+1:]
+  iws = decode_uleb(data)
+  latency = decode_uleb(data)
+  print('{},{},{}'.format(name, iws, latency))
+  return data
+
+
+with open(sys.argv[1], 'rb') as f:
+  data = [ord(x) if int(sys.version[0]) < 3 else x for x in f.read()]
+  while len(data) > 0:
+    data = parse_one_function(data)
diff --git a/llvm/test/Transforms/Inline/ML/reward-o3.ll b/llvm/test/Transforms/Inline/ML/reward-o3.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/reward-o3.ll
@@ -0,0 +1,151 @@
+; RUN: llc -calc-reward --filetype=asm < %s | FileCheck -check-prefix=EXPR %s
+
+; RUN: llc -calc-reward --filetype=obj -o %t.o %s
+; RUN: llvm-objcopy --dump-section=.llvm_block_data.=%t.data %t.o /dev/null
+; RUN: llvm-objdump -d %t.o | FileCheck -check-prefix=DUMP %s
+; RUN: %python %p/Inputs/parse_reward.py %t.data | FileCheck -check-prefix=OBJ %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+declare void @ext();
+
+define i32 @f0(i32 %a1) {
+  %add = add i32 %a1, 1
+  %cond = icmp sle i32 %add, 1
+  br i1 %cond, label %yes, label %no, !prof !1
+yes:
+  %a2 = call i32 @f1(i32 %add)
+  br label %exit
+no:
+  %a3 = call i32 @f2(i32 %add)
+  br label %exit
+exit:
+  %ret = phi i32 [%a2, %yes], [%a3, %no]
+  ret i32 %ret
+}
+
+define internal i32 @f1(i32 %c1) {
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false, !prof !2
+
+cond_false:
+  br label %exit
+
+cond_true:
+  %c11 = call i32 @f2(i32 %c1)
+  br label %exit
+exit:
+  %c12 = phi i32 [ 0, %cond_false], [ %c11, %cond_true ]
+  ret i32 %c12
+}
+
+define i32 @f2(i32 %c1) {
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false, !prof !3
+
+cond_false:
+  %ret = call i32 @f1(i32 2)
+  ret i32 %ret
+
+ cond_true:
+  ret i32 0
+}
+
+define i32 @f3() {
+  ret i32 1
+}
+
+define i32 @f4() {
+  %x = call i32 @f3()
+  br label %exit
+exit:
+  ret i32 %x
+}
+
+define i32 @f5(i32 %i) {
+  %cond = icmp sle i32 %i, 1
+  br i1 %cond, label %cond_true, label %cond_false, !prof !4
+cond_true:
+  %ret = call i32 @f3()
+  ret i32 %ret
+cond_false:
+  ret i32 0
+}
+
+!1 = !{!"branch_weights", i32 1, i32 3}
+!2 = !{!"branch_weights", i32 2, i32 1}
+!3 = !{!"branch_weights", i32 5, i32 1}
+!4 = !{!"branch_weights", i32 2, i32 3}
+
+; the labels' first index matches the function, and the second index the block.
+; for example, .LBB0_1 is the second block in f0 ("yes")
+; EXPR: .ascii "f0"
+; EXPR: .uleb128    (((((.LBB_END0_1-.LBB0_1)*25)+(((.LBB_END0_2-.LBB0_2)*74)+((.LBB_END0_0-.LBB0_0)*100)))*100)+((((.LBB_END1_2-.LBB1_2)*65)+((.LBB_END1_0-.LBB1_0)*100))*38))+((((.LBB_END2_1-.LBB2_1)*16)+(((.LBB_END2_2-.LBB2_2)*83)+((.LBB_END2_0-.LBB2_0)*100)))*91)
+; EXPR: .ascii "f2"
+; EXPR: .uleb128   ((((.LBB_END1_2-.LBB1_2)*65)+((.LBB_END1_0-.LBB1_0)*100))*16)+((((.LBB_END2_1-.LBB2_1)*16)+(((.LBB_END2_2-.LBB2_2)*83)+((.LBB_END2_0-.LBB2_0)*100)))*100)
+; EXPR: .ascii "f3"
+; EXPR: .uleb128 ((.LBB_END3_0-.LBB3_0)*100)*100
+; EXPR: .ascii "f4"
+; EXPR: .uleb128 (((.LBB_END3_0-.LBB3_0)*100)*100)+(((.LBB_END4_0-.LBB4_0)*100)*100)
+; EXPR: .ascii "f5"
+; EXPR: .uleb128 (((.LBB_END3_0-.LBB3_0)*100)*40)+((((.LBB_END5_1-.LBB5_1)*40)+(((.LBB_END5_2-.LBB5_2)*60)+((.LBB_END5_0-.LBB5_0)*100)))*100)
+
+; Note: because we multiply by 100 both when computing internal IWS and call
+; graph IWS, the values below should be seen as inflated by 10000
+; OBJ: f0,386914,23
+; OBJ: f2,212560,13
+; OBJ: f3,60000,2
+; OBJ: f4,140000,8
+; OBJ: f5,124000,6
+
+; For reference, we expect the output to look like this - which should allow
+; tracking the blocks and their size, to cross-check the OBJ labels.
+; DUMP: <f0>:
+; DUMP-NEXT:         0:
+; DUMP:              6: {{.*}} jle
+; DUMP-NEXT:         8:
+; DUMP:              e: {{.*}} retq
+; DUMP-NEXT:         f:
+; DUMP:             15: {{.*}} retq
+; DUMP-NEXT:        16:
+
+; DUMP: <f1>:
+; DUMP-NEXT:        20:
+; DUMP:             2b: {{.*}} jg
+; DUMP-NEXT:        2d:
+; DUMP:             35: {{.*}} retq
+; DUMP-NEXT:        36:
+; DUMP:             39: {{.*}} retq
+; DUMP-NEXT:        3a:
+
+; DUMP: <f2>:
+; DUMP-NEXT:        40:
+; DUMP:             4b: {{.*}} jge
+; DUMP-NEXT:        4d:
+; DUMP:             50: {{.*}} retq
+; DUMP-NEXT:        51:
+; DUMP:             5c:{{.*}} retq
+; DUMP-NEXT:        5d:
+
+; DUMP: <f3>:
+; DUMP-NEXT:        60:
+; DUMP-NEXT:        65: {{.*}} retq
+; DUMP-NEXT:        66:
+
+; DUMP: <f4>:
+; DUMP-NEXT:        70:
+; DUMP-NEXT:        71: {{.*}} callq
+; DUMP-NEXT:        76: {{.*}} popq
+; DUMP-NEXT:        77: {{.*}} retq
+; DUMP-NEXT:        78:
+
+; DUMP: <f5>:
+; DUMP-NEXT:        80:
+; DUMP:             83: {{.*}} jle
+; DUMP-NEXT:        85:
+; DUMP:             87: {{.*}} retq
+; DUMP-NEXT:        88:
+; DUMP:             8f: {{.*}} retq