diff --git a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
--- a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
+++ b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
@@ -99,6 +99,10 @@
 ImmutablePass *
 createImmutableModuleSummaryIndexWrapperPass(const ModuleSummaryIndex *Index);
 
+/// Returns true if the instruction could have memprof metadata, used to ensure
+/// consistency between summary analysis and the ThinLTO backend processing.
+bool mayHaveMemprofSummary(const CallBase *CB);
+
 } // end namespace llvm
 
 #endif // LLVM_ANALYSIS_MODULESUMMARYANALYSIS_H
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1564,6 +1564,11 @@
     Attrs = Attrs.removeFnAttribute(getContext(), Kind);
   }
 
+  /// Removes the attribute from the function
+  void removeFnAttr(StringRef Kind) {
+    Attrs = Attrs.removeFnAttribute(getContext(), Kind);
+  }
+
   /// Removes the attribute from the return value
   void removeRetAttr(Attribute::AttrKind Kind) {
     Attrs = Attrs.removeRetAttribute(getContext(), Kind);
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1300,6 +1300,10 @@
   /// Indicates that summary-based synthetic entry count propagation has run
   bool HasSyntheticEntryCounts = false;
 
+  /// Indicates that summary-based profile guided heap optimization context
+  /// disambiguation has run.
+  bool WithPGHOContextDisambiguation = false;
+
   /// Indicates that distributed backend should skip compilation of the
   /// module. Flag is suppose to be set by distributed ThinLTO indexing
   /// when it detected that the module is not needed during the final
@@ -1503,6 +1507,13 @@
   bool hasSyntheticEntryCounts() const { return HasSyntheticEntryCounts; }
   void setHasSyntheticEntryCounts() { HasSyntheticEntryCounts = true; }
 
+  bool withPGHOContextDisambiguation() const {
+    return WithPGHOContextDisambiguation;
+  }
+  void setWithPGHOContextDisambiguation() {
+    WithPGHOContextDisambiguation = true;
+  }
+
   bool skipModuleByDistributedBackend() const {
     return SkipModuleByDistributedBackend;
   }
diff --git a/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h
--- a/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/PGHOContextDisambiguation.h
@@ -17,22 +17,36 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/PassManager.h"
 #include <functional>
 
 namespace llvm {
 class GlobalValueSummary;
 class Module;
-class ModuleSummaryIndex;
+class OptimizationRemarkEmitter;
 
 class PGHOContextDisambiguation
     : public PassInfoMixin<PGHOContextDisambiguation> {
   /// Run the context disambiguator on \p TheModule, returns true if any
   /// changes were made.
-  bool processModule(Module &M);
+  bool processModule(
+      Module &M,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
+
+  /// In the ThinLTO backend, apply the cloning decisions in ImportSummary to
+  /// the IR.
+  bool applyImport(Module &M);
+
+  /// Import summary containing cloning decisions for the ThinLTO backend.
+  const ModuleSummaryIndex *ImportSummary;
+
+  // Owns the import summary specified by internal options for testing the
+  // ThinLTO backend via opt (to simulate distributed ThinLTO).
+  std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;
 
 public:
-  PGHOContextDisambiguation() {}
+  PGHOContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -282,6 +282,10 @@
   std::vector<CallsiteInfo> Callsites;
   std::vector<AllocInfo> Allocs;
 
+#ifndef NDEBUG
+  DenseSet<const CallBase *> CallsThatMayHaveMemprofSummary;
+#endif
+
   bool HasInlineAsmMaybeReferencingInternal = false;
   bool HasIndirBranchToBlockAddress = false;
   bool HasUnknownCall = false;
@@ -425,6 +429,10 @@
               .updateHotness(getHotness(Candidate.Count, PSI));
       }
 
+      // Summarize memprof related metadata. This is only needed for ThinLTO.
+      if (!IsThinLTO)
+        continue;
+
       // TODO: Skip indirect calls for now. Need to handle these better, likely
       // by creating multiple Callsites, one per target, then speculatively
       // devirtualize while applying clone info in the ThinLTO backends. This
@@ -435,6 +443,14 @@
       if (!CalledFunction)
         continue;
 
+      // Ensure we keep this analysis in sync with the handling in the ThinLTO
+      // backend (see PGHOContextDisambiguation::applyImport). Save this call
+      // so that we can skip it in checking the reverse case later.
+      assert(mayHaveMemprofSummary(CB));
+#ifndef NDEBUG
+      CallsThatMayHaveMemprofSummary.insert(CB);
+#endif
+
       // Compute the list of stack ids first (so we can trim them from the stack
       // ids on any MIBs).
      CallStack<MDNode, MDNode::op_iterator> InstCallsite(
@@ -542,6 +558,25 @@
                          ? CalleeInfo::HotnessType::Cold
                          : CalleeInfo::HotnessType::Critical);
 
+#ifndef NDEBUG
+  // Make sure that all calls we decided could not have memprof summaries get a
+  // false value from mayHaveMemprofSummary, to ensure that this handling
+  // remains in sync with the ThinLTO backend handling.
+  if (IsThinLTO) {
+    for (const BasicBlock &BB : F) {
+      for (const Instruction &I : BB) {
+        const auto *CB = dyn_cast<CallBase>(&I);
+        if (!CB)
+          continue;
+        // We already checked these above.
+        if (CallsThatMayHaveMemprofSummary.count(CB))
+          continue;
+        assert(!mayHaveMemprofSummary(CB));
+      }
+    }
+  }
+#endif
+
   bool NonRenamableLocal = isNonRenamableLocal(F);
   bool NotEligibleForImport =
       NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
@@ -1033,3 +1068,36 @@
 INITIALIZE_PASS(ImmutableModuleSummaryIndexWrapperPass, "module-summary-info",
                 "Module summary info", false, true)
+
+bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
+  if (!CB)
+    return false;
+  if (CB->isDebugOrPseudoInst())
+    return false;
+  auto *CI = dyn_cast<CallInst>(CB);
+  auto *CalledValue = CB->getCalledOperand();
+  auto *CalledFunction = CB->getCalledFunction();
+  if (CalledValue && !CalledFunction) {
+    CalledValue = CalledValue->stripPointerCasts();
+    // Stripping pointer casts can reveal a called function.
+    CalledFunction = dyn_cast<Function>(CalledValue);
+  }
+  // Check if this is an alias to a function. If so, get the
+  // called aliasee for the checks below.
+  if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
+    assert(!CalledFunction &&
+           "Expected null called function in callsite for alias");
+    CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
+  }
+  // Check if this is a direct call to a known function or a known
+  // intrinsic, or an indirect call with profile data.
+  if (CalledFunction) {
+    if (CI && CalledFunction->isIntrinsic())
+      return false;
+  } else {
+    // TODO: For now skip indirect calls. See comments in
+    // computeFunctionSummary for what is needed to handle this.
+ return false; + } + return true; +} diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8018,7 +8018,7 @@ case bitc::FS_FLAGS: { // [flags] uint64_t Flags = Record[0]; // Scan flags. - assert(Flags <= 0xff && "Unexpected bits in flag"); + assert(Flags <= 0x1ff && "Unexpected bits in flag"); return Flags & 0x8; } diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -107,11 +107,13 @@ Flags |= 0x40; if (withWholeProgramVisibility()) Flags |= 0x80; + if (withPGHOContextDisambiguation()) + Flags |= 0x100; return Flags; } void ModuleSummaryIndex::setFlags(uint64_t Flags) { - assert(Flags <= 0xff && "Unexpected bits in flag"); + assert(Flags <= 0x1ff && "Unexpected bits in flag"); // 1 bit: WithGlobalValueDeadStripping flag. // Set on combined index only. if (Flags & 0x1) @@ -145,6 +147,10 @@ // Set on combined index only. if (Flags & 0x80) setWithWholeProgramVisibility(); + // 1 bit: WithPGHOContextDisambiguation flag. + // Set on combined index only. + if (Flags & 0x100) + setWithPGHOContextDisambiguation(); } // Collect for the given module the list of function it defines diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1506,6 +1506,11 @@ MPM.addPass(Annotation2MetadataPass()); if (ImportSummary) { + // For ThinLTO we must apply the context disambiguation decisions early, to + // ensure we can correctly match the callsites to summary data. + if (EnablePGHOContextDisambiguation) + MPM.addPass(PGHOContextDisambiguation(ImportSummary)); + // These passes import type identifier resolutions for whole-program // devirtualization and CFI. 
They must run early because other passes may
    // disturb the specific instruction patterns that these passes look for,
diff --git a/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp
--- a/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/PGHOContextDisambiguation.cpp
@@ -27,8 +27,11 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -38,13 +41,40 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include <sstream>
 #include <vector>
 
 using namespace llvm;
 using namespace llvm::memprof;
+using namespace ore;
 
 #define DEBUG_TYPE "pgho-context-disambiguation"
 
+STATISTIC(FunctionClonesAnalysis,
+          "Number of function clones created during whole program analysis");
+STATISTIC(FunctionClonesThinBackend,
+          "Number of function clones created during ThinLTO backend");
+STATISTIC(FunctionsClonedThinBackend,
+          "Number of functions that had clones created during ThinLTO backend");
+STATISTIC(AllocTypeNotCold,
+          "Number of not cold static allocations (possibly cloned)");
+STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned)");
+STATISTIC(AllocTypeNotColdThinBackend,
+          "Number of not cold static allocations (possibly cloned) during "
+          "ThinLTO backend");
+STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
+                                    "(possibly cloned) during ThinLTO backend");
+STATISTIC(OrigAllocsThinBackend,
+          "Number of original (not cloned) allocations with memprof profiles "
+          "during ThinLTO backend");
+STATISTIC(
+    AllocVersionsThinBackend,
+    "Number of allocation versions (including clones) during ThinLTO backend");
+STATISTIC(MaxAllocVersionsThinBackend,
+          "Maximum number of allocation versions created for an original "
+          "allocation during ThinLTO backend");
+STATISTIC(UnclonableAllocsThinBackend,
+          "Number of unclonable ambiguous allocations during ThinLTO backend");
 
 static cl::opt<std::string> DotFilePathPrefix(
    "pgho-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -65,6 +95,11 @@
     VerifyNodes("pgho-verify-nodes", cl::init(false), cl::Hidden,
                 cl::desc("Perform frequent verification checks on nodes."));
 
+static cl::opt<std::string> PGHOImportSummary(
+    "pgho-import-summary",
+    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
+    cl::Hidden);
+
 inline bool hasSingleAllocType(uint8_t AllocTypes) {
   switch (AllocTypes) {
   case (uint8_t)AllocationType::Cold:
@@ -111,6 +146,8 @@
   /// behavior of an allocation based on its context.
   void identifyClones();
 
+  bool assignFunctions();
+
   void dump() const;
   void print(raw_ostream &OS) const;
 
@@ -361,6 +398,28 @@
     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
   }
 
+  /// Update the allocation call to record type of allocated memory.
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
+    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
+    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
+  }
+
+  /// Update non-allocation call to invoke (possibly cloned) function
+  /// CalleeFunc.
+ void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { + static_cast(this)->updateCall(CallerCall, CalleeFunc); + } + + /// Clone the given function for the given callsite, recording mapping of all + /// of the functions tracked calls to their new versions in the CallMap. + /// Assigns new clones to clone number CloneNo. + FuncInfo cloneFunctionForCallsite( + FuncInfo &Func, CallInfo &Call, std::map &CallMap, + std::vector &CallsWithMetadataInFunc, unsigned CloneNo) { + return static_cast(this)->cloneFunctionForCallsite( + Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); + } + /// Gets a label to use in the dot graph for the given call clone in the given /// function. std::string getLabel(const FuncTy *Func, const CallTy Call, @@ -453,7 +512,9 @@ : public CallsiteContextGraph { public: - ModuleCallsiteContextGraph(Module &M); + ModuleCallsiteContextGraph( + Module &M, + function_ref OREGetter); private: friend CallsiteContextGraph getStackIdsWithContextNodesForCall(Instruction *Call); + void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); + CallsiteContextGraph::FuncInfo + cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, + std::map &CallMap, + std::vector &CallsWithMetadataInFunc, + unsigned CloneNo); std::string getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const; const Module &Mod; + function_ref OREGetter; }; /// Represents a call in the summary index graph, which can either be an @@ -509,6 +579,14 @@ bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func); uint64_t getLastStackId(IndexCall &Call); std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); + void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); + CallsiteContextGraph::FuncInfo + cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, + std::map &CallMap, + std::vector &CallsWithMetadataInFunc, + unsigned CloneNo); std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, unsigned CloneNo) const; @@ -1187,7 +1265,9 @@ return StackIds; } -ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) { +ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( + Module &M, function_ref OREGetter) + : Mod(M), OREGetter(OREGetter) { for (auto &F : M) { for (auto &BB : F) { for (auto &I : BB) { @@ -2017,6 +2097,874 @@ checkNode(Node, /*CheckEdges=*/true); } +static std::string getAllocTypeAttributeString(AllocationType Type) { + switch (Type) { + case AllocationType::NotCold: + return "notcold"; + break; + case AllocationType::Cold: + return "cold"; + break; + default: + dbgs() << "Unexpected alloc type " << (uint8_t)Type; + assert(false); + } + llvm_unreachable("invalid alloc type"); +} + +void ModuleCallsiteContextGraph::updateAllocationCall( + CallInfo &Call, AllocationType AllocType) { + std::string AllocTypeString = getAllocTypeAttributeString(AllocType); + auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), + "memprof", AllocTypeString); + cast(Call.call())->addFnAttr(A); + OREGetter(Call.call()->getFunction()) + .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) + << NV("AllocationCall", Call.call()) << " in clone " + << NV("Caller", Call.call()->getFunction()) + << " marked with memprof allocation attribute " + << NV("Attribute", AllocTypeString)); +} + +void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, + 
+                                                 AllocationType AllocType) {
+  auto *AI = Call.call().dyn_cast<AllocInfo *>();
+  assert(AI);
+  assert(AI->Versions.size() > Call.cloneNo());
+  AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
+}
+
+void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                            FuncInfo CalleeFunc) {
+  if (CalleeFunc.cloneNo() > 0)
+    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
+  OREGetter(CallerCall.call()->getFunction())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
+            << NV("Call", CallerCall.call()) << " in clone "
+            << NV("Caller", CallerCall.call()->getFunction())
+            << " assigned to call function clone "
+            << NV("Callee", CalleeFunc.func()));
+}
+
+void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                           FuncInfo CalleeFunc) {
+  auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
+  // Caller cannot be an allocation.
+  assert(CI);
+  assert(CI->Clones.size() > CallerCall.cloneNo());
+  CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+}
+
+CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+                     Instruction *>::FuncInfo
+ModuleCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Use existing LLVM facilities for cloning and obtaining Call in clone.
+  ValueToValueMapTy VMap;
+  auto *NewFunc = CloneFunction(Func.func(), VMap);
+  std::string Name = getPGHOFuncName(Func.func()->getName(), CloneNo);
+  assert(!Func.func()->getParent()->getFunction(Name));
+  NewFunc->setName(Name);
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
+  }
+  OREGetter(Func.func())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
+            << "created clone " << NV("NewFunction", NewFunc));
+  return {NewFunc, CloneNo};
+}
+
+CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                     IndexCall>::FuncInfo
+IndexCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Check how many clones we have of Call (and therefore function).
+  // The next clone number is the current size of versions array.
+  // Confirm this matches the CloneNo provided by the caller, which is based on
+  // the number of function clones we have.
+  assert(CloneNo ==
+         (Call.call().is<AllocInfo *>()
+              ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
+              : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
+  // Walk all the instructions in this function. Create a new version for
+  // each (by adding an entry to the Versions/Clones summary array), and copy
+  // over the version being called for the function clone being cloned here.
+  // Additionally, add an entry to the CallMap for the new function clone,
+  // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
+  // to the new call clone.
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
+      assert(AI->Versions.size() == CloneNo);
+      // We assign the allocation type later (in updateAllocationCall), just
+      // add an entry for it here.
+      AI->Versions.push_back(0);
+    } else {
+      auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
+      assert(CI && CI->Clones.size() == CloneNo);
+      // We assign the clone number later (in updateCall), just add an entry
+      // for it here.
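+      // For illustration (hypothetical values): an allocation that ends up
+      // with two versions is encoded in the summary as
+      //   Versions = (notcold, cold)
+      // and a callsite whose clone 1 should call callee clone 1 as
+      //   Clones = (0, 1)
+      // which is the encoding the DISTRIB checks in the tests below verify,
+      // e.g. "allocs: ((versions: (notcold, cold)" and "clones: (0, 1)".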
+ CI->Clones.push_back(0); + } + CallMap[Inst] = {Inst.call(), CloneNo}; + } + return {Func.func(), CloneNo}; +} + +template +bool CallsiteContextGraph::assignFunctions() { + bool Changed = false; + + // Keep track of the assignment of nodes (callsites) to function clones they + // call. + std::map CallsiteToCalleeFuncCloneMap; + + // Update caller node to call function version CalleeFunc, by recording the + // assignment in CallsiteToCalleeFuncCloneMap. + auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, + const FuncInfo &CalleeFunc) { + CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; + }; + + // Walk all functions for which we saw calls with memprof metadata, and handle + // cloning for each of its calls. + for (auto &FuncEntry : FuncToCallsWithMetadata) { + FuncInfo OrigFunc(FuncEntry.first); + // Map from each clone of OrigFunc to a map of remappings of each call of + // interest (from original uncloned call to the corresponding cloned call in + // that function clone). + std::map> FuncClonesToCallMap; + for (auto Call : FuncEntry.second) { + ContextNode *Node = getNodeForInst(Call); + // Skip call if we do not have a node for it (all uses of its stack ids + // were either on inlined chains or pruned from the MIBs), or if we did + // not create any clones for it. + if (!Node || Node->Clones.empty()) + continue; + // Not having a call should have prevented cloning. + assert(Node->hasCall()); + + // Track the assignment of function clones to clones of the current + // callsite Node being handled. + std::map FuncCloneToCurNodeCloneMap; + + // Assign callsite version CallsiteClone to function version FuncClone, + // and also assign (possibly cloned) Call to CallsiteClone. + auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, + CallInfo &Call, + ContextNode *CallsiteClone, + bool IsAlloc) { + // Record the clone of callsite node assigned to this function clone. + FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; + + assert(FuncClonesToCallMap.count(FuncClone)); + std::map &CallMap = FuncClonesToCallMap[FuncClone]; + CallInfo CallClone(Call); + if (CallMap.count(Call)) + CallClone = CallMap[Call]; + CallsiteClone->setCall(CallClone); + }; + + // Keep track of the clones of callsite Node that need to be assigned to + // function clones. This list may be expanded in the loop body below if we + // find additional cloning is required. + std::vector Clones(Node->Clones); + // Ignore original Node if we moved all of its contexts to clones. + if (!Node->ContextIds.empty()) + Clones.insert(Clones.begin(), Node); + + // Now walk through all of the clones of this callsite Node that we need, + // and determine the assignment to a corresponding clone of the current + // function (creating new function clones as needed). + for (unsigned I = 0; I < Clones.size(); I++) { + ContextNode *Clone = Clones[I]; + if (VerifyNodes) + checkNode(Clone, /*CheckEdges=*/true); + + // Need to create a new function clone if we have more callsite clones + // than existing function clones, which would have been assigned to an + // earlier clone in the list (we assign callsite clones to function + // clones greedily). + if (FuncClonesToCallMap.size() <= I) { + // If this is the first callsite copy, assign to original function. + if (I == 0) { + // Since FuncClonesToCallMap is empty in this case, no clones have + // been created for this function yet, and no callers should have + // been assigned a function clone for this callee node yet. 
+ assert(llvm::none_of(Clone->CallerEdges, [&](ContextEdge *E) { + return CallsiteToCalleeFuncCloneMap.count(E->Caller); + })); + // Initialize with empty call map, assign Clone to original function + // and its callers, and skip to the next clone. + FuncClonesToCallMap[OrigFunc] = {}; + AssignCallsiteCloneToFuncClone( + OrigFunc, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + for (auto CE : Clone->CallerEdges) + RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); + continue; + } + + // First locate which copy of OrigFunc to clone again. If a caller + // of this callsite clone was already assigned to call a particular + // function clone, we need to redirect all of those callers to the + // new function clone, and update their other callees within this + // function. + FuncInfo PreviousAssignedFuncClone; + auto EI = llvm::find_if(Clone->CallerEdges, [&](ContextEdge *E) { + return CallsiteToCalleeFuncCloneMap.count(E->Caller); + }); + bool CallerAssignedToCloneOfFunc = false; + if (EI != Clone->CallerEdges.end()) { + ContextEdge *Edge = *EI; + PreviousAssignedFuncClone = + CallsiteToCalleeFuncCloneMap[Edge->Caller]; + CallerAssignedToCloneOfFunc = true; + } + + // Clone function and save it along with the CallInfo map created + // during cloning in the FuncClonesToCallMap. + std::map NewCallMap; + unsigned CloneNo = FuncClonesToCallMap.size(); + // Clone 0 is the original function, which should already exist in the + // map. + assert(CloneNo > 0); + FuncInfo NewFuncClone = cloneFunctionForCallsite( + OrigFunc, Call, NewCallMap, FuncEntry.second, CloneNo); + FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); + FunctionClonesAnalysis++; + Changed = true; + + // If no caller callsites were already assigned to a clone of this + // function, we can simply assign this clone to the new func clone + // and update all callers to it, then skip to the next clone. + if (!CallerAssignedToCloneOfFunc) { + AssignCallsiteCloneToFuncClone( + NewFuncClone, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + for (auto CE : Clone->CallerEdges) + RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); + continue; + } + + // We may need to do additional node cloning in this case. + // Reset the CallsiteToCalleeFuncCloneMap entry for any callers + // that were previously assigned to call PreviousAssignedFuncClone, + // to record that they now call NewFuncClone. + for (auto CE : Clone->CallerEdges) { + if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || + // We subsequently fall through to later handling that + // will perform any additional cloning required for + // callers that were calling other function clones. + CallsiteToCalleeFuncCloneMap[CE->Caller] != + PreviousAssignedFuncClone) + continue; + + RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); + + // If we are cloning a function that was already assigned to some + // callers, then essentially we are creating new callsite clones + // of the other callsites in that function that are reached by those + // callers. Clone the other callees of the current callsite's caller + // that were already assigned to PreviousAssignedFuncClone + // accordingly. This is important since we subsequently update the + // calls from the nodes in the graph and their assignments to callee + // functions recorded in CallsiteToCalleeFuncCloneMap. + for (auto CalleeEdge : CE->Caller->CalleeEdges) { + ContextNode *Callee = CalleeEdge->Callee; + // Skip the current callsite, we are looking for other + // callsites Caller calls. 
+ if (Callee == Clone) + continue; + if (!Callee->hasCall()) + continue; + // Skip any that have been removed on an earlier iteration when + // cleaning up newly None type callee edges. + if (CalleeEdge->Callee == nullptr && + CalleeEdge->Caller == nullptr) { + assert(RemovedEdges.count(CalleeEdge)); + continue; + } + ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); + removeNoneTypeCalleeEdges(NewClone); + // Moving the edge may have resulted in some none type + // callee edges on the original Callee. + removeNoneTypeCalleeEdges(Callee); + assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); + // If the Callee node was already assigned to call a specific + // function version, make sure its new clone is assigned to call + // that same function clone. + if (CallsiteToCalleeFuncCloneMap.count(Callee)) + RecordCalleeFuncOfCallsite( + NewClone, CallsiteToCalleeFuncCloneMap[Callee]); + // Update NewClone with the new Call clone of this callsite's Call + // created for the new function clone created earlier. + // Recall that we have already ensured when building the graph + // that each caller can only call callsites within the same + // function, so we are guaranteed that Callee Call is in the + // current OrigFunc. + // CallMap is set up as indexed by original Call at clone 0. + CallInfo OrigCall(Callee->getOrigNode()->Call); + OrigCall.setCloneNo(0); + std::map &CallMap = + FuncClonesToCallMap[NewFuncClone]; + assert(CallMap.count(OrigCall)); + CallInfo NewCall(CallMap[OrigCall]); + assert(NewCall); + NewClone->setCall(NewCall); + } + } + // Fall through to handling below to perform the recording of the + // function for this callsite clone. This enables handling of cases + // where the callers were assigned to different clones of a function. + } + + // See if we can use existing function clone. Walk through + // all caller edges to see if any have already been assigned to + // a clone of this callsite's function. If we can use it, do so. If not, + // because that function clone is already assigned to a different clone + // of this callsite, then we need to clone again. + // Basically, this checking is needed to handle the case where different + // caller functions/callsites may need versions of this function + // containing different mixes of callsite clones across the different + // callsites within the function. If that happens, we need to create + // additional function clones to handle the various combinations. + // + // Keep track of any new clones of this callsite created by the + // following loop, as well as any existing clone that we decided to + // assign this clone to. + std::map FuncCloneToNewCallsiteCloneMap; + FuncInfo FuncCloneAssignedToCurCallsiteClone; + // We need to be able to remove Edge from CallerEdges, so need to adjust + // iterator in the loop. + for (auto EI = Clone->CallerEdges.begin(); + EI != Clone->CallerEdges.end();) { + auto *Edge = *EI; + // If this caller already assigned to call a version of OrigFunc, need + // to ensure we can assign this callsite clone to that function clone. + if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { + FuncInfo FuncCloneCalledByCaller = + CallsiteToCalleeFuncCloneMap[Edge->Caller]; + // First we need to confirm that this function clone is available + // for use by this callsite node clone. 
+ // + // While FuncCloneToCurNodeCloneMap is built only for this Node and + // its callsite clones, one of those callsite clones X could have + // been assigned to the same function clone called by Edge's caller + // - if Edge's caller calls another callsite within Node's original + // function, and that callsite has another caller reaching clone X. + // We need to clone Node again in this case. + if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && + FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != + Clone) || + // Detect when we have multiple callers of this callsite that + // have already been assigned to specific, and different, clones + // of OrigFunc (due to other unrelated callsites in Func they + // reach via call contexts). Is this Clone of callsite Node + // assigned to a different clone of OrigFunc? If so, clone Node + // again. + (FuncCloneAssignedToCurCallsiteClone && + FuncCloneAssignedToCurCallsiteClone != + FuncCloneCalledByCaller)) { + // We need to use a different newly created callsite clone, in + // order to assign it to another new function clone on a + // subsequent iteration over the Clones array (adjusted below). + // Note we specifically do not reset the + // CallsiteToCalleeFuncCloneMap entry for this caller, so that + // when this new clone is processed later we know which version of + // the function to copy (so that other callsite clones we have + // assigned to that function clone are properly cloned over). See + // comments in the function cloning handling earlier. + + // Check if we already have cloned this callsite again while + // walking through caller edges, for a caller calling the same + // function clone. If so, we can move this edge to that new clone + // rather than creating yet another new clone. + if (FuncCloneToNewCallsiteCloneMap.count( + FuncCloneCalledByCaller)) { + ContextNode *NewClone = + FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; + moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); + // Cleanup any none type edges cloned over. + removeNoneTypeCalleeEdges(NewClone); + } else { + // Create a new callsite clone. + ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); + removeNoneTypeCalleeEdges(NewClone); + FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = + NewClone; + // Add to list of clones and process later. + Clones.push_back(NewClone); + assert(EI == Clone->CallerEdges.end() || + Clone->AllocTypes != (uint8_t)AllocationType::None); + assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); + } + // Moving the caller edge may have resulted in some none type + // callee edges. + removeNoneTypeCalleeEdges(Clone); + // We will handle the newly created callsite clone in a subsequent + // iteration over this Node's Clones. Continue here since we + // already adjusted iterator EI while moving the edge. + continue; + } + + // Otherwise, we can use the function clone already assigned to this + // caller. + if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; + // Assign Clone to FuncCloneCalledByCaller + AssignCallsiteCloneToFuncClone( + FuncCloneCalledByCaller, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + } else + // Don't need to do anything - callsite is already calling this + // function clone. + assert(FuncCloneAssignedToCurCallsiteClone == + FuncCloneCalledByCaller); + + } else { + // We have not already assigned this caller to a version of + // OrigFunc. Do the assignment now. 
+ + // First check if we have already assigned this callsite clone to a + // clone of OrigFunc for another caller during this iteration over + // its caller edges. + if (!FuncCloneAssignedToCurCallsiteClone) { + // Find first function in FuncClonesToCallMap without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncClonesToCallMap size was smaller than the clone number. + for (auto &CF : FuncClonesToCallMap) { + if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { + FuncCloneAssignedToCurCallsiteClone = CF.first; + break; + } + } + assert(FuncCloneAssignedToCurCallsiteClone); + // Assign Clone to FuncCloneAssignedToCurCallsiteClone + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + AllocationCallToContextNodeMap.count(Call)); + } else + assert(FuncCloneToCurNodeCloneMap + [FuncCloneAssignedToCurCallsiteClone] == Clone); + // Update callers to record function version called. + RecordCalleeFuncOfCallsite(Edge->Caller, + FuncCloneAssignedToCurCallsiteClone); + } + + EI++; + } + } + if (VerifyCCG) { + checkNode(Node, /*CheckEdges=*/true); + for (auto *PE : Node->CalleeEdges) + checkNode(PE->Callee, + /*CheckEdges=*/true); + for (auto *CE : Node->CallerEdges) + checkNode(CE->Caller, + /*CheckEdges=*/true); + for (unsigned I = 0; I < Clones.size(); I++) { + ContextNode *Clone = Clones[I]; + checkNode(Clone, /*CheckEdges=*/true); + for (auto *PE : Clone->CalleeEdges) + checkNode(PE->Callee, + /*CheckEdges=*/true); + for (auto *CE : Clone->CallerEdges) + checkNode(CE->Caller, + /*CheckEdges=*/true); + } + } + } + } + + // Clean up edges removed during the assignment and additional cloning. + deleteRemovedEdges(); + + auto UpdateCalls = [&](ContextNode *Node, + DenseSet &Visited, + auto &&UpdateCalls) { + auto Inserted = Visited.insert(Node); + if (!Inserted.second) + return; + + for (auto *Clone : Node->Clones) + UpdateCalls(Clone, Visited, UpdateCalls); + + for (auto &Edge : Node->CallerEdges) + UpdateCalls(Edge->Caller, Visited, UpdateCalls); + + // Skip if either no call to update, or if we ended up with no context ids + // (we moved all edges onto other clones). + if (!Node->hasCall() || Node->ContextIds.empty()) + return; + + if (Node->IsAllocation) { + updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); + return; + } + + if (!CallsiteToCalleeFuncCloneMap.count(Node)) + return; + + auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; + updateCall(Node->Call, CalleeFunc); + }; + + DenseSet Visited; + for (auto &Entry : AllocationCallToContextNodeMap) + UpdateCalls(Entry.second, Visited, UpdateCalls); + + return Changed; +} + +bool PGHOContextDisambiguation::applyImport(Module &M) { + assert(ImportSummary); + bool Changed = false; + if (!ImportSummary->withPGHOContextDisambiguation()) { + // The profile matcher applies hotness attributes directly for allocations, + // and those will cause us to generate calls to the hot/cold interfaces + // unconditionally. If context disambiguation was not enabled in the thin + // link then assume we don't want these calls (e.g. not linking with + // the appropriate library, or otherwise trying to disable this behavior). + // For now, simply strip existing hotness attributes so they aren't applied, + // and exit early since no cloning decisions were made. 
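+    // For example (hypothetical IR), an allocation call that the profile
+    // matcher annotated as:
+    //   %call = call ptr @_Znam(i64 10) #0
+    //   attributes #0 = { "memprof"="cold" }
+    // simply has its "memprof" attribute removed by the loop below.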
+ for (auto &F : M) { + for (auto &BB : F) + for (auto &I : BB) { + auto *CI = dyn_cast(&I); + if (CI && CI->hasFnAttr("memprof")) { + CI->removeFnAttr("memprof"); + Changed = true; + } + } + } + return Changed; + } + + auto IsPGHOFunc = [](const Function &F) { + return F.getName().contains(".pgho."); + }; + + // We also need to clone any aliases that reference cloned functions, because + // the modified callsites may invoke via the alias. Keep track of the aliases + // for each function. + std::map> + FuncToAliasMap; + for (auto &A : M.aliases()) { + auto *Aliasee = A.getAliaseeObject(); + if (auto *F = dyn_cast(Aliasee)) + FuncToAliasMap[F].insert(&A); + } + + for (auto &F : M) { + if (F.isDeclaration() || IsPGHOFunc(F)) + continue; + + OptimizationRemarkEmitter ORE(&F); + + SmallVector, 4> VMaps; + bool ClonesCreated = false; + unsigned NumClonesCreated = 0; + auto CloneFuncIfNeeded = [&](unsigned NumClones) { + // We should at least have version 0 which is the original copy. + assert(NumClones > 0); + // If we already performed cloning of this function, confirm that the + // requested number of clones matches (the thin link should ensure the + // number of clones for each constituent callsite is consistent within + // each function), before returning. + if (ClonesCreated) { + assert(NumClonesCreated == NumClones); + return; + } + Changed = true; + ClonesCreated = true; + NumClonesCreated = NumClones; + // If only one copy needed use original. + if (NumClones == 1) + return; + VMaps.reserve(NumClones - 1); + FunctionsClonedThinBackend++; + for (unsigned I = 1; I < NumClones; I++) { + VMaps.emplace_back(new ValueToValueMapTy()); + auto *NewF = CloneFunction(&F, *VMaps.back()); + FunctionClonesThinBackend++; + // Strip memprof and callsite metadata from clone as they are no longer + // needed. + for (auto &BB : *NewF) { + for (auto &Inst : BB) { + Inst.setMetadata(LLVMContext::MD_memprof, nullptr); + Inst.setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + std::string Name = getPGHOFuncName(F.getName(), I); + auto *PrevF = M.getFunction(Name); + if (PrevF) { + // We might have created this when adjusting callsite in another + // function. It should be a declaration. + assert(PrevF->isDeclaration()); + NewF->takeName(PrevF); + PrevF->replaceAllUsesWith(NewF); + PrevF->eraseFromParent(); + } else + NewF->setName(Name); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) + << "created clone " << NV("NewFunction", NewF)); + + // Now handle aliases to this function, and clone those as well. + if (!FuncToAliasMap.count(&F)) + continue; + for (auto *A : FuncToAliasMap[&F]) { + std::string Name = getPGHOFuncName(A->getName(), I); + auto *PrevA = M.getNamedAlias(Name); + auto *NewA = GlobalAlias::create( + A->getValueType(), A->getType()->getPointerAddressSpace(), + A->getLinkage(), Name, NewF); + NewA->copyAttributesFrom(A); + if (PrevA) { + // We might have created this when adjusting callsite in another + // function. It should be a declaration. + assert(PrevA->isDeclaration()); + NewA->takeName(PrevA); + PrevA->replaceAllUsesWith(NewA); + PrevA->eraseFromParent(); + } + } + } + }; + + // Locate the summary for F. This is complicated by the fact that it might + // have been internalized or promoted. + // FIXME: Ideally we would retain the original GUID in some fashion on the + // function (e.g. as metadata), but for now do our best to locate the + // summary without that information. 
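+    // For example (hypothetical name): a local "foo" that was promoted during
+    // the thin link may now be named something like "foo.llvm.123" in the IR,
+    // so the fallback lookups below retry with the original pre-promotion
+    // name when the direct GUID lookup fails.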
+ ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); + if (!TheFnVI) + // See if theFn was internalized, by checking index directly with + // original name (this avoids the name adjustment done by getGUID() for + // internal symbols). + TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); + if (!TheFnVI) { + // Now query with the original name before any promotion was performed. + StringRef OrigName = + ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); + std::string OrigId = GlobalValue::getGlobalIdentifier( + OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); + TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); + // Could be a promoted local imported from another module. We need to pass + // down more info here to find the original module id. For now, try with + // the OrigName which might have been stored in the OidGuidMap in the + // index. This would not work if there were same-named locals in multiple + // modules, however. + if (!TheFnVI) { + auto OrigGUID = ImportSummary->getGUIDFromOriginalID( + GlobalValue::getGUID(OrigName)); + if (OrigGUID) + TheFnVI = ImportSummary->getValueInfo(OrigGUID); + } + } + // If still not found, this could be an imported local (see comment above). + // Skip for now as it will be cloned in its original module (where it would + // have been promoted to global scope so should satisfy any reference in + // this module). + if (!TheFnVI) + continue; + + auto *GVSummary = + ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); + if (!GVSummary) + // Must have been imported, use the first summary (might be multiple if + // this was a linkonce_odr). + GVSummary = TheFnVI.getSummaryList().front().get(); + + // If this was an imported alias skip it as we won't have the function + // summary, and it should be cloned in the original module. + if (isa(GVSummary)) + continue; + + auto *FS = cast(GVSummary->getBaseObject()); + + if (FS->allocs().empty() && FS->callsites().empty()) + continue; + + auto SI = FS->callsites().begin(); + auto AI = FS->allocs().begin(); + + // Assume for now that the instructions are in the exact same order + // as when the summary was created, but confirm this is correct by + // matching the stack ids. + for (auto &BB : F) { + for (auto &I : BB) { + auto *CB = dyn_cast(&I); + // Same handling as when creating module summary. + if (!mayHaveMemprofSummary(CB)) + continue; + + CallStack CallsiteContext( + I.getMetadata(LLVMContext::MD_callsite)); + auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); + + // Include allocs that were already assigned a memprof function + // attribute in the statistics. + if (CB->getAttributes().hasFnAttr("memprof")) { + assert(!MemProfMD); + CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" + ? AllocTypeColdThinBackend++ + : AllocTypeNotColdThinBackend++; + OrigAllocsThinBackend++; + AllocVersionsThinBackend++; + if (!MaxAllocVersionsThinBackend) + MaxAllocVersionsThinBackend = 1; + } + + if (MemProfMD) { + // Consult the next alloc node. + assert(AI != FS->allocs().end()); + auto &AllocNode = *(AI++); + + // Sanity check that the MIB stack ids match between the summary and + // instruction metadata. 
+ auto MIBIter = AllocNode.MIBs.begin(); + for (auto &MDOp : MemProfMD->operands()) { + assert(MIBIter != AllocNode.MIBs.end()); + auto &MIB = *(MIBIter++); + auto StackIdIndexIter = MIB.StackIdIndices.begin(); + auto *MIBMD = cast(MDOp); + MDNode *StackMDNode = getMIBStackNode(MIBMD); + assert(StackMDNode); + SmallVector StackIdsFromMetadata; + CallStack StackContext(StackMDNode); + for (auto ContextIter = + StackContext.beginAfterSharedPrefix(CallsiteContext); + ContextIter != StackContext.end(); ++ContextIter) { + // If this is a direct recursion, simply skip the duplicate + // entries, to be consistent with how the summary ids were + // generated during ModuleSummaryAnalysis. + if (!StackIdsFromMetadata.empty() && + StackIdsFromMetadata.back() == *ContextIter) + continue; + assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); + assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == + *ContextIter); + StackIdIndexIter++; + } + } + + // Perform cloning if not yet done. + CloneFuncIfNeeded(AllocNode.Versions.size()); + + OrigAllocsThinBackend++; + AllocVersionsThinBackend += AllocNode.Versions.size(); + if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) + MaxAllocVersionsThinBackend = AllocNode.Versions.size(); + + // If there is only one version that means we didn't end up + // considering this function for cloning, and in that case the alloc + // will still be none type or should have gotten the default NotCold. + // Skip that after calling clone helper since that does some sanity + // checks that confirm we haven't decided yet that we need cloning. + if (AllocNode.Versions.size() == 1) { + assert((AllocationType)AllocNode.Versions[0] == + AllocationType::NotCold || + (AllocationType)AllocNode.Versions[0] == + AllocationType::None); + UnclonableAllocsThinBackend++; + continue; + } + + // All versions should have a singular allocation type. + assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { + return Type == ((uint8_t)AllocationType::NotCold | + (uint8_t)AllocationType::Cold); + })); + + // Update the allocation types per the summary info. + for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { + // Ignore any that didn't get an assigned allocation type. + if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) + continue; + AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; + AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ + : AllocTypeNotColdThinBackend++; + std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); + auto A = llvm::Attribute::get(F.getContext(), "memprof", + AllocTypeString); + CallBase *CBClone; + // Copy 0 is the original function. + if (!J) + CBClone = CB; + else + CBClone = cast((*VMaps[J - 1])[CB]); + CBClone->addFnAttr(A); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) + << NV("AllocationCall", CBClone) << " in clone " + << NV("Caller", CBClone->getFunction()) + << " marked with memprof allocation attribute " + << NV("Attribute", AllocTypeString)); + } + } else if (!CallsiteContext.empty()) { + // Consult the next callsite node. + assert(SI != FS->callsites().end()); + auto &StackNode = *(SI++); + + // Sanity check that the stack ids match between the summary and + // instruction metadata. 
+ auto StackIdIndexIter = StackNode.StackIdIndices.begin(); + for (auto StackId : CallsiteContext) { + assert(StackIdIndexIter != StackNode.StackIdIndices.end()); + assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == + StackId); + StackIdIndexIter++; + } + + // Perform cloning if not yet done. + CloneFuncIfNeeded(StackNode.Clones.size()); + + // Should have skipped indirect calls via mayHaveMemprofSummary. + assert(CB->getCalledFunction()); + assert(!IsPGHOFunc(*CB->getCalledFunction())); + + // Update the calls per the summary info. + // Save orig name since it gets updated in the first iteration + // below. + auto CalleeOrigName = CB->getCalledFunction()->getName(); + for (unsigned J = 0; J < StackNode.Clones.size(); J++) { + // Do nothing if this version calls the original version of its + // callee. + if (!StackNode.Clones[J]) + continue; + auto NewF = M.getOrInsertFunction( + getPGHOFuncName(CalleeOrigName, StackNode.Clones[J]), + CB->getCalledFunction()->getFunctionType()); + CallBase *CBClone; + // Copy 0 is the original function. + if (!J) + CBClone = CB; + else + CBClone = cast((*VMaps[J - 1])[CB]); + CBClone->setCalledFunction(NewF); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) + << NV("Call", CBClone) << " in clone " + << NV("Caller", CBClone->getFunction()) + << " assigned to call function clone " + << NV("Callee", NewF.getCallee())); + } + } + // Memprof and callsite metadata on memory allocations no longer needed. + I.setMetadata(LLVMContext::MD_memprof, nullptr); + I.setMetadata(LLVMContext::MD_callsite, nullptr); + } + } + } + + return Changed; +} + template bool CallsiteContextGraph::process() { if (DumpCCG) { @@ -2043,22 +2991,62 @@ if (ExportToDot) exportToDot("ccg.cloned.dot"); - return false; + bool Changed = assignFunctions(); + + if (DumpCCG) { + dbgs() << "CCG after assigning function clones:\n"; + dbgs() << *this; + } + if (ExportToDot) + exportToDot("ccg.clonefuncassign.dot"); + + return Changed; } bool PGHOContextDisambiguation::processModule( - Module &M) { + Module &M, + function_ref OREGetter) { bool Changed = false; - ModuleCallsiteContextGraph CCG(M); + // If we have an import summary, then the cloning decisions were made during + // the thin link on the index. Apply them and return. + if (ImportSummary) { + Changed = applyImport(M); + return Changed; + } + + ModuleCallsiteContextGraph CCG(M, OREGetter); Changed = CCG.process(); return Changed; } +PGHOContextDisambiguation::PGHOContextDisambiguation( + const ModuleSummaryIndex *Summary) + : ImportSummary(Summary) { + // The PGHOImportSummary should only be used for testing ThinLTO distributed + // backend handling via opt, in which case we don't have a summary from the + // pass pipeline. 
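+  // For example (mirroring the RUN lines in the tests added by this patch),
+  // the distributed ThinLTO backend can be simulated via:
+  //   opt -passes=pgho-context-disambiguation \
+  //       -pgho-import-summary=foo.o.thinlto.bc foo.o -S
+  // where foo.o.thinlto.bc is produced by a thin link run with
+  // -thinlto-distributed-indexes.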
+  assert(!ImportSummary || PGHOImportSummary.empty());
+  if (!ImportSummary && !PGHOImportSummary.empty()) {
+    ExitOnError ExitOnErr("-pgho-import-summary: " + PGHOImportSummary + ": ");
+    auto ReadSummaryFile =
+        ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(PGHOImportSummary)));
+    if (Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr =
+            getModuleSummaryIndex(*ReadSummaryFile)) {
+      ImportSummaryForTesting = std::move(*SummaryOrErr);
+      ImportSummary = ImportSummaryForTesting.get();
+    }
+  }
+}
+
 PreservedAnalyses PGHOContextDisambiguation::run(Module &M,
                                                  ModuleAnalysisManager &AM) {
-  if (!processModule(M))
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  };
+  if (!processModule(M, OREGetter))
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
@@ -2069,4 +3057,5 @@
                        isPrevailing) {
   IndexCallsiteContextGraph CCG(Index, isPrevailing);
   CCG.process();
+  Index.setWithPGHOContextDisambiguation();
 }
diff --git a/llvm/test/ThinLTO/X86/pgho-basic.ll b/llvm/test/ThinLTO/X86/pgho-basic.ll
--- a/llvm/test/ThinLTO/X86/pgho-basic.ll
+++ b/llvm/test/ThinLTO/X86/pgho-basic.ll
@@ -1,5 +1,5 @@
 ;; Test callsite context graph generation for simple call graph with
-;; two memprof contexts and no inlining.
+;; two memprof contexts and no inlining, as well as graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -37,12 +37,44 @@
 ; RUN: -r=%t.o,_Znam, \
 ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \
 ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
 
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
 ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \
+; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=pgho-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should have cloned bar, baz, and foo, for the cold memory allocation.
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Check distributed index +; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + ; ModuleID = 'pgho-basic.ll' source_filename = "pgho-basic.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -150,6 +182,7 @@ !15 = !{i64 -5964873800580613432} !16 = !{i64 2732490490862098848} + ; DUMP: CCG before cloning: ; DUMP: Callsite Context Graph: ; DUMP: Node [[BAR:0x[a-z0-9]+]] @@ -275,6 +308,52 @@ ; DUMP: CallerEdges: +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: created clone _Z3barv.pgho.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3bazv.pgho.1 +; REMARKS: call in clone _Z3bazv.pgho.1 assigned to call function clone _Z3barv.pgho.1 +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3bazv.pgho.1 + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: %call = call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: %call = call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: %call = call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.pgho.1() +; IR: %call = call {{.*}} @_Z3barv.pgho.1() +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call = call {{.*}} @_Z3bazv.pgho.1() +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 3 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3barv -\> alloc",tooltip="N[[BAR]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv",tooltip="N[[BAZ]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold @@ -306,3 +385,9 @@ ; DOTCLONED: N[[FOO]] -> N[[BAZ]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default ; DOTCLONED: N[[MAIN1]] -> N[[FOO]][tooltip=" ContextIds: 1",fillcolor="brown1"]; // default ; DOTCLONED: } + + +; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6988045695824228603, {{.*}} callsites: ((callee: ^[[BAZ:[0-9]+]], clones: (0, 1) +; DISTRIB: ^[[BAR:[0-9]+]] = gv: (guid: 10756268697391741933, {{.*}} allocs: ((versions: (notcold, cold) +; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1) +; DISTRIB: ^[[BAZ]] = gv: (guid: 17547784407117670007, {{.*}} callsites: ((callee: ^[[BAR]], clones: (0, 1) diff --git a/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll --- a/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll +++ b/llvm/test/ThinLTO/X86/pgho-duplicate-context-ids.ll @@ -1,7 +1,8 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; that have pruned contexts that partially match multiple inlined ;; callsite contexts, requiring duplication of context ids and 
nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
;;
;; Original code looks like:
;;
@@ -58,13 +59,46 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \
; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
;; We should clone D once for the cold allocations via C.
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \
+; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=pgho-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+;; We should clone D once for the cold allocations via C.
+; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
+;; Run ThinLTO backend
+; RUN: opt -passes=pgho-context-disambiguation \
+; RUN: -pgho-import-summary=%t.o.thinlto.bc \
+; RUN: -stats -pass-remarks=pgho-context-disambiguation \
+; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS
+
; ModuleID = 'duplicate-context-ids.ll'
source_filename = "duplicate-context-ids.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -347,6 +381,49 @@
; DUMP: CallerEdges:
+; REMARKS: created clone _Z1Dv.pgho.1
+; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1Dv.pgho.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.pgho.1
+; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.pgho.1
+
+
+;; The call via F does not allocate cold memory. It should call the
+;; original D, which ultimately calls the original allocation decorated
+;; with a "notcold" attribute.
+; IR: define internal {{.*}} @_Z1Dv()
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z1Fv()
+; IR: %call = call {{.*}} @_Z1Dv()
+;; The calls via B and E allocate cold memory. They should call the
+;; cloned D, which ultimately calls the cloned allocation decorated with
+;; a "cold" attribute.
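+;; That is, F calls D directly on the not-cold path, while B and E reach D
+;; through their inlined copies of C on the cold paths, which is why one
+;; cloned D serves both of them.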
+; IR: define internal {{.*}} @_Z1Bv() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define internal {{.*}} @_Z1Ev() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define dso_local {{.*}} @main +; IR: %call = call {{.*}} @_Z1Bv() +; IR: %call1 = call {{.*}} @_Z1Ev() +; IR: %call2 = call {{.*}} @_Z1Fv() +; IR: define internal {{.*}} @_Z1Dv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 1 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOTPRE: digraph CallsiteContextGraph { ; DOTPRE: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> alloc",tooltip="N[[D]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOTPRE: N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\nnull call (external)",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default @@ -384,3 +461,9 @@ ; DOTCLONED: N[[B]] -> N[[D2]][tooltip=" ContextIds: 4",fillcolor="cyan"]; // cold ; DOTCLONED: N[[F]] -> N[[D]][tooltip=" ContextIds: 2",fillcolor="brown1"]; // default ; DOTCLONED: } + +; DISTRIB: ^[[D:[0-9]+]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold) +; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1) +; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[B]], clones: (0), {{.*}} (callee: ^[[E:[0-9]+]], clones: (0), {{.*}} (callee: ^[[F:[0-9]+]], clones: (0) +; DISTRIB: ^[[F]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0) +; DISTRIB: ^[[E]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1) diff --git a/llvm/test/ThinLTO/X86/pgho-funcassigncloning.ll b/llvm/test/ThinLTO/X86/pgho-funcassigncloning.ll new file mode 100644 --- /dev/null +++ b/llvm/test/ThinLTO/X86/pgho-funcassigncloning.ll @@ -0,0 +1,418 @@ +;; Test context disambiguation for a callgraph containing multiple memprof +;; contexts and no inlining, where we need to perform additional cloning +;; during function assignment/cloning to handle the combination of contexts +;; to 2 different allocations. 
+;; +;; void E(char **buf1, char **buf2) { +;; *buf1 = new char[10]; +;; *buf2 = new char[10]; +;; } +;; +;; void B(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void C(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void D(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; int main(int argc, char **argv) { +;; char *cold1, *cold2, *default1, *default2, *default3, *default4; +;; B(&default1, &default2); +;; C(&default3, &cold1); +;; D(&cold2, &default4); +;; memset(cold1, 0, 10); +;; memset(cold2, 0, 10); +;; memset(default1, 0, 10); +;; memset(default2, 0, 10); +;; memset(default3, 0, 10); +;; memset(default4, 0, 10); +;; delete[] default1; +;; delete[] default2; +;; delete[] default3; +;; delete[] default4; +;; sleep(10); +;; delete[] cold1; +;; delete[] cold2; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. + + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + +; ModuleID = 'funcassigncloning.ll' +source_filename = "funcassigncloning.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1EPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !7, !callsite !14 + %0 = load ptr, ptr %buf1.addr, align 8 + store ptr %call, ptr %0, align 8 + %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !15, !callsite !22 + %1 = load ptr, ptr %buf2.addr, align 8 + store ptr %call1, ptr %1, align 8 + ret void +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1BPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 
8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !23 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1CPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !24 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1DPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !25 + ret void +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %cold1 = alloca ptr, align 8 + %cold2 = alloca ptr, align 8 + %default1 = alloca ptr, align 8 + %default2 = alloca ptr, align 8 + %default3 = alloca ptr, align 8 + %default4 = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + call void @_Z1BPPcS0_(ptr noundef %default1, ptr noundef %default2), !callsite !26 + call void @_Z1CPPcS0_(ptr noundef %default3, ptr noundef %cold1), !callsite !27 + call void @_Z1DPPcS0_(ptr noundef %cold2, ptr noundef %default4), !callsite !28 + %0 = load ptr, ptr %cold1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %cold2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %default1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %default2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %default3, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %default4, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %default1, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #7 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %default2, align 8 + %isnull1 = icmp eq ptr %7, null + br i1 %isnull1, label %delete.end3, label %delete.notnull2 + +delete.notnull2: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #7 + br label %delete.end3 + +delete.end3: ; preds = %delete.notnull2, %delete.end + %8 = load ptr, ptr %default3, align 8 + %isnull4 = icmp eq ptr %8, null + br i1 %isnull4, label %delete.end6, label %delete.notnull5 + +delete.notnull5: ; preds = %delete.end3 + call void @_ZdaPv(ptr noundef %8) #7 + br label %delete.end6 + +delete.end6: ; preds = %delete.notnull5, %delete.end3 + %9 = load ptr, ptr %default4, align 8 + %isnull7 = icmp eq ptr %9, null + br i1 
%isnull7, label %delete.end9, label %delete.notnull8 + +delete.notnull8: ; preds = %delete.end6 + call void @_ZdaPv(ptr noundef %9) #7 + br label %delete.end9 + +delete.end9: ; preds = %delete.notnull8, %delete.end6 + %call = call i32 @sleep(i32 noundef 10) + %10 = load ptr, ptr %cold1, align 8 + %isnull10 = icmp eq ptr %10, null + br i1 %isnull10, label %delete.end12, label %delete.notnull11 + +delete.notnull11: ; preds = %delete.end9 + call void @_ZdaPv(ptr noundef %10) #7 + br label %delete.end12 + +delete.end12: ; preds = %delete.notnull11, %delete.end9 + %11 = load ptr, ptr %cold2, align 8 + %isnull13 = icmp eq ptr %11, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end12 + call void @_ZdaPv(ptr noundef %11) #7 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end12 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10, !12} +!8 = !{!9, !"cold"} +!9 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!10 = !{!11, !"notcold"} +!11 = !{i64 -3461278137325233666, i64 -3483158674395044949} +!12 = !{!13, !"notcold"} +!13 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!14 = !{i64 -3461278137325233666} +!15 = !{!16, !18, !20} +!16 = !{!17, !"notcold"} +!17 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!18 = !{!19, !"cold"} +!19 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!20 = !{!21, !"notcold"} +!21 = !{i64 -1415475215210681400, i64 
-7799663586031895603} +!22 = !{i64 -1415475215210681400} +!23 = !{i64 -2441057035866683071} +!24 = !{i64 -3483158674395044949} +!25 = !{i64 -7799663586031895603} +!26 = !{i64 4256801922104815624} +!27 = !{i64 6438520854747849124} +!28 = !{i64 -8402480891374135967} + + +;; Originally we create a single clone of each call to new from E, since each +;; allocates cold memory for a single caller. + +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: AllocType 1 StackIds: 2 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] + +; DUMP: Node [[C]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW1CLONE]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 2 StackIds: 0 +; DUMP: AllocType 1 StackIds: 1 +; DUMP: AllocType 1 StackIds: 2 +; DUMP: (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Clone of [[ENEW1ORIG]] + +; DUMP: Node [[D]] +; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 6 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW2ORIG]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Clones: [[ENEW2CLONE]] + +; DUMP: Node [[ENEW2CLONE]] +; DUMP: Versions: 1 MIB: +; DUMP: AllocType 1 StackIds: 2 +; DUMP: AllocType 2 StackIds: 1 +; DUMP: AllocType 1 StackIds: 0 +; DUMP: (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: Clone of [[ENEW2ORIG]] + + +;; We greedily create a clone of E that is initially used by the clones of the +;; first call to 
new. However, we end up with an incompatible set of callers +;; given the second call to new which has clones with a different combination of +;; callers. Eventually, we create 2 more clones, and the first clone becomes dead. +; REMARKS: created clone _Z1EPPcS0_.pgho.1 +; REMARKS: created clone _Z1EPPcS0_.pgho.2 +; REMARKS: created clone _Z1EPPcS0_.pgho.3 +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.3 +; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.2 + + +;; Original version of E is used for the non-cold allocations, both from B. +; IR: define internal {{.*}} @_Z1EPPcS0_( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1BPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_( +;; C calls a clone of E with the first new allocating cold memory and the +;; second allocating non-cold memory. +; IR: define internal {{.*}} @_Z1CPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.pgho.3( +;; D calls a clone of E with the first new allocating non-cold memory and the +;; second allocating cold memory. +; IR: define internal {{.*}} @_Z1DPPcS0_( +; IR: call {{.*}} @_Z1EPPcS0_.pgho.2( +; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.2( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.3( +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 2 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 2 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 4 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 4 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 8 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 3 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 4 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/pgho-indirectcall.ll b/llvm/test/ThinLTO/X86/pgho-indirectcall.ll 
--- a/llvm/test/ThinLTO/X86/pgho-indirectcall.ll +++ b/llvm/test/ThinLTO/X86/pgho-indirectcall.ll @@ -1,7 +1,7 @@ ;; Tests callsite context graph generation for call graph containing indirect ;; calls. Currently this should result in conservative behavior, such that the ;; indirect call receives a null call in its graph node, to prevent subsequent -;; cloning. +;; cloning. Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -59,13 +59,45 @@ ; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should only create a single clone of foo, for the direct call ;; from main allocating cold memory. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \ +; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should only create a single clone of foo, for the direct call +;; from main allocating cold memory. +; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS + ; ModuleID = 'indirectcall.ll' source_filename = "indirectcall.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -539,6 +571,47 @@ ; DUMP: CallerEdges: +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3foov.pgho.1 marked with memprof allocation attribute cold + + +; IR: define internal {{.*}} @_Z3barP1A( +; IR: %call = call {{.*}} %1( +; IR: define {{.*}} @main( +; IR: %call = call {{.*}} @_Z3foov() +;; Only the second call to foo, which allocates cold memory via direct calls, +;; is replaced with a call to a clone that calls a cold allocation. 
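+;; The remaining contexts reach foo through bar's virtual call, where the
+;; callee is statically unknown, so those contexts stay on the original foo
+;; (a hypothetical C++ sketch of that shape; names follow this test's
+;; mangled symbols):
+;;
+;;   char *foo();                        // the allocating function above
+;;   struct A { virtual char *x() { return foo(); } };
+;;   struct B : public A { char *x() override { return foo(); } };
+;;   char *bar(A *a) { return a->x(); }  // callee statically unknown: the
+;;                                       // callsite gets a null call in the
+;;                                       // graph and is never cloned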
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: %call2 = call {{.*}} @_Z3barP1A( +; IR: %call3 = call {{.*}} @_Z3barP1A( +; IR: %call4 = call {{.*}} @_Z3barP1A( +; IR: %call5 = call {{.*}} @_Z3barP1A( +; IR: define internal {{.*}} @_ZN1A1xEv( +; IR: %call = call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_ZN1B1xEv( +; IR: %call = call {{.*}} @_Z3foov() +; IR: define internal {{.*}} @_Z3foov() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 1 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3foov -\> alloc",tooltip="N[[FOO]] ContextIds: 2 4 6 1 3 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold diff --git a/llvm/test/ThinLTO/X86/pgho-inlined.ll b/llvm/test/ThinLTO/X86/pgho-inlined.ll --- a/llvm/test/ThinLTO/X86/pgho-inlined.ll +++ b/llvm/test/ThinLTO/X86/pgho-inlined.ll @@ -1,6 +1,7 @@ ;; Test callsite context graph generation for call graph with two memprof ;; contexts and partial inlining, requiring generation of a new fused node to ;; represent the inlined sequence while matching callsite nodes onto the graph. +;; Also tests graph and IR cloning. ;; ;; Original code looks like: ;; @@ -46,13 +47,45 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \ -; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS --check-prefix=STATS-BE \ +; RUN: --check-prefix=STATS-INPROCESS-BE --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should create clones for foo and bar for the call from main to allocate ;; cold memory. 
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-pgho-context-disambiguation \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_ZdaPv, \ +; RUN: -r=%t.o,sleep, \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t2. \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ +; RUN: --check-prefix=STATS + +; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT +;; We should create clones for foo and bar for the call from main to allocate +;; cold memory. +; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED + +;; Run ThinLTO backend +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-import-summary=%t.o.thinlto.bc \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS-BE --check-prefix=STATS-DISTRIB-BE \ +; RUN: --check-prefix=REMARKS + ; ModuleID = 'inlined.ll' source_filename = "inlined.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -311,6 +344,52 @@ ; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2 +; REMARKS: created clone _Z3barv.pgho.1 +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3barv.pgho.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 + + +; IR: define internal {{.*}} @_Z3barv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov() +; IR: %call.i = call {{.*}} @_Z3barv() +; IR: define dso_local {{.*}} @main(i32 noundef %argc, ptr noundef %argv) +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: %call = call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: define internal {{.*}} @_Z3barv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call.i = call {{.*}} @_Z3barv.pgho.1() +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS: 2 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS-BE: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-INPROCESS-BE: 2 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +;; The distributed backend hasn't yet eliminated the now-dead baz with +;; the allocation from bar inlined, so it has one more allocation. +; STATS-DISTRIB-BE: 3 pgho-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend +; STATS: 2 pgho-context-disambiguation - Number of function clones created during whole program analysis +; STATS-BE: 2 pgho-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Number of functions that had clones created during ThinLTO backend +; STATS-BE: 2 pgho-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend +; STATS-INPROCESS-BE: 1 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend +;; The distributed backend hasn't yet eliminated the now-dead baz with +;; the allocation from bar inlined, so it has one more allocation. +; STATS-DISTRIB-BE: 2 pgho-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3bazv -\> alloc",tooltip="N[[BAR]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\nnull call (external)",tooltip="N[[FOO]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll b/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll --- a/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/PGHOContextDisambiguation/basic.ll @@ -1,5 +1,5 @@ ;; Test callsite context graph generation for simple call graph with -;; two memprof contexts and no inlining. +;; two memprof contexts and no inlining, as well as graph and IR cloning. ;; ;; Original code looks like: ;; @@ -32,7 +32,9 @@ ; RUN: opt -passes=pgho-context-disambiguation \ ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. 
\ -; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. @@ -261,6 +263,48 @@ ; DUMP: CallerEdges: +; REMARKS: created clone _Z3barv.pgho.1 +; REMARKS: created clone _Z3bazv.pgho.1 +; REMARKS: created clone _Z3foov.pgho.1 +; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1 +; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3bazv.pgho.1 +; REMARKS: call in clone _Z3bazv.pgho.1 assigned to call function clone _Z3barv.pgho.1 +; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold +; REMARKS: call in clone main assigned to call function clone _Z3foov +; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv +; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv +; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold + + +; IR: define {{.*}} @main +;; The first call to foo does not allocate cold memory. It should call the +;; original functions, which ultimately call the original allocation decorated +;; with a "notcold" attribute. +; IR: %call = call {{.*}} @_Z3foov() +;; The second call to foo allocates cold memory. It should call cloned functions +;; which ultimately call a cloned allocation decorated with a "cold" attribute. +; IR: %call1 = call {{.*}} @_Z3foov.pgho.1() +; IR: define internal {{.*}} @_Z3barv() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv() +; IR: %call = call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: %call = call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define internal {{.*}} @_Z3bazv.pgho.1() +; IR: %call = call {{.*}} @_Z3barv.pgho.1() +; IR: define internal {{.*}} @_Z3foov.pgho.1() +; IR: %call = call {{.*}} @_Z3bazv.pgho.1() +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis + + ; DOT: digraph CallsiteContextGraph { ; DOT: N[[BAR:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3barv -\> _Znam",tooltip="N[[BAR]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv",tooltip="N[[BAZ]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll --- a/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll +++ b/llvm/test/Transforms/PGHOContextDisambiguation/duplicate-context-ids.ll @@ -1,7 +1,8 @@ ;; Test callsite context graph generation for call graph with with MIBs ;; 
that have pruned contexts that partially match multiple inlined
;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
;;
;; Original code looks like:
;;
@@ -53,7 +54,9 @@
; RUN: opt -passes=pgho-context-disambiguation \
; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \
; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=pgho-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -330,6 +333,44 @@
; DUMP: CallerEdges:
+; REMARKS: created clone _Z1Dv.pgho.1
+; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.pgho.1
+; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.pgho.1
+; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.pgho.1
+; REMARKS: call in clone _Z1Dv.pgho.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv
+; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+
+;; The call via F does not allocate cold memory. It should call the
+;; original D, which ultimately calls the original allocation decorated
+;; with a "notcold" attribute.
+; IR: define internal {{.*}} @_Z1Dv()
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z1Fv()
+; IR: %call = call {{.*}} @_Z1Dv()
+;; The calls via B and E allocate cold memory. They should call the
+;; cloned D, which ultimately calls the cloned allocation decorated with
+;; a "cold" attribute.
+; IR: define internal {{.*}} @_Z1Bv() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define internal {{.*}} @_Z1Ev() +; IR: %call.i = call {{.*}} @_Z1Dv.pgho.1() +; IR: define dso_local {{.*}} @main +; IR: %call = call {{.*}} @_Z1Bv() +; IR: %call1 = call {{.*}} @_Z1Ev() +; IR: %call2 = call {{.*}} @_Z1Fv() +; IR: define internal {{.*}} @_Z1Dv.pgho.1() +; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" } +; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" } + + +; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis + + ; DOTPRE: digraph CallsiteContextGraph { ; DOTPRE: N[[D:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z1Dv -\> _Znam",tooltip="N[[D]] ContextIds: 2 1",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold ; DOTPRE: N[[F:0x[a-z0-9]+]] [shape="record",label="OrigId: 13543580133643026784\nnull call (external)",tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled"]; // callsite, default diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/funcassigncloning.ll b/llvm/test/Transforms/PGHOContextDisambiguation/funcassigncloning.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PGHOContextDisambiguation/funcassigncloning.ll @@ -0,0 +1,374 @@ +;; Test context disambiguation for a callgraph containing multiple memprof +;; contexts and no inlining, where we need to perform additional cloning +;; during function assignment/cloning to handle the combination of contexts +;; to 2 different allocations. +;; +;; void E(char **buf1, char **buf2) { +;; *buf1 = new char[10]; +;; *buf2 = new char[10]; +;; } +;; +;; void B(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void C(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; +;; void D(char **buf1, char **buf2) { +;; E(buf1, buf2); +;; } +;; int main(int argc, char **argv) { +;; char *cold1, *cold2, *default1, *default2, *default3, *default4; +;; B(&default1, &default2); +;; C(&default3, &cold1); +;; D(&cold2, &default4); +;; memset(cold1, 0, 10); +;; memset(cold2, 0, 10); +;; memset(default1, 0, 10); +;; memset(default2, 0, 10); +;; memset(default3, 0, 10); +;; memset(default4, 0, 10); +;; delete[] default1; +;; delete[] default2; +;; delete[] default3; +;; delete[] default4; +;; sleep(10); +;; delete[] cold1; +;; delete[] cold2; +;; return 0; +;; } +;; +;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the +;; memory freed after sleep(10) results in cold lifetimes. 
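+;;
+;; Decoding the !memprof/!callsite metadata below, the hotness of each of
+;; E's two allocations as seen from each caller is:
+;;
+;;   caller   new1 (*buf1)   new2 (*buf2)
+;;   B        notcold        notcold
+;;   C        notcold        cold
+;;   D        cold           notcold
+;;
+;; No single attribute pair on one copy of E satisfies all three callers,
+;; which is what forces the extra clones of E checked below.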
+ +; RUN: opt -passes=pgho-context-disambiguation \ +; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \ +; RUN: -stats -pass-remarks=pgho-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + +; ModuleID = 'funcassigncloning.ll' +source_filename = "funcassigncloning.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1EPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !7, !callsite !14 + %0 = load ptr, ptr %buf1.addr, align 8 + store ptr %call, ptr %0, align 8 + %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !15, !callsite !22 + %1 = load ptr, ptr %buf2.addr, align 8 + store ptr %call1, ptr %1, align 8 + ret void +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1BPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !23 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1CPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !24 + ret void +} + +; Function Attrs: mustprogress noinline optnone uwtable +define internal void @_Z1DPPcS0_(ptr noundef %buf1, ptr noundef %buf2) #0 { +entry: + %buf1.addr = alloca ptr, align 8 + %buf2.addr = alloca ptr, align 8 + store ptr %buf1, ptr %buf1.addr, align 8 + store ptr %buf2, ptr %buf2.addr, align 8 + %0 = load ptr, ptr %buf1.addr, align 8 + %1 = load ptr, ptr %buf2.addr, align 8 + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !25 + ret void +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %cold1 = alloca ptr, align 8 + %cold2 = alloca ptr, align 8 + %default1 = alloca ptr, align 8 + %default2 = alloca ptr, align 8 + %default3 = alloca ptr, align 8 + %default4 = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + call void @_Z1BPPcS0_(ptr noundef %default1, ptr noundef %default2), !callsite !26 + call void @_Z1CPPcS0_(ptr noundef %default3, ptr noundef %cold1), !callsite !27 + call void @_Z1DPPcS0_(ptr noundef %cold2, ptr noundef %default4), !callsite !28 + %0 = load ptr, ptr %cold1, align 8 + call void 
@llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false) + %1 = load ptr, ptr %cold2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false) + %2 = load ptr, ptr %default1, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false) + %3 = load ptr, ptr %default2, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false) + %4 = load ptr, ptr %default3, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false) + %5 = load ptr, ptr %default4, align 8 + call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false) + %6 = load ptr, ptr %default1, align 8 + %isnull = icmp eq ptr %6, null + br i1 %isnull, label %delete.end, label %delete.notnull + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %6) #7 + br label %delete.end + +delete.end: ; preds = %delete.notnull, %entry + %7 = load ptr, ptr %default2, align 8 + %isnull1 = icmp eq ptr %7, null + br i1 %isnull1, label %delete.end3, label %delete.notnull2 + +delete.notnull2: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %7) #7 + br label %delete.end3 + +delete.end3: ; preds = %delete.notnull2, %delete.end + %8 = load ptr, ptr %default3, align 8 + %isnull4 = icmp eq ptr %8, null + br i1 %isnull4, label %delete.end6, label %delete.notnull5 + +delete.notnull5: ; preds = %delete.end3 + call void @_ZdaPv(ptr noundef %8) #7 + br label %delete.end6 + +delete.end6: ; preds = %delete.notnull5, %delete.end3 + %9 = load ptr, ptr %default4, align 8 + %isnull7 = icmp eq ptr %9, null + br i1 %isnull7, label %delete.end9, label %delete.notnull8 + +delete.notnull8: ; preds = %delete.end6 + call void @_ZdaPv(ptr noundef %9) #7 + br label %delete.end9 + +delete.end9: ; preds = %delete.notnull8, %delete.end6 + %call = call i32 @sleep(i32 noundef 10) + %10 = load ptr, ptr %cold1, align 8 + %isnull10 = icmp eq ptr %10, null + br i1 %isnull10, label %delete.end12, label %delete.notnull11 + +delete.notnull11: ; preds = %delete.end9 + call void @_ZdaPv(ptr noundef %10) #7 + br label %delete.end12 + +delete.end12: ; preds = %delete.notnull11, %delete.end9 + %11 = load ptr, ptr %cold2, align 8 + %isnull13 = icmp eq ptr %11, null + br i1 %isnull13, label %delete.end15, label %delete.notnull14 + +delete.notnull14: ; preds = %delete.end12 + call void @_ZdaPv(ptr noundef %11) #7 + br label %delete.end15 + +delete.end15: ; preds = %delete.notnull14, %delete.end12 + ret i32 0 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #4 + +declare i32 @sleep(i32 noundef) #5 + +attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { builtin allocsize(0) } +attributes #7 = { builtin nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{i32 7, !"Dwarf Version", i32 5} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 8, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!8, !10, !12} +!8 = !{!9, !"cold"} +!9 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!10 = !{!11, !"notcold"} +!11 = !{i64 -3461278137325233666, i64 -3483158674395044949} +!12 = !{!13, !"notcold"} +!13 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!14 = !{i64 -3461278137325233666} +!15 = !{!16, !18, !20} +!16 = !{!17, !"notcold"} +!17 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!18 = !{!19, !"cold"} +!19 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!20 = !{!21, !"notcold"} +!21 = !{i64 -1415475215210681400, i64 -7799663586031895603} +!22 = !{i64 -1415475215210681400} +!23 = !{i64 -2441057035866683071} +!24 = !{i64 -3483158674395044949} +!25 = !{i64 -7799663586031895603} +!26 = !{i64 4256801922104815624} +!27 = !{i64 6438520854747849124} +!28 = !{i64 -8402480891374135967} + + +;; Originally we create a single clone of each call to new from E, since each +;; allocates cold memory for a single caller. 
+ +; DUMP: CCG after cloning: +; DUMP: Callsite Context Graph: +; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 2 3 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]] + +; DUMP: Node [[C]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 2 5 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2 +; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: CallerEdges: + +; DUMP: Node [[B]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 3 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3 +; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW1CLONE]] +; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 1 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1 +; DUMP: Clone of [[ENEW1ORIG]] + +; DUMP: Node [[D]] +; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0) +; DUMP: AllocTypes: NotColdCold +; DUMP: ContextIds: 6 1 +; DUMP: CalleeEdges: +; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: CallerEdges: + +; DUMP: Node [[ENEW2ORIG]] +; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: NotCold +; DUMP: ContextIds: 4 6 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4 +; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6 +; DUMP: Clones: [[ENEW2CLONE]] + +; DUMP: Node [[ENEW2CLONE]] +; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0) +; DUMP: AllocTypes: Cold +; DUMP: ContextIds: 5 +; DUMP: CalleeEdges: +; DUMP: CallerEdges: +; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5 +; DUMP: Clone of [[ENEW2ORIG]] + + +;; We greedily create a clone of E that is initially used by the clones of the +;; first call to new. However, we end up with an incompatible set of callers +;; given the second call to new which has clones with a different combination of +;; callers. Eventually, we create 2 more clones, and the first clone becomes dead. 
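+;; The final assignment, which the remarks and IR below verify, is:
+;;
+;;   B -> _Z1EPPcS0_         (notcold, notcold)
+;;   C -> _Z1EPPcS0_.pgho.3  (notcold, cold)
+;;   D -> _Z1EPPcS0_.pgho.2  (cold, notcold)
+;;
+;; _Z1EPPcS0_.pgho.1 is left with no callers, so its two calls to new keep
+;; their default attributes and the clone can later be removed as dead.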
+; REMARKS: created clone _Z1EPPcS0_.pgho.1
+; REMARKS: created clone _Z1EPPcS0_.pgho.2
+; REMARKS: created clone _Z1EPPcS0_.pgho.3
+; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.2
+; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.pgho.3
+; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.pgho.2 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.pgho.3 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+
+
+;; The original version of E is used for the contexts in which both allocations
+;; are non-cold, i.e. those reached from B.
+; IR: define internal {{.*}} @_Z1EPPcS0_(
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1BPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_(
+;; C calls a clone of E with the first new allocating non-cold memory and the
+;; second new allocating cold memory.
+; IR: define internal {{.*}} @_Z1CPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.pgho.3(
+;; D calls a clone of E with the first new allocating cold memory and the
+;; second new allocating non-cold memory.
+; IR: define internal {{.*}} @_Z1DPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.pgho.2(
+;; Transient clone that will get removed, since it ends up with no callers.
+;; Its calls to new never get updated with a memprof attribute as a result,
+;; so they keep the default attributes.
+; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.1(
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]]
+; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.2(
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.pgho.3(
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: %call1 = call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
+; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" }
+; IR: attributes #[[DEFAULT]] = { builtin allocsize(0) }
+; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" }
+
+
+; STATS: 2 pgho-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 pgho-context-disambiguation - Number of function clones created during whole program analysis
diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll
--- a/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll
+++ b/llvm/test/Transforms/PGHOContextDisambiguation/indirectcall.ll
@@ -1,7 +1,7 @@
 ;; Tests callsite context graph generation for call graph containing indirect
 ;; calls. Currently this should result in conservative behavior, such that the
 ;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
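+;;
+;; For intuition, the conservative handling kicks in for virtual dispatch like
+;; the hypothetical sketch below, inferred from the mangled names in the
+;; checks (the test's full source listing is elided by the diff context):
+;;
+;; struct A { virtual char *x(); };
+;; struct B : public A { char *x() override; };
+;; char *foo() { return new char[10]; }
+;; char *A::x() { return foo(); }
+;; char *B::x() { return foo(); }
+;; char *bar(A *a) {
+;;   // Indirect call: the summary cannot name a single callee, so the graph
+;;   // node gets a null call and no cloning is attempted through it.
+;;   return a->x();
+;; }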
 ;;
 ;; Original code looks like:
 ;;
@@ -52,7 +52,9 @@
 ; RUN: opt -passes=pgho-context-disambiguation \
 ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \
 ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=pgho-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 
 ;; We should only create a single clone of foo, for the direct call
@@ -511,6 +513,43 @@
 ; DUMP: CallerEdges:
+
+; REMARKS: created clone _Z3foov.pgho.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1
+; REMARKS: call in clone _Z3foov.pgho.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold
+
+
+; IR: define internal {{.*}} @_Z3barP1A(
+; IR: %call = call {{.*}} %1(
+; IR: define {{.*}} @main(
+; IR: %call = call {{.*}} @_Z3foov()
+;; Only the second call to foo, which allocates cold memory via direct calls,
+;; is replaced with a call to a clone that calls a cold allocation.
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1()
+; IR: %call2 = call {{.*}} @_Z3barP1A(
+; IR: %call3 = call {{.*}} @_Z3barP1A(
+; IR: %call4 = call {{.*}} @_Z3barP1A(
+; IR: %call5 = call {{.*}} @_Z3barP1A(
+; IR: define internal {{.*}} @_ZN1A1xEv(
+; IR: %call = call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_ZN1B1xEv(
+; IR: %call = call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_Z3foov()
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.pgho.1()
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" }
+
+
+; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 pgho-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph CallsiteContextGraph {
 ; DOT: N[[FOO:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc0\n_Z3foov -\> _Znam",tooltip="N[[FOO]] ContextIds: 2 4 6 1 3 5",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold
 ; DOT: N[[MAIN1:0x[a-z0-9]+]] [shape="record",label="OrigId: 15025054523792398438\nmain -\> _Z3foov",tooltip="N[[MAIN1]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled"]; // callsite, cold
diff --git a/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll b/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll
--- a/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll
+++ b/llvm/test/Transforms/PGHOContextDisambiguation/inlined.ll
@@ -1,6 +1,7 @@
 ;; Test callsite context graph generation for call graph with two memprof
 ;; contexts and partial inlining, requiring generation of a new fused node to
 ;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
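+;;
+;; For intuition, the partial-inlining shape being tested is roughly the
+;; following hypothetical sketch, inferred from the mangled names and the
+;; %call.i value names in the checks (the test's full source listing is
+;; elided by the diff context):
+;;
+;; char *bar() { return new char[10]; }
+;; char *baz() { return bar(); }  // bar() also gets inlined into baz()
+;; char *foo() { return baz(); }  // baz() gets inlined into foo(), leaving a
+;;                                // residual direct call from foo() to bar()
+;; int main(int argc, char **argv) {
+;;   char *x = foo(); // this context ends up not cold
+;;   char *y = foo(); // this context ends up cold
+;;   ...
+;; }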
 ;;
 ;; Original code looks like:
 ;;
@@ -41,7 +42,9 @@
 ; RUN: opt -passes=pgho-context-disambiguation \
 ; RUN: -pgho-verify-ccg -pgho-verify-nodes -pgho-dump-ccg \
 ; RUN: -pgho-export-to-dot -pgho-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=pgho-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
 ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 
 ;; We should create clones for foo and bar for the call from main to allocate
@@ -291,6 +294,42 @@
 ; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4
+
+; REMARKS: created clone _Z3barv.pgho.1
+; REMARKS: created clone _Z3foov.pgho.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.pgho.1
+; REMARKS: call in clone _Z3foov.pgho.1 assigned to call function clone _Z3barv.pgho.1
+; REMARKS: call in clone _Z3barv.pgho.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold
+
+
+; IR: define internal {{.*}} @_Z3barv()
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov()
+; IR: %call.i = call {{.*}} @_Z3barv()
+; IR: define dso_local {{.*}} @main(i32 noundef %argc, ptr noundef %argv)
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately reach the original allocation decorated
+;; with a "notcold" attribute.
+; IR: %call = call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call the cloned
+;; functions, which ultimately reach a cloned allocation decorated with a
+;; "cold" attribute.
+; IR: %call1 = call {{.*}} @_Z3foov.pgho.1()
+; IR: define internal {{.*}} @_Z3barv.pgho.1()
+; IR: %call = call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.pgho.1()
+; IR: %call.i = call {{.*}} @_Z3barv.pgho.1()
+; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin allocsize(0) "memprof"="cold" }
+
+
+; STATS: 1 pgho-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 pgho-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 pgho-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph CallsiteContextGraph {
 ; DOT: N[[BAZ:0x[a-z0-9]+]] [shape="record",label="OrigId: Alloc2\n_Z3bazv -\> _Znam",tooltip="N[[BAZ]] ContextIds: 4 3",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold
 ; DOT: N[[FOO2:0x[a-z0-9]+]] [shape="record",label="OrigId: 2732490490862098848\nnull call (external)",tooltip="N[[FOO2]] ContextIds: 4 3",fillcolor="mediumorchid1",style="filled",style="filled"]; // callsite, default|cold