diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1108,6 +1108,85 @@ return CallGraphRoot; } + bool hasSimilarFunction(unsigned id) { return SimilarFunctions.count(id); } + + void addToSimilarFunctions(unsigned id, GlobalValue::GUID GUID) { + if (id == 0) + return; // invalid id. + if (DuplicateFunctions.count(GUID)) + return; + ValueInfo VI = getValueInfo(GUID); + if (!VI) + return; + // assert(VI.getSummaryList().size() == 1); + if (VI.getSummaryList().size() != 1) + return; + GlobalValueSummary *S = VI.getSummaryList()[0].get(); + if (!isa<FunctionSummary>(S)) + return; + assert(isa<FunctionSummary>(S) && "Not a function summary!"); + if (FunctionSimilarityHashes.count(GUID)) { + // Erase the GUID having multiple visits in the ModuleSummaryIndex. + FunctionSimilarityHashes.erase(GUID); + DuplicateFunctions.insert(GUID); + return; + } + + FunctionSimilarityHashes[GUID] = id; + } + + void populateReverseSimilarityHashMap() { + for (auto &p : FunctionSimilarityHashes) + SimilarFunctions[p.second].push_back(p.first); + } + + void removeSingleEntriesFromSimHashMaps() { + // Iterate over the hash to remove entries with no duplicates. 
+ for (auto I = SimilarFunctions.begin(), E = SimilarFunctions.end(); + I != E;) { + auto Next = std::next(I); + assert(I->second.size() && "Empty Entry!"); + if (I->second.size() == 1) { + FunctionSimilarityHashes.erase(I->second[0]); + SimilarFunctions.erase(I); + } + I = Next; + } + } + + std::map> &getSimilarFunctions() { + return SimilarFunctions; + } + + const std::map> & + getSimilarFunctions() const { + return SimilarFunctions; + } + + unsigned getSimilarityHash(GlobalValue::GUID ID) const { + return FunctionSimilarityHashes.find(ID)->second; + } + + std::map &getSimilarFunctionsHash() { + return FunctionSimilarityHashes; + } + + const std::map &getSimilarFunctionsHash() const { + return FunctionSimilarityHashes; + } + + void addToHostSimilarFunction(GlobalValue::GUID ID) { + HostSimilarFunction.insert(ID); + } + + std::set &getHostSimilarFunction() { + return HostSimilarFunction; + } + + const std::set &getHostSimilarFunction() const { + return HostSimilarFunction; + } + bool withGlobalValueDeadStripping() const { return WithGlobalValueDeadStripping; } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -292,6 +292,8 @@ void initializeMemorySSAWrapperPassPass(PassRegistry&); void initializeMemorySanitizerLegacyPassPass(PassRegistry&); void initializeMergeFunctionsLegacyPassPass(PassRegistry&); +void initializeMergeFunctionsPass(PassRegistry&); +void initializeMergeSimilarFunctionsPass(PassRegistry&); void initializeMergeICmpsLegacyPassPass(PassRegistry &); void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&); void initializeMetaRenamerPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -197,6 +197,7 @@ (void) llvm::createPostOrderFunctionAttrsLegacyPass(); (void) 
llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); + (void) llvm::createMergeSimilarFunctionsPass(); (void) llvm::createMergeICmpsLegacyPass(); (void) llvm::createExpandMemCmpPass(); std::string buf; diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -215,6 +215,13 @@ /// function(s). ModulePass *createHotColdSplittingPass(); +//===----------------------------------------------------------------------===// +/// createMergeSimilarFunctionsPass - This pass discovers similar functions and +/// merges them. +/// +ModulePass * +createMergeSimilarFunctionsPass(const ModuleSummaryIndex *S = nullptr); + //===----------------------------------------------------------------------===// /// createPartialInliningPass - This pass inlines parts of functions. /// diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -126,6 +126,20 @@ Function *CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo = nullptr); +/// Used to control @fn CloneFunctionInto. +enum class CloneType { + InvalidCloneType, + /// Cloning will result in module level changes. + ModuleLevelChanges, + /// !ModuleLevelChanges, When no module level changes will be made to the + /// cloned function. + NoModuleLevelChanges, + /// Cloning will be used for extracting functions by passes like function + /// merging, it does not require module level changes but debug info needs + /// special treatment like: DISubprogram is not cloned. + ExtractingFunctions, +}; + /// Clone OldFunc into NewFunc, transforming the old arguments into references /// to VMap values. Note that if NewFunc already has basic blocks, the ones /// cloned into it will be added to the end of the function. 
This function @@ -136,7 +150,7 @@ /// mappings. /// void CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, bool ModuleLevelChanges, + ValueToValueMapTy &VMap, CloneType CT, SmallVectorImpl<ReturnInst *> &Returns, const char *NameSuffix = "", ClonedCodeInfo *CodeInfo = nullptr, diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -76,6 +76,89 @@ cl::value_desc("filename"), cl::desc("File to emit dot graph of new summary into.")); +cl::opt<bool> UseGlobalAliases( + "mergesimilarfunc-global-aliases", cl::Hidden, cl::init(false), + cl::desc("Enable writing alias by enabling global aliases")); + +cl::opt<unsigned> MergeMinInsts( + "mergesimilarfunc-min-insts", cl::Hidden, cl::init(4), + cl::desc("Min instructions required to even consider single block fns")); + +// Minimize the name pollution caused by the enum values. +namespace Opt { +cl::opt<MergeLevelEnum> MergeLevel( + "mergesimilarfunc-level", cl::Hidden, cl::ZeroOrMore, + cl::desc("Level of function merging:"), cl::init(size), + cl::values(clEnumVal(none, "function merging disabled"), + clEnumVal(size, "only try to merge functions that are optimized " + "for size"), + clEnumVal(all, "attempt to merge all similar functions"))); +} + +namespace llvm { + +static const char *MERGED_SUFFIX = "__merged"; + +/// Returns the type id for a type to be hashed. We turn pointer types into +/// integers here because the actual compare logic below considers pointers and +/// integers of the same size as equal. 
+static Type::TypeID getTypeIDForHash(Type *Ty) { + if (Ty->isPointerTy()) + return Type::IntegerTyID; + return Ty->getTypeID(); +} + +bool isAliasCapable(const Function* G) { + return + UseGlobalAliases && G->hasGlobalUnnamedAddr() + && (G->hasExternalLinkage() || G->hasLocalLinkage() || G->hasWeakLinkage()); +} + +bool isComparisonCandidate(const Function *F) { + if (Opt::MergeLevel == Opt::size) { + // Only consider functions that are to be optimized for size. + // By default, that is all functions at -Os/-Oz and nothing at -O2. + bool Os = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::OptimizeForSize); + bool Oz = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + if (!Os && !Oz) + return false; + } + + // Ignore declarations and tiny functions - no point in merging those + if (F->isDeclaration()) return false; + if (F->getName().endswith(MERGED_SUFFIX)) return false; + if (F->hasAvailableExternallyLinkage()) return false; + if (F->hasFnAttribute(Attribute::AlwaysInline)) return false; + if (F->size() == 1 && F->begin()->size() < MergeMinInsts) + return isAliasCapable(F); + + return true; +} + +unsigned profileFunction(const Function *F) { + FunctionType *FTy = F->getFunctionType(); + if (!isComparisonCandidate(F)) + return 0; + if (F->hasGC() || FTy->isVarArg() || !F->hasExactDefinition()) + return 0; + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + // Add pure attribute, has side-effects attribute. 
+ ID.AddBoolean(F->hasFnAttribute(Attribute::NoUnwind)); + ID.AddBoolean(F->hasFnAttribute(Attribute::NoReturn)); + //ID.AddBoolean(F->hasGC()); + //ID.AddBoolean(F->isInterposable()); + //ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); + return ID.ComputeHash(); +} +} + // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. Invoked either on an // Instruction or a GlobalVariable (which walks its initializer). diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -318,7 +318,7 @@ "modules."); SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns, + CloneFunctionInto(NewF, &OrigF, VMap, CloneType::ModuleLevelChanges, Returns, "", nullptr, nullptr, Materializer); OrigF.deleteBody(); } diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp --- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -316,7 +316,7 @@ } } SmallVector Returns; - CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + CloneFunctionInto(NewF, F, VMap, CloneType::NoModuleLevelChanges, Returns); // Build new MDNode. 
SmallVector KernelMDArgs; diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -667,7 +667,7 @@ auto savedLinkage = NewF->getLinkage(); NewF->setLinkage(llvm::GlobalValue::ExternalLinkage); - CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(NewF, &OrigF, VMap, CloneType::ModuleLevelChanges, Returns); NewF->setLinkage(savedLinkage); NewF->setVisibility(savedVisibility); diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -27,6 +27,7 @@ LowerTypeTests.cpp MergeFunctions.cpp OpenMPOpt.cpp + MergeSimilarFunctions.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -45,6 +45,8 @@ initializeSingleLoopExtractorPass(Registry); initializeLowerTypeTestsPass(Registry); initializeMergeFunctionsLegacyPassPass(Registry); + initializeMergeFunctionsPass(Registry); + initializeMergeSimilarFunctionsPass(Registry); initializePartialInlinerLegacyPassPass(Registry); initializeAttributorLegacyPassPass(Registry); initializeAttributorCGSCCLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp b/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/MergeSimilarFunctions.cpp @@ -0,0 +1,2197 @@ +//===- MergeSimilarFunctions.cpp - Merge similar functions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass merges both equivalent and similar functions to reduce code size. +// +// For a more detailed explanation of the approach, see: +// Edler von Koch et al. "Exploiting Function Similarity for Code Size +// Reduction", LCTES 2014. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mergesimilarfunc" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <list> +#include <vector> +using namespace llvm; + +STATISTIC(NumFunctionsMerged, "Number of functions merged"); +STATISTIC(NumThunksWritten, "Number of thunks generated"); +STATISTIC(NumAliasesWritten, "Number of aliases generated"); +STATISTIC(NumDoubleWeak, "Number of new functions created"); +STATISTIC(NumMultiMerged, "Number of multi-merged functions"); + +STATISTIC(NumSimilarFunctionsMerged, "Number of similar functions merged"); + +static cl::opt<unsigned> MergeMinInsts( + "mergesimilarfunc-min-insts", cl::Hidden, cl::init(4), + cl::desc("Min instructions required to even consider single block fns")); + +static cl::opt<unsigned> 
MergeDifferingMinInsts( + "mergesimilarfunc-diff-min-insts", cl::Hidden, cl::init(15), + cl::desc("Min instructions required to try merging differing functions")); + +static cl::opt<unsigned> MergeMaxDiffering( + "mergesimilarfunc-max-diff", cl::Hidden, cl::init(8), + cl::desc("Maximum number of differing instructions allowed")); + +static cl::opt<unsigned> MergeMinSimilarity( + "mergesimilarfunc-min-similarity", cl::Hidden, cl::init(70), + cl::desc("Minimum percentage of similar instructions required")); + +static cl::opt<bool> OptPrintMerges("mergesimilarfunc-print-merges", cl::Hidden, + cl::init(false)); + +static cl::opt<bool> UseGlobalAliases( + "mergesimilarfunc-global-aliases", cl::Hidden, cl::init(false), + cl::desc("Enable writing alias by enabling global aliases")); + +void PrintMerges(const char *Desc, Function *Old, Function *New) { + if (OptPrintMerges) { + dbgs() << "=== [" << Desc << "] replacing " << Old->getName() << " with " + << New->getName() << "\n"; + } +} + +// Minimize the name pollution caused by the enum values. +namespace Opt { +enum MergeLevelEnum { none, size, all }; +static cl::opt<MergeLevelEnum> MergeLevel( + "mergesimilarfunc-level", cl::Hidden, cl::ZeroOrMore, + cl::desc("Level of function merging:"), cl::init(size), + cl::values(clEnumVal(none, "function merging disabled"), + clEnumVal(size, "only try to merge functions that are optimized " + "for size"), + clEnumVal(all, "attempt to merge all similar functions"))); +} + +static const char *MERGED_SUFFIX = "__merged"; + +/// Returns the type id for a type to be hashed. We turn pointer types into +/// integers here because the actual compare logic below considers pointers and +/// integers of the same size as equal. +static Type::TypeID getTypeIDForHash(Type *Ty) { + if (Ty->isPointerTy()) + return Type::IntegerTyID; + return Ty->getTypeID(); +} + +/// Creates a hash-code for the function which is the same for any two +/// functions that will compare equal, without looking at the instructions +/// inside the function. 
+static unsigned profileFunction(const Function *F) { + FunctionType *FTy = F->getFunctionType(); + + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + ID.AddBoolean(F->hasGC()); + ID.AddBoolean(F->isInterposable()); + ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); + return ID.ComputeHash(); +} + + +/// Replace Inst1 by a switch statement that executes Inst1 or one of Inst2s +/// depending on the value of SwitchVal. If a value in Inst2s is NULL, it +/// defaults to executing Inst1. Returns set of terminator instructions of newly +/// created switch blocks in Ret. +/// +/// For instance, the transformation may look as follows: +/// ...Head... +/// Inst1 with all of Insts2s without parents +/// ...Tail... +/// into +/// ...Head... +/// Switch +/// / | \ . +/// (default) (1) (2) +/// Inst1 Inst2s[0] Inst2s[1] +/// Ret[0] Ret[1] Ret[2] +/// \ | / +/// ...Tail... 
+/// +static void SplitBlockAndInsertSwitch( + Value *SwitchVal, Instruction *Inst1, + SmallVectorImpl &Inst2s, + SmallVectorImpl &Ret) { + // Split block + BasicBlock *Head = Inst1->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(Inst1); + + // Create default block + LLVMContext &C = Head->getContext(); + BasicBlock *DefaultBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + + // Insert switch instruction at end of Head + Instruction *HeadOldTerm = Head->getTerminator(); + SwitchInst *Switch = SwitchInst::Create(SwitchVal, DefaultBlock, + Inst2s.size()); + ReplaceInstWithInst(HeadOldTerm, Switch); + + // Move instructions into the blocks + if (Inst1->isTerminator()) { + Inst1->removeFromParent(); + DefaultBlock->getInstList().push_back(Inst1); + Ret.push_back(cast(Inst1)); + } else { + Instruction *DefaultTerm = BranchInst::Create(Tail, DefaultBlock); + Inst1->moveBefore(DefaultTerm); + Ret.push_back(DefaultTerm); + } + + for (unsigned InstPos = 0, InstNum = Inst2s.size(); InstPos < InstNum; + ++InstPos) { + Instruction *Inst2 = Inst2s[InstPos]; + if (!Inst2) { + Ret.push_back(NULL); + continue; + } + + BasicBlock *CaseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + + // Update the debug information of the merged instruction by marking it as + // 'inlined' at this location. If only Inst1 or Inst2 has debug + // information, we try to do something sensible that won't break the + // verifier. 
+ if (Inst1->getDebugLoc()) { + if (Inst2->getDebugLoc()) { + const DebugLoc &I2Loc = Inst2->getDebugLoc(); + Inst2->setDebugLoc( + DebugLoc::get(I2Loc.getLine(), I2Loc.getCol(), I2Loc.getScope(), + /*InlinedAt*/ Inst1->getDebugLoc().getAsMDNode())); + } else { + Inst2->setDebugLoc(Inst1->getDebugLoc()); + } + } else if (Inst2->getDebugLoc()) { + Inst2->setDebugLoc(DebugLoc()); + } + + if (Inst2->isTerminator()) { + assert(Inst1->isTerminator() && + "Inst1 and Inst2 must both be terminators or non-terminators!"); + CaseBlock->getInstList().push_back(Inst2); + Ret.push_back(cast(Inst2)); + } else { + Instruction *CaseTerm = BranchInst::Create(Tail, CaseBlock); + Inst2->insertBefore(CaseTerm); + Ret.push_back(CaseTerm); + } + + Switch->addCase(ConstantInt::get(cast(SwitchVal->getType()), + InstPos+1), + CaseBlock); + } + + // If Inst1 (and Inst2s) are Terminator Inst's, Tail will be empty and can be + // deleted now. We also need to update PHI nodes to add the additional + // incoming blocks from the SwitchInst. + if (Inst1->isTerminator()) { + for (succ_iterator I = succ_begin(DefaultBlock), E = succ_end(DefaultBlock); + I != E; ++I) { + BasicBlock *Successor = *I; + PHINode *Phi; + + for (BasicBlock::iterator II = Successor->begin(); + (Phi = dyn_cast(II)); ++II) + for (unsigned ValId = 0, ValEnd = Phi->getNumIncomingValues(); + ValId != ValEnd; ++ValId) + if (Phi->getIncomingBlock(ValId) == Tail) { + Phi->setIncomingBlock(ValId, DefaultBlock); + SmallVectorImpl::iterator + SwitchI = Ret.begin(), SwitchE = Ret.end(); + for (++SwitchI; SwitchI != SwitchE; ++SwitchI) { + if (!*SwitchI) + continue; + Phi->addIncoming(Phi->getIncomingValue(ValId), + (*SwitchI)->getParent()); + } + } + } + + Tail->eraseFromParent(); + } +} + +/// Insert function NewF into module, placing it immediately after the +/// existing function PredF. If PredF does not exist, insert at the end. 
+static void insertFunctionAfter(Function *NewF, Function *PredF) { + Module *M = PredF->getParent(); + Module::FunctionListType &FList = M->getFunctionList(); + + for (Module::FunctionListType::iterator I = FList.begin(), E = FList.end(); + I != E; ++I) { + if (PredF == &*I) { + FList.insertAfter(I, NewF); + return; + } + } + + // Couldn't find PredF, insert at end + FList.push_back(NewF); +} + +/// Create a cast instruction if needed to cast V to type DstType. We treat +/// pointer and integer types of the same bitwidth as equivalent, so this can be +/// used to cast them to each other where needed. The function returns the Value +/// itself if no cast is needed, or a new CastInst instance inserted before +/// InsertBefore. The integer type equivalent to pointers must be passed as +/// IntPtrType (get it from DataLayout). This is guaranteed to generate no-op +/// casts, otherwise it will assert. +static Value *createCastIfNeeded(Value *V, Type *DstType, + Value *InstrOrBB, Type *IntPtrType) { + if (V->getType() == DstType) + return V; + + BasicBlock *InsertAtEnd = dyn_cast<BasicBlock>(InstrOrBB); + Instruction *InsertBefore = dyn_cast<Instruction>(InstrOrBB); + BasicBlock *InsertBB = InsertAtEnd ? 
InsertAtEnd : InsertBefore->getParent(); + + CastInst *Result; + Type *OrigType = V->getType(); + + if (OrigType->isStructTy()) { + assert(DstType->isStructTy()); + assert(OrigType->getStructNumElements() == DstType->getStructNumElements()); + + IRBuilder<> Builder(InsertBB); + if (InsertBefore) + Builder.SetInsertPoint(InsertBefore); + Value *Result = UndefValue::get(DstType); + for (unsigned int I = 0, E = OrigType->getStructNumElements(); I < E; ++I) { + Value *ExtractedValue + = Builder.CreateExtractValue(V, ArrayRef(I)); + Value *Element = createCastIfNeeded(ExtractedValue, + DstType->getStructElementType(I), + InstrOrBB, IntPtrType); + Result = + Builder.CreateInsertValue(Result, Element, ArrayRef(I)); + } + return Result; + } + assert(!DstType->isStructTy()); + + if (OrigType->isPointerTy() + && (DstType->isIntegerTy() || DstType->isPointerTy())) { + if (InsertBefore) + Result = CastInst::CreatePointerCast(V, DstType, "", InsertBefore); + else + Result = CastInst::CreatePointerCast(V, DstType, "", InsertAtEnd); + } else if (OrigType->isIntegerTy() && DstType->isPointerTy() + && OrigType == IntPtrType) { + // Int -> Ptr + if (InsertBefore) { + Result = CastInst::Create(CastInst::IntToPtr, V, DstType, "", + InsertBefore); + } else { + Result = CastInst::Create(CastInst::IntToPtr, V, DstType, "", + InsertAtEnd); + } + } else { + llvm_unreachable("Can only cast int -> ptr or ptr -> (ptr or int)"); + } + + assert(cast(Result)->isNoopCast(IntPtrType) && + "Cast is not a no-op cast. Potential loss of precision"); + + return Result; +} + +namespace { + +/// ComparableFunction - A struct that pairs together functions with a +/// DataLayout so that we can keep them together as elements in the DenseSet. 
+class ComparableFunction { +public: + ComparableFunction() : Func(0), IsNew(false) { } + + ComparableFunction(const ComparableFunction &that) + : Func(that.Func), IsNew(that.IsNew) { + } + + ComparableFunction(Function *Func) : Func(Func), IsNew(true) { } + + ~ComparableFunction() { } + + ComparableFunction &operator=(const ComparableFunction &that) { + Func = that.Func; + IsNew = that.IsNew; + return *this; + } + + Function *getFunc() const { return Func; } + bool isNew() const { return IsNew; } + + // Drops AssertingVH reference to the function. Outside of debug mode, this + // does nothing. + void release() { + assert(Func && + "Attempted to release function twice, or release empty/tombstone!"); + Func = NULL; + } + + void markCompared() { + IsNew = false; + } +private: + AssertingVH Func; + bool IsNew; +}; + +} + +namespace { + +/// FunctionComparator - Compares two functions to determine whether or not +/// they will generate machine code with the same behaviour. DataLayout is +/// used if available. The comparator always fails conservatively (erring on the +/// side of claiming that two functions are different). +class FunctionComparator { +public: + FunctionComparator(const DataLayout *DL, Function *F1, Function *F2) + : isDifferent(false), isNotMergeable(false), + BasicBlockCount(0), InstructionCount(0), DifferingInstructionsCount(0), + F1(F1), F2(F2), SimilarityMetric(0), DL(DL), ID(CurID++) {} + + ~FunctionComparator() {} + + /// Test whether the two functions have equivalent behaviour. Returns true if + /// they are equal or can be merged, false if not. + bool compare(); + + /// Indicate whether the two functions are an exact match after comparison + bool isExactMatch(); + + /// Indicate whether the two functions candidates for merging after comparison + bool isMergeCandidate(); + + /// Get a similarity metric between the two functions. Higher means more + /// similar. 
+ unsigned getSimilarityMetric() { + if (!SimilarityMetric) + SimilarityMetric = (unsigned)(((float)InstructionCount - + DifferingInstructionsCount)/InstructionCount*10000); + return SimilarityMetric; + } + + Function *getF1() { return F1; } + Function *getF2() { return F2; } + ValueToValueMapTy &getF1toF2Map() { return id_map; } + ValueToValueMapTy &getF2toF1Map() { return seen_values; } + const DataLayout *getDataLayout() { return DL; } + + /// Assign or look up previously assigned numbers for the two values, and + /// return whether the numbers are equal. Numbers are assigned in the order + /// visited. If NoSelfRef is set, F1 and F2 are not assigned to each other + /// (treated as 'equal'). + bool enumerate(const Value *V1, const Value *V2, bool NoSelfRef=false); + + /// Compare two Types, treating all pointer types as equal. + bool isEquivalentType(Type *Ty1, Type *Ty2) const; + + /// Instructions that differ between the two functions (F1's -> F2's inst). + MapVector DifferingInstructions; + + /// Instructions that reference F1/F2 itself (recursive calls etc.) + /// These may need special treatment when merging differing functions. + MapVector SelfRefInstructions; + + /// Return the unique ID for the object. + unsigned getID() { return ID; } + + bool isDifferent; + bool isNotMergeable; + + // Comparison statistics + unsigned BasicBlockCount; + unsigned InstructionCount; + unsigned DifferingInstructionsCount; + +private: + /// Test whether two basic blocks have equivalent behaviour. Returns true if + /// they are equal or can be merged, false if not. PHINodes are not compared + /// in this function, but added to the PHIsFound list for delayed processing. + bool compare(const BasicBlock *BB1, const BasicBlock *BB2, + std::list > *PHIsFound); + + /// Compare pairs of PHI nodes. Returns true if all pairs are equal or can + /// be merged, false if not. 
+ bool comparePHIs( + const std::list > &PHIs); + + /// Compare two Instructions for equivalence, similar to + /// Instruction::isSameOperationAs but with modifications to the type + /// comparison. + bool isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const; + + /// Compare two GEPs for equivalent pointer arithmetic. + bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2); + bool isEquivalentGEP(const GetElementPtrInst *GEP1, + const GetElementPtrInst *GEP2) { + return isEquivalentGEP(cast(GEP1), cast(GEP2)); + } + + // The two functions undergoing comparison. + Function *F1, *F2; + + unsigned SimilarityMetric; + + const DataLayout *DL; + + ValueToValueMapTy id_map; + ValueToValueMapTy seen_values; + + // Maintain a unique ID for each object. + static unsigned CurID; + unsigned ID; +}; + +} + +unsigned FunctionComparator::CurID = 0; + +// Any two pointers in the same address space are equivalent, intptr_t and +// pointers are equivalent. Otherwise, standard type equivalence rules apply. +bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { + if (Ty1 == Ty2) + return true; + if (Ty1->getTypeID() != Ty2->getTypeID()) { + LLVMContext &Ctx = Ty1->getContext(); + if (isa(Ty1) && Ty2 == DL->getIntPtrType(Ctx)) return true; + if (isa(Ty2) && Ty1 == DL->getIntPtrType(Ctx)) return true; + return false; + } + + switch (Ty1->getTypeID()) { + default: + llvm_unreachable("Unknown type!"); + // Fall through in Release mode. + case Type::IntegerTyID: + case Type::VectorTyID: + // Ty1 == Ty2 would have returned true earlier. 
+ return false; + + case Type::VoidTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + case Type::MetadataTyID: + return true; + + case Type::PointerTyID: { + PointerType *PTy1 = cast(Ty1); + PointerType *PTy2 = cast(Ty2); + return PTy1->getAddressSpace() == PTy2->getAddressSpace(); + } + + case Type::StructTyID: { + StructType *STy1 = cast(Ty1); + StructType *STy2 = cast(Ty2); + if (STy1->getNumElements() != STy2->getNumElements()) + return false; + + if (STy1->isPacked() != STy2->isPacked()) + return false; + + for (unsigned i = 0, e = STy1->getNumElements(); i != e; ++i) { + if (!isEquivalentType(STy1->getElementType(i), STy2->getElementType(i))) + return false; + } + return true; + } + + case Type::FunctionTyID: { + FunctionType *FTy1 = cast(Ty1); + FunctionType *FTy2 = cast(Ty2); + if (FTy1->getNumParams() != FTy2->getNumParams() || + FTy1->isVarArg() != FTy2->isVarArg()) + return false; + + if (!isEquivalentType(FTy1->getReturnType(), FTy2->getReturnType())) + return false; + + for (unsigned i = 0, e = FTy1->getNumParams(); i != e; ++i) { + if (!isEquivalentType(FTy1->getParamType(i), FTy2->getParamType(i))) + return false; + } + return true; + } + + case Type::ArrayTyID: { + ArrayType *ATy1 = cast(Ty1); + ArrayType *ATy2 = cast(Ty2); + return ATy1->getNumElements() == ATy2->getNumElements() && + isEquivalentType(ATy1->getElementType(), ATy2->getElementType()); + } + } +} + +// Determine whether the two operations are the same except that pointer-to-A +// and pointer-to-B are equivalent. This should be kept in sync with +// Instruction::isSameOperationAs. +bool FunctionComparator::isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const { + // Differences from Instruction::isSameOperationAs: + // * replace type comparison with calls to isEquivalentType. 
+ // * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top + // * because of the above, we don't test for the tail bit on calls later on + if (I1->getOpcode() != I2->getOpcode() || + I1->getNumOperands() != I2->getNumOperands() || + !isEquivalentType(I1->getType(), I2->getType()) || + !I1->hasSameSubclassOptionalData(I2)) + return false; + + // We have two instructions of identical opcode and #operands. Check to see + // if all operands are the same type + for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i) + if (!isEquivalentType(I1->getOperand(i)->getType(), + I2->getOperand(i)->getType())) + return false; + + // Check special state that is a part of some instructions. + if (const LoadInst *LI = dyn_cast(I1)) { + const LoadInst *LI2 = cast(I2); + return LI->isVolatile() == LI2->isVolatile() && + LI->getAlignment() == LI2->getAlignment() && + LI->getOrdering() == LI2->getOrdering() && + LI->getSyncScopeID() == LI2->getSyncScopeID() && + LI->getMetadata(LLVMContext::MD_range) + == LI2->getMetadata(LLVMContext::MD_range); + } + if (const StoreInst *SI = dyn_cast(I1)) + return SI->isVolatile() == cast(I2)->isVolatile() && + SI->getAlignment() == cast(I2)->getAlignment() && + SI->getOrdering() == cast(I2)->getOrdering() && + SI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + if (const AllocaInst *AI = dyn_cast(I1)) { + if (AI->getArraySize() != cast(I2)->getArraySize() || + AI->getAlignment() != cast(I2)->getAlignment()) + return false; + + // If size is known, I2 can be seen as equivalent to I1 if it allocates + // the same or less memory. 
+ if (DL->getTypeAllocSize(AI->getAllocatedType()) + < DL->getTypeAllocSize(cast(I2)->getAllocatedType())) + return false; + + return true; + } + if (const CmpInst *CI = dyn_cast(I1)) + return CI->getPredicate() == cast(I2)->getPredicate(); + if (const CallInst *CI = dyn_cast(I1)) + return CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == cast(I2)->getAttributes(); + if (const InvokeInst *CI = dyn_cast(I1)) + return CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == cast(I2)->getAttributes(); + if (const InsertValueInst *IVI = dyn_cast(I1)) + return IVI->getIndices() == cast(I2)->getIndices(); + if (const ExtractValueInst *EVI = dyn_cast(I1)) + return EVI->getIndices() == cast(I2)->getIndices(); + if (const FenceInst *FI = dyn_cast(I1)) + return FI->getOrdering() == cast(I2)->getOrdering() && + FI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + if (const AtomicCmpXchgInst *CXI = dyn_cast(I1)) { + const AtomicCmpXchgInst *CXI2 = cast(I2); + return CXI->isVolatile() == CXI2->isVolatile() && + CXI->isWeak() == CXI2->isWeak() && + CXI->getSuccessOrdering() == CXI2->getSuccessOrdering() && + CXI->getFailureOrdering() == CXI2->getFailureOrdering() && + CXI->getSyncScopeID() == CXI2->getSyncScopeID(); + } + if (const AtomicRMWInst *RMWI = dyn_cast(I1)) + return RMWI->getOperation() == cast(I2)->getOperation() && + RMWI->isVolatile() == cast(I2)->isVolatile() && + RMWI->getOrdering() == cast(I2)->getOrdering() && + RMWI->getSyncScopeID() == cast(I2)->getSyncScopeID(); + + return true; +} + +// Determine whether two GEP operations perform the same underlying arithmetic. +bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, + const GEPOperator *GEP2) { + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. 
+ if (GEP1->hasAllConstantIndices() && GEP2->hasAllConstantIndices()) { + SmallVector Indices1(GEP1->idx_begin(), GEP1->idx_end()); + SmallVector Indices2(GEP2->idx_begin(), GEP2->idx_end()); + uint64_t Offset1 = DL->getIndexedOffsetInType(GEP1->getSourceElementType(), + Indices1); + uint64_t Offset2 = DL->getIndexedOffsetInType(GEP2->getSourceElementType(), + Indices2); + return Offset1 == Offset2; + } + + if (GEP1->getPointerOperand()->getType() != + GEP2->getPointerOperand()->getType()) + return false; + + if (GEP1->getNumOperands() != GEP2->getNumOperands()) + return false; + + for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) { + if (!enumerate(GEP1->getOperand(i), GEP2->getOperand(i))) + return false; + } + + return true; +} + +// Compare two values used by the two functions under pair-wise comparison. If +// this is the first time the values are seen, they're added to the mapping so +// that we will detect mismatches on next use. +bool FunctionComparator::enumerate(const Value *V1, const Value *V2, + bool NoSelfRef/*=false*/) { + // Check for function @f1 referring to itself and function @f2 referring to + // itself. For compatibility with llvm's MergeFunctions, disallow referring to + // each other, or both referring to either of them. + if (!NoSelfRef && V1 == F1 && V2 == F2) + return true; + + // FIXME: This is very conservative for now, but keeping this for thinlto. + if (isa(V1) || isa(V2)) + return false; + if (const Constant *C1 = dyn_cast(V1)) { + if (V1 == V2) return true; + const Constant *C2 = dyn_cast(V2); + if (!C2) return false; + // TODO: constant expressions with GEP or references to F1 or F2. + if (C1->isNullValue() && C2->isNullValue() && + isEquivalentType(C1->getType(), C2->getType())) + return true; + // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 + // then they must have equal bit patterns. Aggregate types cannot be + // bitcast. 
+ if (C1->getType()->isAggregateType() || C2->getType()->isAggregateType()) + return false; + return C1->getType()->canLosslesslyBitCastTo(C2->getType()) && + C1 == ConstantExpr::getBitCast(const_cast(C2), C1->getType()); + } + + if (isa(V1) || isa(V2)) + return V1 == V2; + + // Check that V1 maps to V2. If we find a value that V1 maps to then we simply + // check whether it's equal to V2. When there is no mapping then we need to + // ensure that V2 isn't already equivalent to something else. For this + // purpose, we track the V2 values in a set. + + ValueToValueMapTy::iterator I = id_map.find(V1); + if (I != id_map.end()) + return V2 == I->second; + // FIXME: Const casts!!! + if (!seen_values.insert(std::make_pair(V2, const_cast(V1))).second) + return false; + id_map[V1] = const_cast(V2); + return true; +} + +/// Test whether two basic blocks have equivalent behaviour. Returns true if the +/// blocks can be merged, false if they cannot. Differing instructions are +/// recorded in DifferingInstructions. +bool FunctionComparator::compare(const BasicBlock *BB1, const BasicBlock *BB2, + std::list > *PHIsFound) { + BasicBlock::const_iterator F1I, F1E, F2I, F2E; + + for (F1I = BB1->begin(), F1E = BB1->end(), + F2I = BB2->begin(), F2E = BB2->end(); + F1I != F1E && F2I != F2E; ++F1I, ++F2I) { + // Skip debug information + const CallInst *DbgCall; + while (F1I != F1E && (DbgCall = dyn_cast(F1I)) && + DbgCall->getCalledFunction() && + DbgCall->getCalledFunction()->hasName() && + DbgCall->getCalledFunction()->getName().startswith("llvm.dbg.")) + ++F1I; + + while (F2I != F2E && (DbgCall = dyn_cast(F2I)) && + DbgCall->getCalledFunction() && + DbgCall->getCalledFunction()->hasName() && + DbgCall->getCalledFunction()->getName().startswith("llvm.dbg.")) + ++F2I; + + if (F1I == F1E || F2I == F2E) + break; + + // Ok, we're dealing with real instructions. Check a few cases that will + // prevent merging first. 
+ const Instruction *F1In = &*F1I; + const Instruction *F2In = &*F2I; + + // Cannot merge insts that differ in whether they have uses + if (F1In->use_empty() != F2In->use_empty()) { + // TODO: Could implement merging for this case (would need to introduce a + // dummy value in the PHI node etc.) + return false; + } + + // Cannot merge insts whose types are non-equivalent + if (!isEquivalentType(F1In->getType(), F2In->getType())) { + return false; + } + + // TODO: Currently cannot merge InvokeInsts with differing result types + // that have uses. We cannot push up a bitcast into their block after + // them because they are terminators. Would need to insert an + // additional BB. + if (isa(F1In) && !F1In->use_empty() && + F1In->getType() != F2In->getType()) + return false; + + if (!enumerate(F1In, F2In)) + goto differing_instructions; + + if (const GetElementPtrInst *GEP1 = dyn_cast(F1In)) { + const GetElementPtrInst *GEP2 = dyn_cast(F2In); + if (!GEP2) + goto differing_instructions; + + if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand())) + goto differing_instructions; + + if (!isEquivalentGEP(GEP1, GEP2)) + goto differing_instructions; + } else if (const PHINode *Phi1 = dyn_cast(F1In)) { + const PHINode *Phi2 = dyn_cast(F2In); + // We can't currently merge a PHI and non-PHI instruction + if (!Phi2) + return false; + + // We can't currently merge PHI nodes with different numbers of incoming + // values + if (F1In->getNumOperands() != F2In->getNumOperands()) + return false; + + // We need to treat PHI nodes specially. Their incoming values may be in a + // different order even if they are equivalent. We can't compare them + // until we've seen the incoming blocks and know which values are + // equivalent. Therefore postpone PHINode comparison until the end. 
+ PHIsFound->push_back(std::make_pair(Phi1, Phi2)); + } else { + if (!isEquivalentOperation(F1In, F2In)) + goto differing_instructions; + + bool IsCall = isa(F1In); + assert(F1In->getNumOperands() == F2In->getNumOperands()); + for (unsigned i = 0, e = F1In->getNumOperands(); i != e; ++i) { + Value *OpF1 = F1In->getOperand(i); + Value *OpF2 = F2In->getOperand(i); + + // Allow self-reference if this is a call instruction and the last + // operand which is the called function + bool AllowSelfRef = IsCall && (i + 1) == e; + + if (!enumerate(OpF1, OpF2, !AllowSelfRef)) + goto differing_instructions; + + if (!isEquivalentType(OpF1->getType(), OpF2->getType())) + goto differing_instructions; + + if ((OpF1 == F1 && OpF2 == F2) || (OpF1 == F2 && OpF2 == F1)) + SelfRefInstructions[F1In] = F2In; + } + } + + continue; + +differing_instructions: + // Cannot merge functions with differing landing pad instructions yet. They + // would need special treatment which involves updating the corresponding + // invoke instructions. + if (isa(F1In)) + return false; + if (isa(F1In)) + return false; + + DifferingInstructions[F1In] = F2In; + } + + // We cannot currently merge basic blocks with different instruction counts + return F1I == F1E && F2I == F2E; +} + +bool FunctionComparator::comparePHIs( + const std::list > &PHIs) { + if (PHIs.empty()) + return true; + + for (std::list >::const_iterator + I = PHIs.begin(), E = PHIs.end(); I != E; ++I) { + const PHINode *Phi1 = I->first, *Phi2 = I->second; + + for (unsigned ValId = 0, ValNum = Phi1->getNumIncomingValues(); + ValId < ValNum; ++ValId) { + Value *Phi1Val = Phi1->getIncomingValue(ValId); + + // Get corresponding Phi2Val + Value *BBinPhi2Val = getF1toF2Map()[Phi1->getIncomingBlock(ValId)]; + + if (!BBinPhi2Val) + return false; // Currently can't handle differing predecessor blocks + + BasicBlock *BBinPhi2 = cast(BBinPhi2Val); + Value *Phi2Val = Phi2->getIncomingValueForBlock(BBinPhi2); + + // Enumerate the values. 
If the PHI node references the function itself (a + // very rare case), we mark it as different (NoSelfRef). This is only + // necessary for outline merging, not equiv merging. TODO: Make equal + // merging possible with such PHI nodes. + if (!enumerate(Phi1Val, Phi2Val,/*NoSelfRef=*/true)) { + DifferingInstructions[Phi1] = Phi2; + break; + } + } + } + + return true; +} + +// Test whether the two functions have equivalent behaviour. +bool FunctionComparator::compare() { + // We need to recheck everything, but check the things that weren't included + // in the hash first. + if (F1->getAttributes() != F2->getAttributes()) + goto not_mergeable; + + if (F1->hasGC() != F2->hasGC()) + goto not_mergeable; + + if (F1->hasGC() && F1->getGC() != F2->getGC()) + goto not_mergeable; + + if (!F1->getSection().equals(F2->getSection())) + goto not_mergeable; + + if (F1->isVarArg() != F2->isVarArg()) + goto not_mergeable; + + if (F1->isInterposable() != F2->isInterposable()) + goto not_mergeable; + + if (F1->size() != F2->size()) + goto not_mergeable; + + // TODO: if it's internal and only used in direct calls, we could handle + // this case too. + if (F1->getCallingConv() != F2->getCallingConv()) + goto not_mergeable; + + if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType())) + goto not_mergeable; + + assert(F1->arg_size() == F2->arg_size() && + "Identically typed functions have different numbers of args!"); + + // Visit the arguments so that they get enumerated in the order they're + // passed in. + for (Function::const_arg_iterator f1i = F1->arg_begin(), + f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) { + if (!enumerate(&*f1i, &*f2i)) + llvm_unreachable("Arguments repeat!"); + } + + // We do a CFG-ordered walk since the actual ordering of the blocks in the + // linked list is immaterial. Our walk starts at the entry block for both + // functions, then takes each block from each terminator in order. 
As an + // artifact, this also means that unreachable blocks are ignored. + { + SmallVector F1BBs, F2BBs; + SmallSet VisitedBBs; // in terms of F1. + std::list > PHIsFound; + + F1BBs.push_back(&F1->getEntryBlock()); + F2BBs.push_back(&F2->getEntryBlock()); + + VisitedBBs.insert(F1BBs[0]); + while (!F1BBs.empty()) { + const BasicBlock *F1BB = F1BBs.pop_back_val(); + const BasicBlock *F2BB = F2BBs.pop_back_val(); + + // Check for control flow divergence + if (!enumerate(F1BB, F2BB)) + goto not_mergeable; + + const Instruction *F1TI = F1BB->getTerminator(); + const Instruction *F2TI = F2BB->getTerminator(); + + // TODO: Implement merging of blocks with different numbers of + // instructions. + if (F1TI->getNumSuccessors() != F2TI->getNumSuccessors() || + F1BB->size() != F2BB->size()) + goto not_mergeable; + + // The actual instruction-by-instruction comparison + if (!compare(F1BB, F2BB, &PHIsFound)) + goto not_mergeable; + + // FIXME: Count this in compare(F1BB,F2BB) so it doesn't include debug + // instructions. + InstructionCount += std::max(F1BB->size(), F2BB->size()); + + assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors()); + for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(F1TI->getSuccessor(i)).second) + continue; + + F1BBs.push_back(F1TI->getSuccessor(i)); + F2BBs.push_back(F2TI->getSuccessor(i)); + } + } + + BasicBlockCount = VisitedBBs.size(); + + // After we've seen all values and BBs, compare the PHI nodes + if (!comparePHIs(PHIsFound)) + goto not_mergeable; + } + + if (DifferingInstructions.size()) { + // Currently we can't merge vararg functions with differing instructions. + // TODO: Explore whether this is feasible; the difficult bit is the + // additional argument we need to add. 
+ if (F1->isVarArg()) + goto not_mergeable; + + isDifferent = true; + DifferingInstructionsCount += DifferingInstructions.size(); + + LLVM_DEBUG(float Metric = ((float)InstructionCount - DifferingInstructionsCount) + / InstructionCount*100; + dbgs() << "Similar fns: " << F1->getName() << " and " << F2->getName() + << " bbs=" << BasicBlockCount << " insts=" << InstructionCount + << " failed=" << DifferingInstructionsCount << " metric=" + << format("%0.2f", Metric) + << '\n'); + } + + return true; + +not_mergeable: + // Fail: cannot merge the two functions + isNotMergeable = true; + return false; +} + +bool FunctionComparator::isExactMatch() { + return (!isNotMergeable && !isDifferent); +} + +bool FunctionComparator::isMergeCandidate() { + if (isNotMergeable) + return false; + + if (!isDifferent) + return true; + + // Heuristic when to attempt merging + if (InstructionCount > MergeDifferingMinInsts && + DifferingInstructionsCount <= MergeMaxDiffering && + getSimilarityMetric() > MergeMinSimilarity) + return true; + + // Tolerate higher difference with higher similarity. + if (InstructionCount > 100 && + DifferingInstructionsCount <= 60 && + getSimilarityMetric() > 90 ) + return true; + + return false; +} + +namespace { + +struct FunctionComparatorOrdering { + bool operator () (FunctionComparator *LHS, FunctionComparator *RHS) const { + unsigned MetricLHS = LHS->getSimilarityMetric(), + MetricRHS = RHS->getSimilarityMetric(); + + // If the metric is the same, then default to the unique ID. We need + // to use a unique value instead of the object address to ensure + // deterministic ordering. + if (MetricLHS == MetricRHS) + return LHS->getID() > RHS->getID(); + return MetricLHS > MetricRHS; + } +}; + +class MergeRegistry { +public: + typedef MapVector > FnCompareMap; + typedef std::set + FnComparatorSet; + typedef std::map SimilarFnMap; + + ~MergeRegistry(); + + void clear(); + + /// Defer a function for consideration in the next round. 
+ void defer(Function *F); + + /// Return true if we have deferred functions that can be enqueued. + bool haveDeferred() { return !Deferred.empty(); } + + /// Move all the deferred functions into buckets to consider them for merging. + /// Returns number of functions that have been added. + unsigned enqueue(); + + /// Add a candidate for merging + void insertCandidate(FunctionComparator *Comp); + + /// Remove a Function from the FnSet and queue it up for a second sweep of + /// analysis if Reanalyze is set. If it is a candidate for merging, remove it + /// from consideration. + void remove(Function *F, bool Reanalyze=true); + + /// Return the similarity metric of the most similar function to F that is + /// not listed in the Ignore set. + unsigned getMaxSimilarity(Function *F, const DenseSet &Ignore); + + /// The collection of buckets that contain functions that may be similar to + /// each other (same hash value). + FnCompareMap FunctionsToCompare; + + std::list FunctionsToMerge; + SimilarFnMap SimilarFunctions; + +private: + typedef std::vector FnDeferredQueue; + + /// A work queue of functions that may have been modified and should be + /// analyzed again. + FnDeferredQueue Deferred; +}; + +} // end anonymous namespace + +MergeRegistry::~MergeRegistry() { + this->clear(); +} + +void MergeRegistry::clear() { + Deferred.clear(); + SimilarFunctions.clear(); + for (std::list::iterator + I = FunctionsToMerge.begin(), E = FunctionsToMerge.end(); + I != E; ++I) { + FunctionComparator *FC = *I; + delete FC; + } + FunctionsToMerge.clear(); + FunctionsToCompare.clear(); +} + +static bool isAliasCapable(Function* G) { + return + UseGlobalAliases && G->hasGlobalUnnamedAddr() + && (G->hasExternalLinkage() || G->hasLocalLinkage() || G->hasWeakLinkage()); +} + +static bool isComparisonCandidate(Function *F) { + if (Opt::MergeLevel == Opt::size) { + // Only consider functions that are to be optimized for size. 
+ // By default, that is all functions at -Os/-Oz and nothing at -O2. + bool Os = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::OptimizeForSize); + bool Oz = F->getAttributes(). + hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + if (!Os && !Oz) + return false; + } + + // Ignore declarations and tiny functions - no point in merging those + if (F->isDeclaration()) return false; + if (F->getName().endswith(MERGED_SUFFIX)) return false; + if (F->hasAvailableExternallyLinkage()) return false; + if (F->hasFnAttribute(Attribute::AlwaysInline)) return false; + if (F->size() == 1 && F->begin()->size() < MergeMinInsts) + return isAliasCapable(F); + + return true; +} + +void MergeRegistry::defer(Function *F) { + if (isComparisonCandidate(F)) + Deferred.push_back(F); +} + +// Move functions from Deferred into buckets. remove() may have been called +// multiple times for the same function, so eliminate duplicates using the +// set. We reverse them because MergeSimilarFunctions::insert inserts at the +// front of each bucket. 
+unsigned MergeRegistry::enqueue() { + DenseSet InsertedFuncs; + + for (std::vector::reverse_iterator + DefI = Deferred.rbegin(), DefE = Deferred.rend(); + DefI != DefE; ++DefI) { + Value *V = *DefI; + Function *F = dyn_cast_or_null(V); + if (!F) continue; + if (InsertedFuncs.find(F) != InsertedFuncs.end()) continue; + if (!isComparisonCandidate(F)) continue; + + unsigned Hash = profileFunction(F); + FunctionsToCompare[Hash].push_front(F); + + InsertedFuncs.insert(F); + } + + Deferred.clear(); + + return InsertedFuncs.size(); +} + +void MergeRegistry::insertCandidate(FunctionComparator *Comp) { + FunctionsToMerge.push_back(Comp); + SimilarFunctions[Comp->getF1()].insert(Comp); +} + +static void removeFromBucket(Function *F, + std::list &Bucket) { + for (std::list::iterator + I = Bucket.begin(), E = Bucket.end(); I != E; ++I) { + if (I->getFunc() == F) { + Bucket.erase(I); + return; + } + } +} + +void MergeRegistry::remove(Function *F, bool Reanalyze/*=true*/) { + // There is no need to remove a function that is not already + // in a bucket. + if (!isComparisonCandidate(F)) + return; + + unsigned Hash = profileFunction(F); + std::list &Bucket = FunctionsToCompare[Hash]; + + removeFromBucket(F, Bucket); + + if (Reanalyze) + Deferred.push_back(F); + + // Check whether we have any existing FunctionComparator objects for this fn. + // If yes, discard them because F has changed. Retry merging for those + // functions by adding them to Deferred. 
+ std::list::iterator I = FunctionsToMerge.begin(); + while (I != FunctionsToMerge.end()) { + FunctionComparator *Comp = *I; + if (Comp->getF1() == F) { + Function *OtherF = Comp->getF2(); + Deferred.push_back(OtherF); + removeFromBucket(OtherF, Bucket); + if (!SimilarFunctions[F].erase(Comp)) + llvm_unreachable("Inconsistent SimilarFunctions set"); + I = FunctionsToMerge.erase(I); + delete Comp; + } else if (Comp->getF2() == F) { + Function *OtherF = Comp->getF1(); + Deferred.push_back(OtherF); + removeFromBucket(OtherF, Bucket); + if (!SimilarFunctions[OtherF].erase(Comp)) + llvm_unreachable("Inconsistent SimilarFunctions set"); + I = FunctionsToMerge.erase(I); + delete Comp; + } else { + ++I; + } + } +} + +unsigned MergeRegistry::getMaxSimilarity(Function *F, + const DenseSet &Ignore) { + FnComparatorSet &Similar = SimilarFunctions[F]; + + for (FnComparatorSet::iterator I = Similar.begin(), E = Similar.end(); + I != E; ++I) { + FunctionComparator *Comp = *I; + if (Ignore.count(Comp->getF2())) + continue; + + return Comp->getSimilarityMetric(); + } + + return 0; +} + +namespace { + +class MergeSimilarFunctions : public ModulePass { +public: + static char ID; + MergeSimilarFunctions(const ModuleSummaryIndex *Summary = nullptr) + : ModulePass(ID) { + initializeMergeSimilarFunctionsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M); + +private: + /// Find the functions that use this Value and remove them from FnSet and + /// queue the functions. + void removeUsers(Value *V); + + /// Replace all direct calls of Old with calls of New. Will bitcast New if + /// necessary to make types match. + void replaceDirectCallers(Function *Old, Function *New); + + /// Process functions in the specified bucket, by either doing equiv merging + /// marking them for diff merging. Returns false if the bucket needs to be + /// re-scanned after an equiv merge. Sets Changed if the module was changed by + /// equiv merge. 
+ bool mergeBucket(std::list &Fns, bool &Changed); + + /// Exhaustively compare all functions in each bucket and do equiv merging + /// where possible. Functions that have already been compared will not be + /// compared again. Returns true if the module was modified. + bool doExhaustiveCompareMerge(); + + /// Merge all the functions marked for diff merging. Returns true if the + /// module was modified. + bool doDiffMerge(); + + /// Merge two equivalent functions. Upon completion, G may be deleted, or may + /// be converted into a thunk. In either case, it should never be visited + /// again. + void mergeTwoFunctions(Function *F, Function *G); + + /// Merge a set of functions with differences. + void outlineAndMergeFunctions(SmallVectorImpl &Fns); + + /// Replace G with a thunk or an alias to F. Deletes G. + void writeThunkOrAlias(Function *F, Function *G); + + /// Replace G with a simple tail call to bitcast(F). Also replace direct uses + /// of G with bitcast(F). Deletes G. + void writeThunk(Function *F, Function *G); + + /// Replace G with a tail call to F with an additional argument. + /// + void writeThunkWithChoice(Function *NewF, Function *OldF, int Choice); + + /// Replace G with an alias to F. Deletes G. + void writeAlias(Function *F, Function *G); + + /// DataLayout for more accurate GEP comparisons. May be NULL. + const DataLayout *DL; + + /// Merge registry. Stores all the information about functions being + /// considered for merging as well as current candidates for merging. 
+ MergeRegistry Registry; + +}; + +} // end anonymous namespace + +char MergeSimilarFunctions::ID = 0; +INITIALIZE_PASS(MergeSimilarFunctions, "mergesimilarfunc", + "Merge Similar Functions", false, false) + +ModulePass * +llvm::createMergeSimilarFunctionsPass(const ModuleSummaryIndex *S) { + return new MergeSimilarFunctions(S); +} + +bool MergeSimilarFunctions::runOnModule(Module &M) { + if (Opt::MergeLevel == Opt::none) + return false; + + bool Changed = false; + + DL = &M.getDataLayout(); + + for (auto &I : M) + Registry.defer(&I); + + do { + unsigned InsertCount = Registry.enqueue(); + + LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n'); + LLVM_DEBUG(dbgs() << "size of worklist: " << InsertCount << '\n'); + (void)InsertCount; + + Changed |= doExhaustiveCompareMerge(); + } while (Registry.haveDeferred()); + + Changed |= doDiffMerge(); + + Registry.clear(); + return Changed; +} + +// Replace direct callers of Old with New. +void MergeSimilarFunctions::replaceDirectCallers(Function *Old, Function *New) { + Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); + for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end(); + UI != UE;) { + Use *U = &*UI; + ++UI; + CallSite CS(U->getUser()); + if (CS && CS.isCallee(U)) { + Registry.remove(CS.getInstruction()->getParent()->getParent()); + U->set(BitcastNew); + } + } +} + +// Replace G with an alias to F if possible, or else a thunk to F. Deletes G. 
+void MergeSimilarFunctions::writeThunkOrAlias(Function *F, Function *G) { + if (isAliasCapable(G)) { + writeAlias(F, G); + return; + } + + writeThunk(F, G); +} + +static void writeThunkBody(Function *Thunk, Function *F, + ConstantInt *Choice, const DataLayout *DL) { + BasicBlock *BB = &Thunk->getEntryBlock(); + IRBuilder<> Builder(BB); + + SmallVector Args; + unsigned i = 0; + FunctionType *FFTy = F->getFunctionType(); + Type *IntPtrTy = DL->getIntPtrType(FFTy->getContext()); + for (auto &AI : Thunk->args()) { + Value *Cast = createCastIfNeeded(&AI, FFTy->getParamType(i), BB, IntPtrTy); + Args.push_back(Cast); + ++i; + } + if (Choice) + Args.push_back(Choice); + + CallInst *CI = Builder.CreateCall(F, Args); + CI->setTailCall(); + CI->setCallingConv(F->getCallingConv()); + CI->setAttributes(F->getAttributes()); + CI->setIsNoInline(); + if (Thunk->getReturnType()->isVoidTy()) { + Builder.CreateRetVoid(); + } else { + Type *RetTy = Thunk->getReturnType(); + if (CI->getType()->isIntegerTy() && RetTy->isPointerTy()) + Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy)); + else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy()) + Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy)); + else { + Value *Cast = createCastIfNeeded(CI, RetTy, BB, IntPtrTy); + Builder.CreateRet(Cast); + } + } +} + +// Replace G with a simple tail call to bitcast(F). Also replace direct uses +// of G with bitcast(F). Deletes G. +void MergeSimilarFunctions::writeThunk(Function *F, Function *G) { + if (!G->isInterposable()) { + // Redirect direct callers of G to F. + replaceDirectCallers(G, F); + } + + // If G was internal then we may have replaced all uses of G with F. If so, + // stop here and delete G. There's no need for a thunk. + if (G->hasLocalLinkage() && G->use_empty()) { + LLVM_DEBUG(dbgs() << "All uses of " << G->getName() << " replaced by " + << F->getName() << ". 
Removing it.\n"); + G->eraseFromParent(); + return; + } + + Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", + G->getParent()); + BasicBlock::Create(F->getContext(), "", NewG); + + writeThunkBody(NewG, F, nullptr, DL); + + NewG->copyAttributesFrom(G); + NewG->takeName(G); + removeUsers(G); + G->replaceAllUsesWith(NewG); + G->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "writeThunk: " << NewG->getName() << " calling " + << F->getName() << '\n'); + ++NumThunksWritten; +} + +void MergeSimilarFunctions::writeThunkWithChoice(Function *NewF, Function *OldF, + int Choice) { + // Deleting the body of a function sets its linkage to External. Save the old + // one here and restore it at the end. + GlobalValue::LinkageTypes OldFLinkage = OldF->getLinkage(); + + // Delete OldF's body + OldF->deleteBody(); + BasicBlock::Create(OldF->getContext(), "", OldF); + + // Insert single BB with tail call + IntegerType *Int32Ty = Type::getInt32Ty(OldF->getContext()); + ConstantInt *ChoiceConst = ConstantInt::get(Int32Ty, Choice); + writeThunkBody(OldF, NewF, ChoiceConst, DL); + OldF->setLinkage(OldFLinkage); +} + +// Replace G with an alias to F and delete G. +void MergeSimilarFunctions::writeAlias(Function *F, Function *G) { + + // Replace all current uses of G in constants with F. This handles virtual + // table and other references. Do this first so that we don't modify thge + // global alias we're about to create. 
+ SmallVector Uses; + for (auto I = G->use_begin(), E = G->use_end(); I != E; ++I) { + Use *U = I.operator->(); + Constant *CV = dyn_cast(U->getUser()); + if (!CV) continue; + Uses.push_back(U); + } + for (auto I = Uses.begin(), E= Uses.end(); I != E; ++I) { + Use *U = *I; + U->set(F); + } + + PointerType *PTy = G->getType(); + auto *GA = GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + G->getLinkage(), "", F); + F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + GA->takeName(G); + GA->setVisibility(G->getVisibility()); + removeUsers(G); + G->replaceAllUsesWith(GA); + G->eraseFromParent(); + + LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n'); + ++NumAliasesWritten; +} + +// Merge two equivalent functions. Upon completion, Function G is deleted. +void MergeSimilarFunctions::mergeTwoFunctions(Function *F, Function *G) { + if (F->isInterposable()) { + assert(G->isInterposable()); + + if (UseGlobalAliases) { + // Make them both thunks to the same internal function. + Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "", + F->getParent()); + H->copyAttributesFrom(F); + H->takeName(F); + removeUsers(F); + F->replaceAllUsesWith(H); + + unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment()); + + writeAlias(F, G); + writeAlias(F, H); + + F->setAlignment(MaxAlignment); + F->setLinkage(GlobalValue::PrivateLinkage); + } else { + // We can't merge them. Instead, pick one and update all direct callers + // to call it and hope that we improve the instruction cache hit rate. 
+ replaceDirectCallers(G, F); + } + + ++NumDoubleWeak; + } else { + writeThunkOrAlias(F, G); + } + + ++NumFunctionsMerged; +} + +static Value *getLastArg(Function *F) { + auto it = F->arg_begin(); + std::advance(it, F->arg_size()-1); + return it; +} + +static void insertCondAndRemapInstructions( + Instruction *F1InstInNewF, const std::vector &F2Insts, + Function *NewF, ValueToValueMapTy &F1toNewF, + const SmallVectorImpl &Comps, + Type *IntPtrTy) { + assert(F2Insts.size() == Comps.size() && + "Mis-match between F2Insts & Comps!"); + + SmallVector F2InstsInNewF; + for (unsigned FnI = 0, FnE = F2Insts.size(); FnI != FnE; ++FnI) { + const Instruction *F2Inst = F2Insts[FnI]; + if (!F2Inst) { + F2InstsInNewF.push_back(NULL); + continue; + } + + Instruction *F2InstInNewF = F2Inst->clone(); + + // Remap F2Inst: F2 values -> F1 values + RemapInstruction(F2InstInNewF, Comps[FnI]->getF2toF1Map(), + RF_NoModuleLevelChanges); + // Remap F2Inst: F1 values -> NewF values + RemapInstruction(F2InstInNewF, F1toNewF, RF_NoModuleLevelChanges); + + F2InstsInNewF.push_back(F2InstInNewF); + } + + SmallVector Terminators; + SplitBlockAndInsertSwitch(getLastArg(NewF), F1InstInNewF, + F2InstsInNewF, Terminators); + + assert(Terminators.size() == F2InstsInNewF.size() + 1 && + "Not enough terminators returned"); + + // F2InstsInNewF are now hooked up to the correct values in NewF. However, + // some of their operands may be pointers/integers so they could potentially + // have the wrong type in NewF (since we treat all pointers and integers of + // same size as equal). Insert casts if needed. 
+ for (unsigned FnI = 0, FnE = F2InstsInNewF.size(); FnI != FnE; ++FnI) { + Instruction *F2InstInNewF = F2InstsInNewF[FnI]; + if (!F2InstInNewF) + continue; + const Instruction *F2Inst = F2Insts[FnI]; + + for (unsigned OpId=0; OpId < F2InstInNewF->getNumOperands(); ++OpId) { + Value *F2NewFOperand = F2InstInNewF->getOperand(OpId); + Value *F2OrigOperand = F2Inst->getOperand(OpId); + + if (F2NewFOperand->getType() != F2OrigOperand->getType()) { + Value *Cast = createCastIfNeeded(F2NewFOperand, + F2OrigOperand->getType(), + F2InstInNewF, + IntPtrTy); + F2InstInNewF->setOperand(OpId, Cast); + } + } + } + + if (ReturnInst *F1Ret = dyn_cast(F1InstInNewF)) { + // If we're handling differing return instructions, we need to ensure that + // they all return the same type. Since we treat pointer types as equal, we + // may need to insert a bitcast. + for (Instruction *F2Inst : F2InstsInNewF) { + if (!F2Inst) + continue; + + // F2Inst must also be a return instruction due to control flow + // isomorphism. + ReturnInst *F2Ret = cast(F2Inst); + + if (F2Ret->getReturnValue()->getType() != + F1Ret->getReturnValue()->getType()) + F2Ret->setOperand(0, + createCastIfNeeded(F2Ret->getReturnValue(), + F1Ret->getReturnValue()->getType(), + F2Ret, IntPtrTy)); + } + } else if (!F1InstInNewF->use_empty()) { + // If the instructions have uses, we need to insert a PHI node. 
+ // + // We treat all pointer types as equal, so we may need to insert + // a bitcast to ensure that all incoming values of the PHI node have the + // same type + Type *F1IType = F1InstInNewF->getType(); + BasicBlock *TailBB = Terminators[0]->getSuccessor(0); + PHINode *Phi = + PHINode::Create(F1IType, F2InstsInNewF.size(), "", &TailBB->front()); + F1InstInNewF->replaceAllUsesWith(Phi); + + Phi->addIncoming(F1InstInNewF, F1InstInNewF->getParent()); + for (unsigned FnI = 0, FnE = F2InstsInNewF.size(); FnI != FnE; ++FnI) { + Instruction *F2InstInNewF = F2InstsInNewF[FnI]; + if (!F2InstInNewF) + continue; + + if (F2InstInNewF->getType() != F1IType) { + assert(!F2InstInNewF->isTerminator() && + "Cannot cast result of terminator instruction"); + + F2InstInNewF = cast( + createCastIfNeeded(F2InstInNewF, + F1IType, + Terminators[FnI+1], + IntPtrTy)); + } + + Phi->addIncoming(F2InstInNewF, F2InstInNewF->getParent()); + } + } +} + +static void mergePHINode(const SmallVectorImpl &Fns, + Function *NewF, + ValueToValueMapTy &VMap, /* F1->FNew */ + const PHINode *F1PhiInst, + std::vector F2PhiInsts) { + PHINode *F1PhiInNewF = dyn_cast(VMap[F1PhiInst]); + assert(F1PhiInNewF && "Cannot find F1Inst in NewF!"); + + // The incoming blocks in any of the F2PhiInsts may be in a different order. + // If this is the case, we have to reorder them. F2PhiInsts is intentionally a + // copy, so we can modify it + SmallVector GCInsts; // so we can delete them later. 
+ for (unsigned FnI = 0, FnE = F2PhiInsts.size(); FnI != FnE; ++FnI) { + const PHINode *F2PhiInst = dyn_cast_or_null(F2PhiInsts[FnI]); + if (!F2PhiInst) + continue; + + for (unsigned I = 0, E = F1PhiInNewF->getNumIncomingValues(); I < E; ++I) { + if (!Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(I), + F2PhiInst->getIncomingBlock(I))) { + // Non-equivalent blocks in the same position - need to reorder PhiInst + PHINode *ReorderedF2PhiInst = PHINode::Create(F2PhiInst->getType(), E); + + for (unsigned II = 0; II < E; ++II) { + Value *BBVal = + Fns[FnI]->getF1toF2Map()[F1PhiInst->getIncomingBlock(II)]; + BasicBlock *BB = cast(BBVal); + Value *Val = F2PhiInst->getIncomingValueForBlock(BB); + ReorderedF2PhiInst->addIncoming(Val, BB); + } + + F2PhiInsts[FnI] = ReorderedF2PhiInst; + GCInsts.push_back(ReorderedF2PhiInst); + break; + } + } + } + + // Now merge the PHI nodes. + for (unsigned i = 0; i < F1PhiInNewF->getNumIncomingValues(); ++i) { + Value *F1InValNewF = F1PhiInNewF->getIncomingValue(i), + *F1InVal = F1PhiInst->getIncomingValue(i); + BasicBlock *F1NewFInBlock = F1PhiInNewF->getIncomingBlock(i); + // If this is a repeat occurrence of the same incoming BasicBlock, we + // will have already dealt with it in a previous iteration. + if (F1PhiInNewF->getBasicBlockIndex(F1PhiInNewF->getIncomingBlock(i)) != + (int)i) + continue; + + Value *NewIncoming = F1InValNewF; + + Instruction *InsertPt = F1NewFInBlock->getTerminator(); + + // Build up a chain of cmps and selects that pick the correct incoming + // value. 
+ for (unsigned FnI = 0, FnE = F2PhiInsts.size(); FnI != FnE; ++FnI) { + if (!F2PhiInsts[FnI]) + continue; + const PHINode *F2PhiInst = cast(F2PhiInsts[FnI]); + Value *F2InVal = F2PhiInst->getIncomingValue(i); + + // If we know these are equivalent, there's no further work to do + if (Fns[FnI]->enumerate(F1InVal, F2InVal,/*NoSelfRef=*/true) && + Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(i), + F2PhiInst->getIncomingBlock(i))) + continue; + + assert(Fns[FnI]->enumerate(F1PhiInst->getIncomingBlock(i), + F2PhiInst->getIncomingBlock(i)) && + "Non-equivalent incoming BBs in PHI."); + + // We have different incoming values from the same block + // Translate F2's incoming value to NewF if needed + Value *F2InValNewF = F2InVal; + if (!isa(F2InVal)) { + Value *V = Fns[FnI]->getF2toF1Map()[F2InVal]; // F2->F1 + F2InValNewF = VMap[V]; // F1->NewF + assert(V && F2InValNewF && "Cannot map F2InVal to NewF"); + } + + // Cast F2InValNewF to the correct type if needed + LLVMContext &Ctx = F1InValNewF->getType()->getContext(); + const DataLayout *FTD = Fns[FnI]->getDataLayout(); + Type *IntPtrTy = FTD ? FTD->getIntPtrType(Ctx) : NULL; + F2InValNewF = createCastIfNeeded(F2InValNewF, F1InValNewF->getType(), + InsertPt, IntPtrTy); + + // Create compare & select + Value *ChoiceArg = getLastArg(NewF); + Value *SelectBit = new ICmpInst(InsertPt, + ICmpInst::ICMP_EQ, + getLastArg(NewF), + ConstantInt::get(ChoiceArg->getType(), + FnI+1)); + + // SelectBit true -> F2InValNewF, SelectBit false -> existing NewIncoming. 
+ NewIncoming = SelectInst::Create(SelectBit, F2InValNewF, NewIncoming, "", + InsertPt); + } + + if (NewIncoming == F1InValNewF) + continue; // no change for this incoming value + + // Replace all occurrences of this incoming value/block by the new + // ones (phi nodes can have repeated arguments) + for (unsigned j=i; j < F1PhiInNewF->getNumIncomingValues(); ++j) { + if (F1PhiInNewF->getIncomingBlock(j) == F1NewFInBlock) { + F1PhiInNewF->setIncomingValue(j, NewIncoming); + } + } + } + + // Garbage-collect the reordered PHI nodes we temporarily created. + for (SmallVectorImpl::iterator I = GCInsts.begin(), + E = GCInsts.end(); I != E; ++I) + delete *I; +} + +static bool rewriteRecursiveCall( + const CallInst *F1I, const CallInst *F2I, CallInst *NewFI, + const Function *F1, const Function *F2, Function *NewF) { + if (!(F1I->getCalledFunction() == F1 && F2I->getCalledFunction() == F2) && + !(F1I->getCalledFunction() == F2 && F2I->getCalledFunction() == F1)) + return false; // not a recursive/mutually recursive call + + // Replace NewFI by recursive call to NewF with additional choice argument + SmallVector Args; + for (unsigned AI = 0, End = NewFI->getNumArgOperands(); AI < End; ++AI) { + Value *Arg = NewFI->getArgOperand(AI); + + // Check if F1 or F2 is one of the arguments (veeery unusual case, don't + // handle it for now). + if (Arg == F1 || Arg == F2) + return false; + + Args.push_back(Arg); + } + + if (F1I->getCalledFunction() == F1 && F2I->getCalledFunction() == F2) { + Args.push_back(getLastArg(NewF)); + } else { + // Need to invert the choice argument + Value *ChoiceArg = getLastArg(NewF); + Constant *One = ConstantInt::get(ChoiceArg->getType(), 1); + Args.push_back(BinaryOperator::Create(Instruction::Xor, ChoiceArg, One, "", + NewFI)); + } + + CallInst *CI = CallInst::Create(NewF, Args); + CI->setCallingConv(NewF->getCallingConv()); + + ReplaceInstWithInst(NewFI, CI); + + return true; +} + +/// Clone F1 into a new function with F1's name + MERGE_SUFFIX. 
Adds an +/// additional i32 argument to the function. +static Function *cloneAndAddArgument(Function *F1, ValueToValueMapTy &VMap) { + LLVMContext &Context = F1->getContext(); + + std::vector ArgTypes; + for (const auto &Arg : F1->args()) + ArgTypes.push_back(Arg.getType()); + ArgTypes.push_back(Type::getInt32Ty(Context)); + + FunctionType *FTy = FunctionType::get(F1->getFunctionType()->getReturnType(), + ArgTypes, + F1->getFunctionType()->isVarArg()); + Function *NewF = Function::Create(FTy, F1->getLinkage(), + F1->getName()+MERGED_SUFFIX); + + insertFunctionAfter(NewF, F1); + + if (F1->hasSection()) + NewF->setSection(F1->getSection()); + + if (F1->getFunctionType()->isVarArg()) + NewF->setCallingConv(CallingConv::C); + else + NewF->setCallingConv(CallingConv::Fast); + + Function::arg_iterator DestI = NewF->arg_begin(); + for (auto &Arg : F1->args()) { + Argument *DestIn = &*DestI; + DestIn->setName(Arg.getName()); // Copy the name over... + VMap[&Arg] = DestIn; // Add mapping to VMap + ++DestI; + } + + // Name the selector argument + (*DestI).setName("__merge_arg"); + + SmallVector Returns; + CloneFunctionInto(NewF, F1, VMap, CloneType::ExtractingFunctions, Returns); + // Set linkage to set visibility to default. + NewF->setLinkage(GlobalValue::InternalLinkage); + + return NewF; +} + +typedef MapVector > + CombinedDiffMap; + +void MergeSimilarFunctions::outlineAndMergeFunctions( + SmallVectorImpl &Fns) { + assert(!Fns.empty() && "Cannot merge empty set of functions"); + + // All comparator instances in Fns share the same F1 + Function *F1 = Fns.front()->getF1(); + + // Clone F1 into new function with an additional i32 argument + ValueToValueMapTy VMap; // maps F1 values -> NewF values + Function *NewF = cloneAndAddArgument(F1, VMap); + + // Combine all the DifferingInstructions maps in Fns into one single map of + // lists to aid the merging process. + // + // Map F1 instruction -> list of F2 instructions indexed by position in Fns. 
+ CombinedDiffMap AllDifferingInstructions; + for (unsigned I = 0, E = Fns.size(); I != E; ++I) { + FunctionComparator *Comp = Fns[I]; + for (MapVector::iterator + DiffI = Comp->DifferingInstructions.begin(), + DiffE = Comp->DifferingInstructions.end(); + DiffI != DiffE; ++DiffI) { + AllDifferingInstructions[DiffI->first].resize(Fns.size()); + AllDifferingInstructions[DiffI->first][I] = DiffI->second; + } + } + + // Merge differing PHI nodes. We need to handle these first because they could + // be affected later on when we split basic blocks, thus making them + // impossible to merge. + for (CombinedDiffMap::const_iterator I = AllDifferingInstructions.begin(), + E = AllDifferingInstructions.end(); + I != E; ++I) { + const PHINode *F1PhiInst = dyn_cast(I->first); + if (!F1PhiInst) + continue; + + const std::vector &F2PhiInsts = I->second; + + mergePHINode(Fns, NewF, VMap, F1PhiInst, F2PhiInsts); + } + + // Merge recursive calls + // + // TODO: We currently only support this optimization for pairs of functions. + // If more than two functions are merged, we mark the recursive calls as + // DifferingInstructions which causes switch statements to be inserted and + // recursive calls going through thunks. It wouldn't be too hard to implement + // self-recursive calls for multi-merges. *Mutually* recursive calls with + // multi-merges are a little trickier - that's why we leave it for now. + if (Fns.size() == 1) { + FunctionComparator *Comp = Fns.front(); + for (MapVector::const_iterator + I = Comp->SelfRefInstructions.begin(), + E = Comp->SelfRefInstructions.end(); + I != E; ++I) { + const Instruction *F1I = I->first; + if (Comp->DifferingInstructions.count(F1I)) + continue; // Differing in other ways too, so deal with it later. 
+ + // Attempt recursive call rewriting + if (isa(F1I)) { + const CallInst *F1Call = cast(F1I); + const CallInst *F2Call = dyn_cast(I->second); + CallInst *NewFCall = dyn_cast(VMap[F1I]); + + if (F1Call && F2Call && NewFCall && + rewriteRecursiveCall(F1Call, F2Call, NewFCall, + Comp->getF1(), Comp->getF2(), NewF)) + continue; + } + + // Can't rewrite it. Mark as differing and insert conditional later + Comp->DifferingInstructions[F1I] = I->second; + } + } else { + for (unsigned I = 0, E = Fns.size(); I != E; ++I) { + FunctionComparator *Comp = Fns[I]; + for (MapVector::const_iterator + II = Comp->SelfRefInstructions.begin(), + EE = Comp->SelfRefInstructions.end(); + II != EE; ++II) { + const Instruction *F1I = II->first; + if (Comp->DifferingInstructions.count(F1I)) + continue; // Differing in other ways too, so deal with it later. + + AllDifferingInstructions[F1I].resize(Fns.size()); + AllDifferingInstructions[F1I][I] = II->second; + } + } + } + + // For each differing instruction, splice basic block, and insert conditional + LLVMContext &Context = NewF->getContext(); + Type *IntPtrType = DL->getIntPtrType(Context); + for (CombinedDiffMap::const_iterator I = AllDifferingInstructions.begin(), + E = AllDifferingInstructions.end(); + I != E; ++I) { + const Instruction *F1Inst = I->first; + const std::vector &F2Insts = I->second; + + assert(VMap.find(F1Inst) != VMap.end() && + "Cannot find differing inst!"); + Instruction *F1InstInNewF = cast(VMap[F1Inst]); + + if (isa(F1InstInNewF)) + continue; // we already handled these above + + insertCondAndRemapInstructions(F1InstInNewF, F2Insts, + NewF, VMap, Fns, IntPtrType); + } + + // Replace functions with thunks + PrintMerges("FNSM", F1, NewF); + writeThunkWithChoice(NewF, F1, 0); + for (unsigned FnI = 0, FnE = Fns.size(); FnI != FnE; ++FnI) { + Function *F2 = Fns[FnI]->getF2(); + PrintMerges("FNSM", F2, NewF); + writeThunkWithChoice(NewF, F2, FnI + 1); + } + NumSimilarFunctionsMerged += Fns.size() + 1; +} + +// For each 
instruction used by the value, remove() the function that contains +// the instruction. This should happen right before a call to RAUW. +void MergeSimilarFunctions::removeUsers(Value *V) { + std::vector Worklist; + Worklist.push_back(V); + while (!Worklist.empty()) { + Value *V = Worklist.back(); + Worklist.pop_back(); + + for (User *U : V->users()) { + if (Instruction *I = dyn_cast(U)) { + Registry.remove(I->getParent()->getParent()); + } else if (isa(U)) { + // do nothing + } else if (Constant *C = dyn_cast(U)) { + for (User *UU : C->users()) + Worklist.push_back(UU); + } + } + } +} + +bool MergeSimilarFunctions::mergeBucket(std::list &Fns, + bool &Changed) { + for (std::list::iterator FnI = Fns.begin(), + FnE = Fns.end(); FnI != FnE; ++FnI) { + if (!FnI->isNew()) + continue; + + if (!FnI->getFunc()) + continue; + + SmallVector DiffMergeCandidates; + + std::list::iterator Fn2I = FnI; + for (++Fn2I; Fn2I != FnE; ++Fn2I) { + if (!Fn2I->getFunc()) + continue; + + assert(FnI->getFunc() != Fn2I->getFunc() && + "Duplicate function in list!"); + + FunctionComparator *Comp = new FunctionComparator(DL, FnI->getFunc(), + Fn2I->getFunc()); + + if (!Comp->compare() || !Comp->isMergeCandidate()) { + delete Comp; + continue; + } + + // Never thunk a strong function to a weak function. + assert(!FnI->getFunc()->isInterposable() || + Fn2I->getFunc()->isInterposable()); + + if (Comp->isExactMatch()) { + // Equiv merge the two functions. Throw away any diff merge + // candidate we might have found so far. + delete Comp; + + LLVM_DEBUG(dbgs() << "- Equiv merge " << FnI->getFunc()->getName() + << " == " << Fn2I->getFunc()->getName() << '\n'); + + PrintMerges("FNEQ", FnI->getFunc(), Fn2I->getFunc()); + + Function *DeleteF = Fn2I->getFunc(); + Registry.remove(DeleteF, /*reanalyze=*/false); + + mergeTwoFunctions(FnI->getFunc(), DeleteF); + + Changed = true; + + // mergeTwoFunctions may have removed functions from this bucket and + // invalidated the iterators. 
Rescan the whole bucket, continuing + // from the current function (previous ones will have been + // markCompared()) + for (SmallVector::iterator + I = DiffMergeCandidates.begin(), E = DiffMergeCandidates.end(); + I != E; ++I) + delete *I; + + return false; + } else { + DiffMergeCandidates.push_back(Comp); + } + } + + if (!DiffMergeCandidates.empty()) { + // Add to our list of candidates for diff merging + for (SmallVector::iterator + I = DiffMergeCandidates.begin(), E = DiffMergeCandidates.end(); + I != E; ++I) { + Registry.insertCandidate(*I); + } + } + + FnI->markCompared(); + } + + return true; +} + +bool MergeSimilarFunctions::doExhaustiveCompareMerge() { + bool Changed = false; + + // Process buckets with strong functions first. + for (MergeRegistry::FnCompareMap::iterator + BucketsI = Registry.FunctionsToCompare.begin(), + BucketsE = Registry.FunctionsToCompare.end(); + BucketsI != BucketsE; ++BucketsI) { + std::list &Fns = BucketsI->second; + if (Fns.size() < 2 || Fns.front().getFunc()->isInterposable()) + continue; + + LLVM_DEBUG(dbgs() << "Processing strong bucket " << BucketsI->first << " with " + << Fns.size() << " functions\n"); + // Repeatedly scan this bucket, until we find no more functions to equiv + // merge. + while (!mergeBucket(Fns, Changed) && Fns.size() > 1) { + LLVM_DEBUG(dbgs() << "Rescanning bucket.\n"); + } + } + + // Process buckets with weak functions. + for (MergeRegistry::FnCompareMap::iterator + BucketsI = Registry.FunctionsToCompare.begin(), + BucketsE = Registry.FunctionsToCompare.end(); + BucketsI != BucketsE; ++BucketsI) { + std::list &Fns = BucketsI->second; + if (Fns.size() < 2 || !Fns.front().getFunc()->isInterposable()) + continue; + + LLVM_DEBUG(dbgs() << "Processing weak bucket " << BucketsI->first << " with " + << Fns.size() << " functions\n"); + // Repeatedly scan this bucket, until we find no more functions to equiv + // merge. 
+ while (!mergeBucket(Fns, Changed) && Fns.size() > 1) { + LLVM_DEBUG(dbgs() << "Rescanning bucket.\n"); + } + } + + return Changed; +} + +static bool orderComparatorsByMetric(FunctionComparator *Cmp1, + FunctionComparator *Cmp2) { + return (Cmp1->getSimilarityMetric() > Cmp2->getSimilarityMetric()); +} + +bool MergeSimilarFunctions::doDiffMerge() { + if (Registry.FunctionsToMerge.empty()) + return false; + + bool Changed = false; + DenseSet MergedFns; // Functions that have already been merged + Registry.FunctionsToMerge.sort(orderComparatorsByMetric); + + for (std::list::iterator + I = Registry.FunctionsToMerge.begin(), + E = Registry.FunctionsToMerge.end(); + I != E; ++I) { + FunctionComparator *Comp = *I; + Function *F1 = Comp->getF1(); + // Ignore it if we've already merged this fn + if (MergedFns.count(F1) || MergedFns.count(Comp->getF2())) + continue; + + assert(Registry.SimilarFunctions.count(F1) && + "Comparator doesn't exist in SimilarFunctions map"); + + // Look at all functions F that F1 could be merged with. Merge with each F, + // unless there is another function F' that is more similar to F than F1. 
+ MergeRegistry::FnComparatorSet &SimilarFns = Registry.SimilarFunctions[F1]; + SmallVector CurrentMerge; + + for (MergeRegistry::FnComparatorSet::iterator + CandidateI = SimilarFns.begin(), CandidateE = SimilarFns.end(); + CandidateI != CandidateE; ++CandidateI) { + FunctionComparator *Comp2 = *CandidateI; + assert(Comp2->getF1() == F1 && "Inconsistency in SimilarFunctions"); + Function *F2 = Comp2->getF2(); + + // Ignore it if we've already merged this fn + if (MergedFns.count(F2)) + continue; + + // Check whether there is a better merge candidate for F2 + if (Registry.getMaxSimilarity(F2, MergedFns) > + Comp2->getSimilarityMetric()) + continue; + + // Ok, we actually want to merge with F2 + CurrentMerge.push_back(Comp2); + MergedFns.insert(F2); + } + + if (CurrentMerge.empty()) + continue; + + MergedFns.insert(F1); + + NumMultiMerged += CurrentMerge.size(); + + LLVM_DEBUG(dbgs() << "- Multi merge of " << F1->getName() << " with " + << CurrentMerge.size() << " functions.\n"); + + Changed = true; + outlineAndMergeFunctions(CurrentMerge); + } + + return Changed; +} diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -153,6 +153,10 @@ EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, cl::desc("Enable lowering of the matrix intrinsics")); +static cl::opt EnableMergeSimilarFunctions( + "enable-merge-sim-functions", cl::init(false), cl::Hidden, + cl::desc("Enable the Function merging pass (default = on)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -609,6 +613,11 @@ MPM.add(createOpenMPOptLegacyPass()); MPM.add(createPostOrderFunctionAttrsLegacyPass()); + if (EnableMergeSimilarFunctions) { + auto *Summary = (ImportSummary ? 
ImportSummary : ExportSummary); + MPM.add(createMergeSimilarFunctionsPass(Summary)); + } + if (OptLevel > 2) MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args @@ -820,6 +829,9 @@ if (MergeFunctions) MPM.add(createMergeFunctionsPass()); + if (EnableMergeSimilarFunctions) + MPM.add(createMergeSimilarFunctionsPass()); + // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1047,6 +1059,8 @@ // currently it damages debug info. if (MergeFunctions) PM.add(createMergeFunctionsPass()); + if (EnableMergeSimilarFunctions) + PM.add(createMergeSimilarFunctionsPass()); } void PassManagerBuilder::populateThinLTOPassManager( diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -80,10 +80,8 @@ // Clone OldFunc into NewFunc, transforming the old arguments into references to // VMap values. -// void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, - bool ModuleLevelChanges, + ValueToValueMapTy &VMap, CloneType CT, SmallVectorImpl &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, ValueMapTypeRemapper *TypeMapper, @@ -101,12 +99,12 @@ NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); + RemapFlags RF = + (CT == CloneType::ModuleLevelChanges) ? RF_None : RF_NoModuleLevelChanges; // Fix up the personality function that got copied over. if (OldFunc->hasPersonalityFn()) - NewFunc->setPersonalityFn( - MapValue(OldFunc->getPersonalityFn(), VMap, - ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + NewFunc->setPersonalityFn(MapValue(OldFunc->getPersonalityFn(), VMap, RF, + TypeMapper, Materializer)); SmallVector NewArgAttrs(NewFunc->arg_size()); AttributeList OldAttrs = OldFunc->getAttributes(); @@ -123,11 +121,11 @@ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(), OldAttrs.getRetAttributes(), NewArgAttrs)); - bool MustCloneSP = - OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); + bool MustCloneSP = CT != CloneType::ExtractingFunctions && + OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); DISubprogram *SP = OldFunc->getSubprogram(); if (SP) { - assert(!MustCloneSP || ModuleLevelChanges); + assert(!MustCloneSP || CT == CloneType::ModuleLevelChanges); // Add mappings for some DebugInfo nodes that we don't want duplicated // even if they're distinct. auto &MD = VMap.MD(); @@ -144,10 +142,7 @@ OldFunc->getAllMetadata(MDs); for (auto MD : MDs) { NewFunc->addMetadata( - MD.first, - *MapMetadata(MD.second, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + MD.first, *MapMetadata(MD.second, VMap, RF, TypeMapper, Materializer)); } // When we remap instructions, we want to avoid duplicating inlined @@ -207,9 +202,7 @@ BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... for (Instruction &II : *BB) - RemapInstruction(&II, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); + RemapInstruction(&II, VMap, RF, TypeMapper, Materializer); // Register all DICompileUnits of the old parent module in the new parent module auto* OldModule = OldFunc->getParent(); @@ -262,8 +255,9 @@ } SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "", - CodeInfo); + CloneType CT = F->getSubprogram() != nullptr ? 
CloneType::ModuleLevelChanges + : CloneType::InvalidCloneType; + CloneFunctionInto(NewF, F, VMap, CT, Returns, "", CodeInfo); return NewF; } diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp --- a/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -161,7 +161,7 @@ } SmallVector Returns; // Ignore returns cloned. - CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(F, &I, VMap, CloneType::ModuleLevelChanges, Returns); if (I.hasPersonalityFn()) F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap)); diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-alloca.ll @@ -0,0 +1,94 @@ +;RUN: opt -mergesimilarfunc -mergesimilarfunc-level=all -S < %s | FileCheck %s +; +; Test whether mergefunc merges allocas of different sizes correctly +; + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%struct.A = type { i32, i32 } +%struct.B = type { i32, i32, i32 } + +; Function Attrs: nounwind optsize +define void @f1() #0 { +; CHECK-LABEL: @f1__merged( +; CHECK: alloca %struct.A +; CHECK: alloca %struct.B +entry: + %a = alloca %struct.A, align 4 + %0 = bitcast %struct.A* %a to i8* + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 
+ ret void +} + +; Function Attrs: optsize +declare void @externalFun(i8*) #1 + +; Function Attrs: nounwind optsize +define void @f2() #0 { +entry: + %a = alloca %struct.B, align 4 + %0 = bitcast %struct.B* %a to i8* + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + call void @externalFun(i8* %0) #2 + ret void +} + +; Function Attrs: nounwind optsize +define void @f3() #0 { +entry: + %a = alloca i8, align 1 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + call void @externalFun(i8* %a) #2 + ret void +} + +attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" 
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info-2.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -mergesimilarfunc -mergesimilarfunc-diff-min-insts=5 < %s | FileCheck %s +; This used to fail with assertion in CloneFunction +; REQUIRES: asserts +; CHECK-LABEL: @foo__merged( + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios8.0.0" + +%struct.wibble = type { %struct.wibble.0*, %struct.wibble* } +%struct.wibble.0 = type { i64*, i8*, i64*, i64*, %struct.eggs*, %struct.wibble* } +%struct.eggs = type { %struct.wombat*, %struct.eggs* } +%struct.wombat = type { i8*, %struct.blam*, %struct.blam* } +%struct.blam = type { i8*, %struct.blam* } +%struct.snork = type { %struct.bar*, %struct.snork* } +%struct.bar = type { i64*, i8* } + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + +; Function Attrs: minsize nounwind optsize ssp uwtable +define hidden void @foo(%struct.wibble* %arg) #1 align 2 !dbg !4 { +bb: + %tmp = alloca %struct.wibble*, align 8 + %tmp2 = load %struct.wibble*, %struct.wibble** %tmp, align 8, !dbg !12 + %tmp3 = icmp ne %struct.wibble* %tmp2, null, !dbg !13 + br i1 %tmp3, label %bb4, label %bb13, !dbg !14 + +bb4: ; preds = %bb + call void @foo.1() #3, !dbg !26 + unreachable + +bb13: ; preds = %bb + ret void, !dbg !27 +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +declare hidden void @foo.1() #1 align 2 + +; Function Attrs: minsize nounwind optsize ssp uwtable +define void @quux() unnamed_addr #1 align 2 !dbg !28 { +bb: + ret void +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +define hidden void @baz(%struct.snork* 
%arg) #1 align 2 !dbg !30 { +bb: + %tmp = alloca %struct.snork*, align 8 + %tmp2 = load %struct.snork*, %struct.snork** %tmp, align 8, !dbg !31 + %tmp3 = icmp ne %struct.snork* %tmp2, null, !dbg !32 + br i1 %tmp3, label %bb4, label %bb13, !dbg !33 + +bb4: ; preds = %bb + call void @blam() #3, !dbg !42 + unreachable + +bb13: ; preds = %bb + ret void, !dbg !43 +} + +; Function Attrs: minsize nounwind optsize ssp uwtable +declare hidden void @blam() #1 align 2 + +attributes #0 = { argmemonly nounwind } +attributes #1 = { minsize nounwind optsize ssp uwtable } +attributes #2 = { nounwind } +attributes #3 = { minsize optsize } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "(based on LLVM 5.0.0)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "foo.cpp", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = distinct !DISubprogram(name: "delete", scope: !5, file: !5, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!5 = !DIFile(filename: "foo.h", directory: "/") +!6 = !DISubroutineType(types: !7) +!7 = !{} +!9 = !{!"any pointer", !10, i64 0} +!10 = !{!"omnipotent char", !11, i64 0} +!11 = !{!"Simple C++ TBAA"} +!12 = !DILocation(line: 33, column: 11, scope: !4) +!13 = !DILocation(line: 33, column: 16, scope: !4) +!14 = !DILocation(line: 33, column: 5, scope: !4) +!21 = !{!"_ZTSN", !9, i64 0, !9, i64 8} +!23 = !DILocation(line: 37, column: 25, scope: !4) +!24 = !DILocation(line: 37, column: 31, scope: !4) +!25 = !{!21, !9, i64 0} +!26 = !DILocation(line: 37, column: 7, scope: !4) +!27 = !DILocation(line: 41, column: 3, scope: !4) +!28 = distinct !DISubprogram(name: "~destruct", scope: !29, file: !29, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, 
unit: !0) +!29 = !DIFile(filename: "bar.h", directory: "/") +!30 = distinct !DISubprogram(name: "delete", scope: !5, file: !5, line: 31, type: !6, isLocal: false, isDefinition: true, scopeLine: 32, flags: DIFlagPrototyped, isOptimized: true, unit: !0) +!31 = !DILocation(line: 33, column: 11, scope: !30) +!32 = !DILocation(line: 33, column: 16, scope: !30) +!33 = !DILocation(line: 33, column: 5, scope: !30) +!40 = !DILocation(line: 37, column: 25, scope: !30) +!41 = !DILocation(line: 37, column: 31, scope: !30) +!42 = !DILocation(line: 37, column: 7, scope: !30) +!43 = !DILocation(line: 41, column: 3, scope: !30) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-debug-info.ll @@ -0,0 +1,243 @@ +; This used to fail the verifier with the following error: +; "dbg attachment points at wrong subprogram for function" +; RUN: opt -S -mergesimilarfunc < %s | FileCheck %s +; REQUIRES: asserts +; CHECK-LABEL: @bar__merged( +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + +%struct.str_type = type { i8*, i32, i32 } + +; Function Attrs: nounwind optsize +define i32 @bar(i8* %b) #0 !dbg !14 { +entry: + %retval = alloca i32, align 4 + %b.addr = alloca i8*, align 4 + %res = alloca i32, align 4 + %ee = alloca %struct.str_type*, align 4 + %cleanup.dest.slot = alloca i32 + store i8* %b, i8** %b.addr, align 4, !tbaa !31 + call void @llvm.dbg.declare(metadata i8** %b.addr, metadata !18, metadata !35), !dbg !36 + %0 = bitcast i32* %res to i8*, !dbg !37 + call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !37 + call void @llvm.dbg.declare(metadata i32* %res, metadata !19, metadata !35), !dbg !38 + %1 = bitcast %struct.str_type** %ee to i8*, !dbg !39 + call void @llvm.lifetime.start(i64 4, i8* %1) 
#4, !dbg !39 + call void @llvm.dbg.declare(metadata %struct.str_type** %ee, metadata !20, metadata !35), !dbg !40 + %2 = load i8*, i8** %b.addr, align 4, !dbg !41, !tbaa !31 + %3 = bitcast i8* %2 to %struct.str_type*, !dbg !42 + store %struct.str_type* %3, %struct.str_type** %ee, align 4, !dbg !40, !tbaa !31 + %4 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !43, !tbaa !31 + %set = getelementptr inbounds %struct.str_type, %struct.str_type* %4, i32 0, i32 1, !dbg !45 + %5 = load i32, i32* %set, align 4, !dbg !45, !tbaa !46 + %tobool = icmp ne i32 %5, 0, !dbg !43 + br i1 %tobool, label %if.end4, label %if.then, !dbg !49 + +if.then: ; preds = %entry + %6 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !50, !tbaa !31 + %set1 = getelementptr inbounds %struct.str_type, %struct.str_type* %6, i32 0, i32 1, !dbg !52 + store i32 1, i32* %set1, align 4, !dbg !53, !tbaa !46 + %7 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !54, !tbaa !31 + %x = getelementptr inbounds %struct.str_type, %struct.str_type* %7, i32 0, i32 0, !dbg !55 + %8 = load i8*, i8** %x, align 4, !dbg !55, !tbaa !56 + %call = call i32 @foo(i8* %8) #5, !dbg !57 + store i32 %call, i32* %res, align 4, !dbg !58, !tbaa !59 + %9 = load i32, i32* %res, align 4, !dbg !60, !tbaa !59 + %tobool2 = icmp ne i32 %9, 0, !dbg !60 + br i1 %tobool2, label %if.then3, label %if.end, !dbg !62 + +if.then3: ; preds = %if.then + store i32 1, i32* %retval, align 4, !dbg !63 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !63 + +if.end: ; preds = %if.then + br label %if.end4, !dbg !64 + +if.end4: ; preds = %if.end, %entry + store i32 0, i32* %retval, align 4, !dbg !65 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !65 + +cleanup: ; preds = %if.end4, %if.then3 + %10 = bitcast %struct.str_type** %ee to i8*, !dbg !66 + call void @llvm.lifetime.end(i64 4, i8* %10) #4, !dbg !66 + %11 = bitcast i32* %res to i8*, !dbg !66 + call void 
@llvm.lifetime.end(i64 4, i8* %11) #4, !dbg !66 + %12 = load i32, i32* %retval, align 4, !dbg !66 + ret i32 %12, !dbg !66 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #2 + +; Function Attrs: optsize +declare i32 @foo(i8*) #3 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #2 + +; Function Attrs: nounwind optsize +define i32 @bar1(i8* %b) #0 !dbg !21 { +entry: + %retval = alloca i32, align 4 + %b.addr = alloca i8*, align 4 + %res = alloca i32, align 4 + %ee = alloca %struct.str_type*, align 4 + %cleanup.dest.slot = alloca i32 + store i8* %b, i8** %b.addr, align 4, !tbaa !31 + call void @llvm.dbg.declare(metadata i8** %b.addr, metadata !23, metadata !35), !dbg !67 + %0 = bitcast i32* %res to i8*, !dbg !68 + call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !68 + call void @llvm.dbg.declare(metadata i32* %res, metadata !24, metadata !35), !dbg !69 + %1 = bitcast %struct.str_type** %ee to i8*, !dbg !70 + call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !70 + call void @llvm.dbg.declare(metadata %struct.str_type** %ee, metadata !25, metadata !35), !dbg !71 + %2 = load i8*, i8** %b.addr, align 4, !dbg !72, !tbaa !31 + %3 = bitcast i8* %2 to %struct.str_type*, !dbg !73 + store %struct.str_type* %3, %struct.str_type** %ee, align 4, !dbg !71, !tbaa !31 + %4 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !74, !tbaa !31 + %get = getelementptr inbounds %struct.str_type, %struct.str_type* %4, i32 0, i32 2, !dbg !76 + %5 = load i32, i32* %get, align 4, !dbg !76, !tbaa !77 + %tobool = icmp ne i32 %5, 0, !dbg !74 + br i1 %tobool, label %if.end4, label %if.then, !dbg !78 + +if.then: ; preds = %entry + %6 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !79, !tbaa !31 + %get1 = getelementptr inbounds %struct.str_type, %struct.str_type* %6, 
i32 0, i32 2, !dbg !81 + store i32 1, i32* %get1, align 4, !dbg !82, !tbaa !77 + %7 = load %struct.str_type*, %struct.str_type** %ee, align 4, !dbg !83, !tbaa !31 + %x = getelementptr inbounds %struct.str_type, %struct.str_type* %7, i32 0, i32 0, !dbg !84 + %8 = load i8*, i8** %x, align 4, !dbg !84, !tbaa !56 + %call = call i32 @foo(i8* %8) #5, !dbg !85 + store i32 %call, i32* %res, align 4, !dbg !86, !tbaa !59 + %9 = load i32, i32* %res, align 4, !dbg !87, !tbaa !59 + %tobool2 = icmp ne i32 %9, 0, !dbg !87 + br i1 %tobool2, label %if.then3, label %if.end, !dbg !89 + +if.then3: ; preds = %if.then + store i32 1, i32* %retval, align 4, !dbg !90 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !90 + +if.end: ; preds = %if.then + br label %if.end4, !dbg !91 + +if.end4: ; preds = %if.end, %entry + store i32 0, i32* %retval, align 4, !dbg !92 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup, !dbg !92 + +cleanup: ; preds = %if.end4, %if.then3 + %10 = bitcast %struct.str_type** %ee to i8*, !dbg !93 + call void @llvm.lifetime.end(i64 4, i8* %10) #4, !dbg !93 + %11 = bitcast i32* %res to i8*, !dbg !93 + call void @llvm.lifetime.end(i64 4, i8* %11) #4, !dbg !93 + %12 = load i32, i32* %retval, align 4, !dbg !93 + ret i32 %12, !dbg !93 +} + +attributes #0 = { nounwind optsize } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind } +attributes #3 = { optsize } +attributes #4 = { nounwind } +attributes #5 = { optsize } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!26, !27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.9.0)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "test.i", directory: "/local/mnt/") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32, align: 32) +!5 = !DIDerivedType(tag: 
DW_TAG_typedef, name: "str_type", file: !1, line: 8, baseType: !6) +!6 = !DICompositeType(tag: DW_TAG_structure_type, name: "str_type", file: !1, line: 3, size: 96, align: 32, elements: !7) +!7 = !{!8, !10, !12} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !6, file: !1, line: 5, baseType: !9, size: 32, align: 32) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 32, align: 32) +!10 = !DIDerivedType(tag: DW_TAG_member, name: "set", scope: !6, file: !1, line: 6, baseType: !11, size: 32, align: 32, offset: 32) +!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "get", scope: !6, file: !1, line: 7, baseType: !11, size: 32, align: 32, offset: 64) +!14 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 10, type: !15, isLocal: false, isDefinition: true, scopeLine: 10, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{!11, !9} +!17 = !{!18, !19, !20} +!18 = !DILocalVariable(name: "b", arg: 1, scope: !14, file: !1, line: 10, type: !9) +!19 = !DILocalVariable(name: "res", scope: !14, file: !1, line: 11, type: !11) +!20 = !DILocalVariable(name: "ee", scope: !14, file: !1, line: 12, type: !4) +!21 = distinct !DISubprogram(name: "bar1", scope: !1, file: !1, line: 24, type: !15, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !22) +!22 = !{!23, !24, !25} +!23 = !DILocalVariable(name: "b", arg: 1, scope: !21, file: !1, line: 24, type: !9) +!24 = !DILocalVariable(name: "res", scope: !21, file: !1, line: 25, type: !11) +!25 = !DILocalVariable(name: "ee", scope: !21, file: !1, line: 26, type: !4) +!26 = !{i32 2, !"Dwarf Version", i32 4} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.9.0)"} +!31 = !{!32, !32, i64 0} +!32 = !{!"any pointer", !33, i64 0} +!33 = 
!{!"omnipotent char", !34, i64 0} +!34 = !{!"Simple C/C++ TBAA"} +!35 = !DIExpression() +!36 = !DILocation(line: 10, column: 23, scope: !14) +!37 = !DILocation(line: 11, column: 3, scope: !14) +!38 = !DILocation(line: 11, column: 7, scope: !14) +!39 = !DILocation(line: 12, column: 3, scope: !14) +!40 = !DILocation(line: 12, column: 13, scope: !14) +!41 = !DILocation(line: 12, column: 31, scope: !14) +!42 = !DILocation(line: 12, column: 19, scope: !14) +!43 = !DILocation(line: 14, column: 8, scope: !44) +!44 = distinct !DILexicalBlock(scope: !14, file: !1, line: 14, column: 7) +!45 = !DILocation(line: 14, column: 12, scope: !44) +!46 = !{!47, !48, i64 4} +!47 = !{!"str_type", !32, i64 0, !48, i64 4, !48, i64 8} +!48 = !{!"int", !33, i64 0} +!49 = !DILocation(line: 14, column: 7, scope: !14) +!50 = !DILocation(line: 15, column: 5, scope: !51) +!51 = distinct !DILexicalBlock(scope: !44, file: !1, line: 14, column: 18) +!52 = !DILocation(line: 15, column: 9, scope: !51) +!53 = !DILocation(line: 15, column: 13, scope: !51) +!54 = !DILocation(line: 17, column: 16, scope: !51) +!55 = !DILocation(line: 17, column: 20, scope: !51) +!56 = !{!47, !32, i64 0} +!57 = !DILocation(line: 17, column: 11, scope: !51) +!58 = !DILocation(line: 17, column: 9, scope: !51) +!59 = !{!48, !48, i64 0} +!60 = !DILocation(line: 18, column: 9, scope: !61) +!61 = distinct !DILexicalBlock(scope: !51, file: !1, line: 18, column: 9) +!62 = !DILocation(line: 18, column: 9, scope: !51) +!63 = !DILocation(line: 19, column: 7, scope: !61) +!64 = !DILocation(line: 20, column: 3, scope: !51) +!65 = !DILocation(line: 21, column: 3, scope: !14) +!66 = !DILocation(line: 22, column: 1, scope: !14) +!67 = !DILocation(line: 24, column: 24, scope: !21) +!68 = !DILocation(line: 25, column: 3, scope: !21) +!69 = !DILocation(line: 25, column: 7, scope: !21) +!70 = !DILocation(line: 26, column: 3, scope: !21) +!71 = !DILocation(line: 26, column: 13, scope: !21) +!72 = !DILocation(line: 26, column: 31, scope: !21) 
+!73 = !DILocation(line: 26, column: 19, scope: !21) +!74 = !DILocation(line: 28, column: 8, scope: !75) +!75 = distinct !DILexicalBlock(scope: !21, file: !1, line: 28, column: 7) +!76 = !DILocation(line: 28, column: 12, scope: !75) +!77 = !{!47, !48, i64 8} +!78 = !DILocation(line: 28, column: 7, scope: !21) +!79 = !DILocation(line: 29, column: 5, scope: !80) +!80 = distinct !DILexicalBlock(scope: !75, file: !1, line: 28, column: 17) +!81 = !DILocation(line: 29, column: 9, scope: !80) +!82 = !DILocation(line: 29, column: 13, scope: !80) +!83 = !DILocation(line: 31, column: 16, scope: !80) +!84 = !DILocation(line: 31, column: 20, scope: !80) +!85 = !DILocation(line: 31, column: 11, scope: !80) +!86 = !DILocation(line: 31, column: 9, scope: !80) +!87 = !DILocation(line: 32, column: 9, scope: !88) +!88 = distinct !DILexicalBlock(scope: !80, file: !1, line: 32, column: 9) +!89 = !DILocation(line: 32, column: 9, scope: !80) +!90 = !DILocation(line: 33, column: 7, scope: !88) +!91 = !DILocation(line: 34, column: 3, scope: !80) +!92 = !DILocation(line: 35, column: 3, scope: !21) +!93 = !DILocation(line: 36, column: 1, scope: !21) diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent-template.ll @@ -0,0 +1,138 @@ +; RUN: opt -S -mergesimilarfunc %s -o - | FileCheck %s + +; CHECK: define linkonce_odr void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %this, %struct.FooStruct* %arg0) #1 align 2 { +; CHECK-NEXT: entry: +; CHECK-NEXT: %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 +; CHECK-NEXT: %0 = load i8, i8* %field27, align 1 +; CHECK-NEXT: %lnot8 = icmp eq i8 %0, 0 +; CHECK-NEXT: br i1 %lnot8, label %for.body, label %for.end + +; CHECK: for.body: ; preds = %for.body, %entry +; CHECK-NEXT: %arg0.addr.09 
= phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] +; CHECK-NEXT: %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 +; CHECK-NEXT: %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 +; CHECK-NEXT: tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %1) #2 +; CHECK-NEXT: %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 +; CHECK-NEXT: %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 +; CHECK-NEXT: %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* +; CHECK-NEXT: tail call void @_Z4bar0Pv(i8* %3) #3 +; CHECK-NEXT: tail call void @_Z4bar1Pv(i8* %3) #3 +; CHECK-NEXT: %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 +; CHECK-NEXT: %4 = load i8, i8* %field2, align 1 +; CHECK-NEXT: %lnot = icmp eq i8 %4, 0 +; CHECK-NEXT: br i1 %lnot, label %for.body, label %for.end + +; CHECK: for.end: ; preds = %for.body, %entry +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define linkonce_odr void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* nocapture readnone, %struct.FooStruct*) #1 align 2 { +; CHECK-NEXT: %3 = bitcast %class.FooTemplate* %0 to %class.FooTemplate.0* +; CHECK-NEXT: tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %3, %struct.FooStruct* %1) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #1 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" 
"stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #2 = { optsize } +; CHECK-NEXT: attributes #3 = { nounwind optsize } + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%class.Foo = type { %class.FooTemplate, %class.FooTemplate.0 } +%class.FooTemplate = type { i8 } +%class.FooTemplate.0 = type { i8 } +%struct.FooStruct = type { %struct.FooStruct*, %struct.FooStruct*, i8 } + +@_ZN3FooD1Ev = alias void(%class.Foo*), void (%class.Foo*)* @_ZN3FooD2Ev + +declare %struct.FooStruct* @_ZNK11FooTemplateIPvE7method2Ev(%class.FooTemplate.0*) #1 +declare void @_Z4bar0Pv(i8*) #1 +declare void @_Z4bar1Pv(i8*) #1 +declare %struct.FooStruct* @_ZNK11FooTemplateIiE7method2Ev(%class.FooTemplate*) #1 + +; Function Attrs: nounwind optsize ssp +define void @_ZN3FooD2Ev(%class.Foo* %this) unnamed_addr #0 align 2 { +entry: + %bar_things = getelementptr inbounds %class.Foo, %class.Foo* %this, i32 0, i32 0 + tail call void @_ZN11FooTemplateIiE7method0Ev(%class.FooTemplate* %bar_things) #2 + %baz_things = getelementptr inbounds %class.Foo, %class.Foo* %this, i32 0, i32 1 + tail call void @_ZN11FooTemplateIPvE7method0Ev(%class.FooTemplate.0* %baz_things) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIiE7method0Ev(%class.FooTemplate* %this) #0 align 2 { +entry: + %call = tail call %struct.FooStruct* @_ZNK11FooTemplateIiE7method2Ev(%class.FooTemplate* %this) #3 + tail call void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* %this, %struct.FooStruct* %call) #2 + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIPvE7method0Ev(%class.FooTemplate.0* %this) #0 align 2 { +entry: + %call = tail call %struct.FooStruct* @_ZNK11FooTemplateIPvE7method2Ev(%class.FooTemplate.0* %this) #3 + tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %call) #2 + ret 
void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* nocapture readnone %this, %struct.FooStruct* %arg0) #0 align 2 { +entry: + %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 + %0 = load i8, i8* %field27, align 1 + %lnot8 = icmp eq i8 %0, 0 + br i1 %lnot8, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] + %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 1 + %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 + tail call void @_ZN11FooTemplateIPvE7method1EP9FooStruct(%class.FooTemplate.0* %this, %struct.FooStruct* %1) #2 + %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 + %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 + %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* + tail call void @_Z4bar0Pv(i8* %3) #3 + tail call void @_Z4bar1Pv(i8* %3) #3 + %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 + %4 = load i8, i8* %field2, align 1 + %lnot = icmp eq i8 %4, 0 + br i1 %lnot, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +; Function Attrs: nounwind optsize ssp +define linkonce_odr void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* nocapture readnone %this, %struct.FooStruct* %arg0) #0 align 2 { +entry: + %field27 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0, i32 0, i32 2 + %0 = load i8, i8* %field27, align 1 + %lnot8 = icmp eq i8 %0, 0 + br i1 %lnot8, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %arg0.addr.09 = phi %struct.FooStruct* [ %2, %for.body ], [ %arg0, %entry ] + %field1 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, 
i32 0, i32 1 + %1 = load %struct.FooStruct*, %struct.FooStruct** %field1, align 4 + tail call void @_ZN11FooTemplateIiE7method1EP9FooStruct(%class.FooTemplate* %this, %struct.FooStruct* %1) #2 + %field0 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %arg0.addr.09, i32 0, i32 0 + %2 = load %struct.FooStruct*, %struct.FooStruct** %field0, align 4 + %3 = bitcast %struct.FooStruct* %arg0.addr.09 to i8* + tail call void @_Z4bar0Pv(i8* %3) #3 + tail call void @_Z4bar1Pv(i8* %3) #3 + %field2 = getelementptr inbounds %struct.FooStruct, %struct.FooStruct* %2, i32 0, i32 2 + %4 = load i8, i8* %field2, align 1 + %lnot = icmp eq i8 %4, 0 + br i1 %lnot, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { optsize } +attributes #3 = { nounwind optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-equivalent.ll @@ -0,0 +1,226 @@ +; RUN: opt -mergesimilarfunc -S %s -o - | FileCheck %s +; +; CHECK: define i8* @foo_a(%struct.a_type* %arg0) #1 { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %for.cond + +; CHECK: for.cond: ; preds = %for.inc24, %entry +; CHECK-NEXT: %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] +; CHECK-NEXT: %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] +; CHECK-NEXT: %conv = zext i8 %i.0 
to i32 +; CHECK-NEXT: %cmp = icmp slt i32 %conv, 16 +; CHECK-NEXT: br i1 %cmp, label %for.body, label %for.end26 + +; CHECK: for.body: ; preds = %for.cond +; CHECK-NEXT: %call = call i8* @bar4(i32 %conv) #2 +; CHECK-NEXT: %call4 = call i8* @bar0(i32 %conv) #2 +; CHECK-NEXT: %0 = bitcast i8* %call4 to %struct.a_type* +; CHECK-NEXT: %call5 = call i32 @bar1(i8* %call) #2 +; CHECK-NEXT: %tobool = icmp ne i32 %call5, 0 +; CHECK-NEXT: br i1 %tobool, label %for.cond6, label %for.inc24 + +; CHECK: for.cond6: ; preds = %for.inc, %for.body +; CHECK-NEXT: %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] +; CHECK-NEXT: %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] +; CHECK-NEXT: %idxprom = zext i8 %k.0 to i32 +; CHECK-NEXT: %field4 = getelementptr inbounds %struct.a_type, %struct.a_type* %arg0, i32 0, i32 4 +; CHECK-NEXT: %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom +; CHECK-NEXT: %1 = load i8*, i8** %arrayidx, align 4 +; CHECK-NEXT: %cmp7 = icmp ne i8* %1, null +; CHECK-NEXT: br i1 %cmp7, label %land.rhs, label %for.inc24 + +; CHECK: land.rhs: ; preds = %for.cond6 +; CHECK-NEXT: %field410 = getelementptr inbounds %struct.a_type, %struct.a_type* %0, i32 0, i32 4 +; CHECK-NEXT: %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom +; CHECK-NEXT: %2 = load i8*, i8** %arrayidx11, align 4 +; CHECK-NEXT: %cmp12 = icmp ne i8* %2, null +; CHECK-NEXT: br i1 %cmp12, label %for.body14, label %for.inc24 + +; CHECK: for.body14: ; preds = %land.rhs +; CHECK-NEXT: %3 = bitcast %struct.a_type* %0 to i8* +; CHECK-NEXT: %4 = bitcast %struct.a_type* %arg0 to i8* +; CHECK-NEXT: %call15 = call i32 @bar2(i8* %3, i8* %4) #2 +; CHECK-NEXT: %tobool16 = icmp ne i32 %call15, 0 +; CHECK-NEXT: br i1 %tobool16, label %if.then17, label %for.inc + +; CHECK: if.then17: ; preds = %for.body14 +; CHECK-NEXT: %call18 = call i32 @bar3(i8* %3, i8* %4) #2 +; CHECK-NEXT: %tobool19 = icmp ne i32 %call18, 0 +; CHECK-NEXT: br 
i1 %tobool19, label %if.then20, label %for.inc + +; CHECK: if.then20: ; preds = %if.then17 +; CHECK-NEXT: br label %for.inc + +; CHECK: for.inc: ; preds = %if.then20, %if.then17, %for.body14 +; CHECK-NEXT: %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] +; CHECK-NEXT: %inc = add i8 %k.0, 1 +; CHECK-NEXT: br label %for.cond6 + +; CHECK: for.inc24: ; preds = %land.rhs, %for.cond6, %for.body +; CHECK-NEXT: %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] +; CHECK-NEXT: %inc25 = add i8 %i.0, 1 +; CHECK-NEXT: br label %for.cond + +; CHECK: for.end26: ; preds = %for.cond +; CHECK-NEXT: ret i8* %ptr0.0 +; CHECK-NEXT: } + +; CHECK: ; Function Attrs: nounwind optsize ssp +; CHECK-NEXT: define i8* @foo_b(%struct.b_type*) #1 { +; CHECK-NEXT: %2 = bitcast %struct.b_type* %0 to %struct.a_type* +; CHECK-NEXT: %3 = tail call i8* @foo_a(%struct.a_type* %2) +; CHECK-NEXT: ret i8* %3 +; CHECK-NEXT: } + +; CHECK: attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #1 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK-NEXT: attributes #2 = { optsize } + +target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32" + +%struct.a_type = type { i32, i32, i32, i8*, [2 x i8*] } +%struct.b_type = type { i32, i32, i32, i8*, [2 x i8*], i32 } + +; Function Attrs: optsize +declare i8* @bar4(i32) #1 +declare i8* @bar0(i32) #1 +declare i32 @bar1(i8*) #1 +declare i32 @bar2(i8*, i8*) #1 +declare i32 @bar3(i8*, i8*) #1 + +; Function Attrs: nounwind optsize ssp +define i8* 
@foo_a(%struct.a_type* %arg0) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] + %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] + %conv = zext i8 %i.0 to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + %call = call i8* @bar4(i32 %conv) #2 + %call4 = call i8* @bar0(i32 %conv) #2 + %0 = bitcast i8* %call4 to %struct.a_type* + %call5 = call i32 @bar1(i8* %call) #2 + %tobool = icmp ne i32 %call5, 0 + br i1 %tobool, label %for.cond6, label %for.inc24 + +for.cond6: ; preds = %for.body, %for.inc + %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] + %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] + %idxprom = zext i8 %k.0 to i32 + %field4 = getelementptr inbounds %struct.a_type, %struct.a_type* %arg0, i32 0, i32 4 + %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom + %1 = load i8*, i8** %arrayidx, align 4 + %cmp7 = icmp ne i8* %1, null + br i1 %cmp7, label %land.rhs, label %for.inc24 + +land.rhs: ; preds = %for.cond6 + %field410 = getelementptr inbounds %struct.a_type, %struct.a_type* %0, i32 0, i32 4 + %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom + %2 = load i8*, i8** %arrayidx11, align 4 + %cmp12 = icmp ne i8* %2, null + br i1 %cmp12, label %for.body14, label %for.inc24 + +for.body14: ; preds = %land.rhs + %3 = bitcast %struct.a_type* %0 to i8* + %4 = bitcast %struct.a_type* %arg0 to i8* + %call15 = call i32 @bar2(i8* %3, i8* %4) #2 + %tobool16 = icmp ne i32 %call15, 0 + br i1 %tobool16, label %if.then17, label %for.inc + +if.then17: ; preds = %for.body14 + %call18 = call i32 @bar3(i8* %3, i8* %4) #2 + %tobool19 = icmp ne i32 %call18, 0 + br i1 %tobool19, label %if.then20, label %for.inc + +if.then20: ; preds = %if.then17 + br label %for.inc + +for.inc: ; preds = %for.body14, %if.then17, %if.then20 + %ptr0.2 = phi 
i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] + %inc = add i8 %k.0, 1 + br label %for.cond6 + +for.inc24: ; preds = %for.body, %for.cond6, %land.rhs + %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] + %inc25 = add i8 %i.0, 1 + br label %for.cond + +for.end26: ; preds = %for.cond + ret i8* %ptr0.0 +} + +; Function Attrs: nounwind optsize ssp +define i8* @foo_b(%struct.b_type* %arg0) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %i.0 = phi i8 [ 0, %entry ], [ %inc25, %for.inc24 ] + %ptr0.0 = phi i8* [ null, %entry ], [ %ptr0.3, %for.inc24 ] + %conv = zext i8 %i.0 to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + %call = call i8* @bar4(i32 %conv) #2 + %call4 = call i8* @bar0(i32 %conv) #2 + %0 = bitcast i8* %call4 to %struct.b_type* + %call5 = call i32 @bar1(i8* %call) #2 + %tobool = icmp ne i32 %call5, 0 + br i1 %tobool, label %for.cond6, label %for.inc24 + +for.cond6: ; preds = %for.body, %for.inc + %k.0 = phi i8 [ %inc, %for.inc ], [ 0, %for.body ] + %ptr0.1 = phi i8* [ %ptr0.2, %for.inc ], [ %call, %for.body ] + %idxprom = zext i8 %k.0 to i32 + %field4 = getelementptr inbounds %struct.b_type, %struct.b_type* %arg0, i32 0, i32 4 + %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* %field4, i32 0, i32 %idxprom + %1 = load i8*, i8** %arrayidx, align 4 + %cmp7 = icmp ne i8* %1, null + br i1 %cmp7, label %land.rhs, label %for.inc24 + +land.rhs: ; preds = %for.cond6 + %field410 = getelementptr inbounds %struct.b_type, %struct.b_type* %0, i32 0, i32 4 + %arrayidx11 = getelementptr inbounds [2 x i8*], [2 x i8*]* %field410, i32 0, i32 %idxprom + %2 = load i8*, i8** %arrayidx11, align 4 + %cmp12 = icmp ne i8* %2, null + br i1 %cmp12, label %for.body14, label %for.inc24 + +for.body14: ; preds = %land.rhs + %3 = bitcast %struct.b_type* %0 to i8* + %4 = bitcast %struct.b_type* %arg0 to i8* + %call15 = 
call i32 @bar2(i8* %3, i8* %4) #2 + %tobool16 = icmp ne i32 %call15, 0 + br i1 %tobool16, label %if.then17, label %for.inc + +if.then17: ; preds = %for.body14 + %call18 = call i32 @bar3(i8* %3, i8* %4) #2 + %tobool19 = icmp ne i32 %call18, 0 + br i1 %tobool19, label %if.then20, label %for.inc + +if.then20: ; preds = %if.then17 + br label %for.inc + +for.inc: ; preds = %for.body14, %if.then17, %if.then20 + %ptr0.2 = phi i8* [ null, %if.then20 ], [ %ptr0.1, %if.then17 ], [ null, %for.body14 ] + %inc = add i8 %k.0, 1 + br label %for.cond6 + +for.inc24: ; preds = %for.body, %for.cond6, %land.rhs + %ptr0.3 = phi i8* [ %ptr0.1, %land.rhs ], [ %ptr0.1, %for.cond6 ], [ null, %for.body ] + %inc25 = add i8 %i.0, 1 + br label %for.cond + +for.end26: ; preds = %for.cond + ret i8* %ptr0.0 +} + +attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { optsize } + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-noinline.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mergesimilarfunc -mergesimilarfunc-level=all < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +declare void @stuff() + +; CHECK-LABEL: @f0( +define void @f0(i64 %p0) { +entry: + call void @stuff() + call void @stuff() + call void @stuff() + ret 
void +} + +; CHECK-LABEL: @f1( +; CHECK: call void @f0{{.*}} #[[ATTR:[0-9]+]] +; CHECK: attributes #[[ATTR]] = { {{.*}}noinline +define void @f1(i64 %p0) { +entry: + call void @stuff() + call void @stuff() + call void @stuff() + ret void +} + diff --git a/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll b/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/MergeSimilarFunc/merge-ret.ll @@ -0,0 +1,207 @@ +; Check that the ret instructions are merged correctly. A bug caused +; an incorrect merge and a verifier failure for this input. +; +; RUN: opt -S -mergesimilarfunc < %s | FileCheck %s +; +; CHECK-LABEL: define internal %0* @LLVMGetReturnType__merged +; CHECK: phi %0* [ +; CHECK-NEXT: ret %0* +; CHECK-NEXT: } +; + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + +%0 = type opaque +%1 = type { %2*, i32, i32, %1** } +%2 = type { %3* } +%3 = type opaque +%4 = type { %1 } +%5 = type opaque +%6 = type { i32 (...)**, %1*, %7*, i8, i8, i16, i32 } +%7 = type { %6*, %7*, %8 } +%8 = type { i32 } +%9 = type { %10, %1*, i32, %12, i32, i8, %18* } +%10 = type { %11 } +%11 = type { %6 } +%12 = type { %13 } +%13 = type { %14 } +%14 = type { %15 } +%15 = type { %16 } +%16 = type { %17 } +%17 = type { i32, i32, i8* } +%18 = type <{ %2*, %19, %30, %70, %79, %87, %12, %93*, %94, %98, %12, %12, %12, i8*, %102, i8, [3 x i8] }> +%19 = type { %20 } +%20 = type { %21, %26* } +%21 = type { %22 } +%22 = type { %23 } +%23 = type { %24 } +%24 = type { %25, %26* } +%25 = type { %26* } +%26 = type <{ %27, [3 x i8], %24, i8, [3 x i8] }> +%27 = type <{ %9, %12, %28*, %12, %12, i8 }> +%28 = type <{ %29*, i8, [3 x i8] }> +%29 = type opaque +%30 = type { %31 } +%31 = type { %32, %37* } +%32 = type { %33 } +%33 = type { %34 } +%34 = type { %35 } +%35 = type { %36, %37* } +%36 = type { %37* } +%37 = type { %27, %35, 
%38, %60, %93*, %68, %4* } +%38 = type { %39 } +%39 = type { %40, %44* } +%40 = type { %41 } +%41 = type { %42 } +%42 = type { %43 } +%43 = type { %44* } +%44 = type { %6, %45, i32, %47, %37* } +%45 = type { %46 } +%46 = type { %43, %44* } +%47 = type { %48 } +%48 = type { %49, %53* } +%49 = type { %50 } +%50 = type { %51 } +%51 = type { %52 } +%52 = type { %53* } +%53 = type { %11, %54, %44*, %56 } +%54 = type { %55 } +%55 = type { %52, %53* } +%56 = type { %57 } +%57 = type { %58 } +%58 = type { %59* } +%59 = type { i8, i8, i16, i32 } +%60 = type { %61 } +%61 = type { %62, %66* } +%62 = type { %63 } +%63 = type { %64 } +%64 = type { %65 } +%65 = type { %66* } +%66 = type { %6, %67, %37* } +%67 = type { %65, %66* } +%68 = type { %69* } +%69 = type opaque +%70 = type { %71 } +%71 = type { %72, %77* } +%72 = type { %73 } +%73 = type { %74 } +%74 = type { %75 } +%75 = type { %76, %77* } +%76 = type { %77* } +%77 = type { %78, %75 } +%78 = type { %9 } +%79 = type { %80 } +%80 = type { %81, %86* } +%81 = type { %82 } +%82 = type { %83 } +%83 = type { %84 } +%84 = type { %85, %86* } +%85 = type { %86* } +%86 = type { %78, %84 } +%87 = type { %88 } +%88 = type { %89, %92* } +%89 = type { %90 } +%90 = type { %91, %92* } +%91 = type { %92* } +%92 = type { %90, %12, %18*, i8* } +%93 = type opaque +%94 = type <{ %95, %97, [3 x i8] }> +%95 = type { %96**, i32, i32, i32, i32 } +%96 = type { i32 } +%97 = type { i8 } +%98 = type { %99 } +%99 = type { %100 } +%100 = type { %101* } +%101 = type opaque +%102 = type { i8, i32, i8, [3 x i8], %103, %111, %12, %118, i8* } +%103 = type { %104, %110 } +%104 = type { %105 } +%105 = type { %106 } +%106 = type <{ %107, %108 }> +%107 = type { i8*, i8*, i8* } +%108 = type { %109 } +%109 = type { [1 x i8] } +%110 = type { [7 x %108] } +%111 = type { %112, %117 } +%112 = type { %113 } +%113 = type { %114 } +%114 = type { %107, %115 } +%115 = type { %116 } +%116 = type { [8 x i8] } +%117 = type { [15 x %115] } +%118 = type { %119, %124 } +%119 = 
type { %120 } +%120 = type { %121 } +%121 = type { %107, %122 } +%122 = type { %123 } +%123 = type { [16 x i8] } +%124 = type { [7 x %122] } + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: optsize +define %0* @LLVMGetReturnType(%0*) #1 { + %2 = alloca %1*, align 4 + %3 = bitcast %1** %2 to i8* + call void @llvm.lifetime.start(i64 4, i8* %3) + %4 = bitcast %1** %2 to %0** + store %0* %0, %0** %4, align 4 + %5 = call zeroext i1 @_ZN4llvm13isa_impl_wrapINS_12FunctionTypeEKPNS_4TypeEPKS2_E4doitERS4_(%1** nonnull dereferenceable(4) %2) #3 + %6 = bitcast %0* %0 to %4* + br i1 %5, label %8, label %7 + +;