diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
--- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
+++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
@@ -34,6 +34,14 @@
   PostDominatorTree *PDT;
 };
 
+/// Helper struct shared between Function Specialization and SCCP Solver.
+struct ArgInfo {
+  Argument *Formal; // The Formal argument being analysed.
+  Constant *Actual; // A corresponding actual constant argument.
+
+  ArgInfo(Argument *F, Constant *A) : Formal(F), Actual(A) {};
+};
+
 class SCCPInstVisitor;
 
 //===----------------------------------------------------------------------===//
@@ -134,11 +142,13 @@
   /// Return a reference to the set of argument tracked functions.
   SmallPtrSetImpl<Function *> &getArgumentTrackedFunctions();
 
-  /// Mark argument \p A constant with value \p C in a new function
-  /// specialization. The argument's parent function is a specialization of the
-  /// original function \p F. All other arguments of the specialization inherit
-  /// the lattice state of their corresponding values in the original function.
-  void markArgInFuncSpecialization(Function *F, Argument *A, Constant *C);
+  /// Mark the constant arguments of a new function specialization \p F. A list
+  /// of {formal,actual} pairs corresponding to these constants resides in
+  /// \p Args. The formal arguments are associated with the original function
+  /// definition.
+  /// All other arguments of the specialization inherit the lattice state of
+  /// their corresponding values in the original function.
+  void markArgInFuncSpecialization(Function *F, SmallVectorImpl<ArgInfo> &Args);
 
   /// Mark all of the blocks in function \p F non-executable. Clients can used
   /// this method to erase a function from the module (e.g., if it has been
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -83,6 +83,12 @@
              "specialization"),
     cl::init(3));
 
+static cl::opt<unsigned> MinGainThreshold(
+    "func-specialization-min-gain", cl::Hidden,
+    cl::desc("The minimum gain for a specialization to be considered "
+             "profitable"),
+    cl::init(10000));
+
 static cl::opt<unsigned> SmallFunctionThreshold(
     "func-specialization-size-threshold", cl::Hidden,
     cl::desc("Don't specialize functions that have less than this theshold "
@@ -98,8 +104,23 @@
     "func-specialization-on-address", cl::init(false), cl::Hidden,
     cl::desc("Enable function specialization on the address of global values"));
 
-// TODO: This needs checking to see the impact on compile-times, which is why
-// this is off by default for now.
+// Disabled by default as it can significantly increase compilation times.
+// Metrics based on instruction count as per
+// https://llvm-compile-time-tracker.com
+// https://github.com/nikic/llvm-compile-time-tracker
+//
+// testname         | % diff
+// -----------------+-------------------
+// ClamAV           | -0.003008478350503
+// 7zip             | -0.00387846255468
+// tramp3d-v4       | +0.032250428769666
+// kimwitu++        | +0.012783554583518
+// sqlite3          | +0.064026452410774
+// mafft            | -0.010917789416896
+// lencod           | -0.010513312365949
+// SPASS            | +4.0348661146709
+// Bullet           | +0.000114373245283
+// consumer-typeset | +0.007390973545237
 static cl::opt<bool> EnableSpecializationForLiteralConstant(
     "function-specialization-for-literal-constant", cl::init(false), cl::Hidden,
     cl::desc("Enable specialization of functions that take a literal constant "
@@ -108,24 +129,15 @@
 namespace {
 // Bookkeeping struct to pass data from the analysis and profitability phase
 // to the actual transform helper functions.
-struct ArgInfo {
-  Function *Fn;         // The function to perform specialisation on.
-  Argument *Formal;     // The Formal argument being analysed.
-  Constant *Actual;     // A corresponding actual constant argument.
-  InstructionCost Gain; // Profitability: Gain = Bonus - Cost.
-
-  // Flag if this will be a partial specialization, in which case we will need
-  // to keep the original function around in addition to the added
-  // specializations.
-  bool Partial = false;
-
-  ArgInfo(Function *F, Argument *A, Constant *C, InstructionCost G)
-      : Fn(F), Formal(A), Actual(C), Gain(G){};
+struct Specialization {
+  SmallVector<ArgInfo, 8> Args; // Stores the {formal,actual} argument pairs.
+  InstructionCost Gain = 0;     // Profitability: Gain = Bonus - Cost.
 };
 } // Anonymous namespace
 
 using FuncList = SmallVectorImpl<Function *>;
-using ConstList = SmallVectorImpl<Constant *>;
+using ConstList = SmallVector<std::pair<CallBase *, Constant *>>;
+using SpecializationInfo = SmallMapVector<CallBase *, Specialization, 8>;
 
 // Helper to check if \p LV is either a constant or a constant
 // range with a single element. This should cover exactly the same cases as the
@@ -306,20 +318,17 @@
       LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
                         << F->getName() << " is " << Cost << "\n");
 
-      auto ConstArgs = calculateGains(F, Cost);
-      if (ConstArgs.empty()) {
+      SpecializationInfo Info = calculateGains(F, Cost);
+      if (Info.empty()) {
         LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n");
         continue;
       }
 
-      for (auto &CA : ConstArgs) {
-        specializeFunction(CA, WorkList);
-        Changed = true;
-      }
+      specializeFunction(F, Info, WorkList);
+      Changed = true;
     }
 
     updateSpecializedFuncs(Candidates, WorkList);
-    NumFuncSpecialized += NbFunctionsSpecialized;
     return Changed;
   }
 
@@ -373,15 +382,10 @@
   }
 
 private:
-  // The number of functions specialised, used for collecting statistics and
-  // also in the cost model.
-  unsigned NbFunctionsSpecialized = 0;
-
   /// Clone the function \p F and remove the ssa_copy intrinsics added by
   /// the SCCPSolver in the cloned version.
-  Function *cloneCandidateFunction(Function *F) {
-    ValueToValueMapTy EmptyMap;
-    Function *Clone = CloneFunction(F, EmptyMap);
+  Function *cloneCandidateFunction(Function *F, ValueToValueMapTy &Mappings) {
+    Function *Clone = CloneFunction(F, Mappings);
     removeSSACopy(*Clone);
     return Clone;
   }
@@ -392,8 +396,8 @@
   /// profitable to specialize. Specialization is performed on the first
   /// interesting argument. Specializations based on additional arguments will
   /// be evaluated on following iterations of the main IPSCCP solve loop.
-  SmallVector<ArgInfo> calculateGains(Function *F, InstructionCost Cost) {
-    SmallVector<ArgInfo> Worklist;
+  SpecializationInfo calculateGains(Function *F, InstructionCost Cost) {
+    SpecializationInfo Info;
     // Determine if we should specialize the function based on the values the
     // argument can take on. If specialization is not profitable, we continue
     // on to the next argument.
@@ -405,7 +409,7 @@
       // be set to false by isArgumentInteresting (that function only adds
       // values to the Constants list that are deemed profitable).
       bool IsPartial = true;
-      SmallVector<Constant *> ActualArgs;
+      ConstList ActualArgs;
       if (!isArgumentInteresting(&FormalArg, ActualArgs, IsPartial)) {
         LLVM_DEBUG(dbgs() << "FnSpecialization: Argument "
                           << FormalArg.getNameOrAsOperand()
@@ -413,55 +417,58 @@
         continue;
       }
 
-      for (auto *ActualArg : ActualArgs) {
-        InstructionCost Gain =
-            ForceFunctionSpecialization
-                ? 1
-                : getSpecializationBonus(&FormalArg, ActualArg) - Cost;
+      for (auto &Entry : ActualArgs) {
+        CallBase *Call = Entry.first;
+        Constant *ActualArg = Entry.second;
 
-        if (Gain <= 0)
-          continue;
-        Worklist.push_back({F, &FormalArg, ActualArg, Gain});
+        Specialization &S = Info[Call];
+        if (!ForceFunctionSpecialization)
+          S.Gain += getSpecializationBonus(&FormalArg, ActualArg);
+        S.Args.push_back({&FormalArg, ActualArg});
       }
+    }
 
-      if (Worklist.empty())
-        continue;
+    if (Info.empty())
+      return Info;
 
-      // Sort the candidates in descending order.
-      llvm::stable_sort(Worklist, [](const ArgInfo &L, const ArgInfo &R) {
-        return L.Gain > R.Gain;
-      });
-
-      // Truncate the worklist to 'MaxClonesThreshold' candidates if
-      // necessary.
-      if (Worklist.size() > MaxClonesThreshold) {
-        LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceed "
-                          << "the maximum number of clones threshold.\n"
-                          << "FnSpecialization: Truncating worklist to "
-                          << MaxClonesThreshold << " candidates.\n");
-        Worklist.erase(Worklist.begin() + MaxClonesThreshold,
-                       Worklist.end());
-      }
+    if (ForceFunctionSpecialization) {
+        // The cost model is disregarded so keep up to \p MaxClonesThreshold
+        // per function specialization.
+        while (Info.size() > MaxClonesThreshold)
+          Info.erase(Info.begin());
+    } else {
+      // Account the Cost per specialization.
+      SmallVector<CallBase *, 8> KeysToRemove;
 
-      if (IsPartial || Worklist.size() < ActualArgs.size())
-        for (auto &ActualArg : Worklist)
-          ActualArg.Partial = true;
+      for (auto &Entry : Info) {
+        CallBase *Call = Entry.first;
+        Specialization &S = Entry.second;
+
+        S.Gain -= Cost;
+        if (S.Gain < MinGainThreshold)
+          KeysToRemove.push_back(Call);
+      }
+      // Remove unprofitable specializations.
+      for (CallBase *Call : KeysToRemove)
+        Info.erase(Call);
+    }
 
-      LLVM_DEBUG(
+    LLVM_DEBUG(
+      if (!Info.empty())
         dbgs() << "FnSpecialization: Specializations for function "
                << F->getName() << "\n";
-        for (auto &C : Worklist) {
-          dbgs() << "FnSpecialization:   FormalArg = "
-                 << C.Formal->getNameOrAsOperand() << ", ActualArg = "
-                 << C.Actual->getNameOrAsOperand() << ", Gain = "
-                 << C.Gain << "\n";
-        }
-      );
-
-      // FIXME: Only one argument per function.
-      break;
-    }
-    return Worklist;
+      for (auto &Entry : Info) {
+        Specialization &S = Entry.second;
+
+        dbgs() << "FnSpecialization:   Gain = " << S.Gain << "\n";
+        for (ArgInfo &Arg : S.Args)
+          dbgs() << "FnSpecialization:   - FormalArg = "
+                 << Arg.Formal->getNameOrAsOperand() << ", " << "ActualArg = "
+                 << Arg.Actual->getNameOrAsOperand() << "\n";
+      }
+    );
+
+    return Info;
   }
 
   bool isCandidateFunction(Function *F) {
@@ -488,32 +495,32 @@
     return true;
   }
 
-  void specializeFunction(ArgInfo &AI, FuncList &WorkList) {
-    Function *Clone = cloneCandidateFunction(AI.Fn);
-    Argument *ClonedArg = Clone->getArg(AI.Formal->getArgNo());
-
-    // Rewrite calls to the function so that they call the clone instead.
-    rewriteCallSites(AI.Fn, Clone, *ClonedArg, AI.Actual);
-
-    // Initialize the lattice state of the arguments of the function clone,
-    // marking the argument on which we specialized the function constant
-    // with the given value.
-    Solver.markArgInFuncSpecialization(AI.Fn, ClonedArg, AI.Actual);
-
-    // Mark all the specialized functions
-    WorkList.push_back(Clone);
-    NbFunctionsSpecialized++;
-
+  void specializeFunction(Function *F, SpecializationInfo &Info,
+                          FuncList &WorkList) {
+    for (auto &Entry : Info) {
+      Specialization &S = Entry.second;
+
+      ValueToValueMapTy Mappings;
+      Function *Clone = cloneCandidateFunction(F, Mappings);
+      rewriteCallSites(Clone, S.Args, Mappings);
+      // Initialize the lattice state of the arguments of the function clone,
+      // marking the argument on which we specialized the function constant
+      // with the given value.
+      Solver.markArgInFuncSpecialization(Clone, S.Args);
+      // Mark all the specialized functions
+      WorkList.push_back(Clone);
+      ++NumFuncSpecialized;
+    }
     // If the function has been completely specialized, the original function
     // is no longer needed. Mark it unreachable.
-    if (AI.Fn->getNumUses() == 0 ||
-        all_of(AI.Fn->users(), [&AI](User *U) {
+    if (F->getNumUses() == 0 ||
+        all_of(F->users(), [F](User *U) {
           if (auto *CS = dyn_cast_or_null<CallBase>(U))
-            return CS->getFunction() == AI.Fn;
+            return CS->getFunction() == F;
           return false;
         })) {
-      Solver.markFunctionUnreachable(AI.Fn);
-      FullySpecialized.push_back(AI.Fn);
+      Solver.markFunctionUnreachable(F);
+      FullySpecialized.push_back(F);
     }
   }
 
@@ -539,9 +546,8 @@
     }
 
     // Otherwise, set the specialization cost to be the cost of all the
-    // instructions in the function and penalty for specializing more functions.
-    unsigned Penalty = NbFunctionsSpecialized + 1;
-    return Metrics.NumInsts * InlineConstants::InstrCost * Penalty;
+    // instructions in the function.
+    return Metrics.NumInsts * InlineConstants::InstrCost;
   }
 
   InstructionCost getUserBonus(User *U, llvm::TargetTransformInfo &TTI,
@@ -689,6 +695,10 @@
     // TODO 2: this currently does not support constants, i.e. integer ranges.
     //
     IsPartial = !getPossibleConstants(A, Constants);
+
+    if (Constants.empty())
+      return false;
+
     LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                       << A->getNameOrAsOperand() << "\n");
     return true;
@@ -745,7 +755,7 @@
 
       if (isa<Constant>(V) && (Solver.getLatticeValueFor(V).isConstant() ||
                                EnableSpecializationForLiteralConstant))
-        Constants.push_back(cast<Constant>(V));
+        Constants.push_back(std::make_pair(&CS, cast<Constant>(V)));
       else
         AllConstant = false;
     }
@@ -757,16 +767,18 @@
 
   /// Rewrite calls to function \p F to call function \p Clone instead.
   ///
-  /// This function modifies calls to function \p F whose argument at index \p
-  /// ArgNo is equal to constant \p C. The calls are rewritten to call function
-  /// \p Clone instead.
+  /// This function modifies calls to function \p F as long as the actual
+  /// arguments match those in \p Args. Note that for recursive calls we
+  /// need to compare against the cloned formal arguments.
   ///
   /// Callsites that have been marked with the MinSize function attribute won't
   /// be specialized and rewritten.
-  void rewriteCallSites(Function *F, Function *Clone, Argument &Arg,
-                        Constant *C) {
-    unsigned ArgNo = Arg.getArgNo();
-    SmallVector<CallBase *, 4> CallSitesToRewrite;
+  void rewriteCallSites(Function *Clone, SmallVectorImpl<ArgInfo> &Args,
+                        ValueToValueMapTy &Mappings) {
+    assert(!Args.empty() && "Specialization without arguments");
+    Function *F = Args[0].Formal->getParent();
+
+    SmallVector<CallBase *, 8> CallSitesToRewrite;
     for (auto *U : F->users()) {
       if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
         continue;
@@ -784,8 +796,15 @@
       LLVM_DEBUG(dbgs() << "FnSpecialization:   "
                         << CS->getFunction()->getName() << " ->"
                         << *CS << "\n");
-      if ((CS->getFunction() == Clone && CS->getArgOperand(ArgNo) == &Arg) ||
-          CS->getArgOperand(ArgNo) == C) {
+      if (/* recursive call */
+          (CS->getFunction() == Clone &&
+          all_of(Args, [CS, &Mappings](ArgInfo &Arg) {
+                 unsigned ArgNo = Arg.Formal->getArgNo();
+                 return CS->getArgOperand(ArgNo) == Mappings[Arg.Formal]; })) ||
+          /* normal call */
+          all_of(Args, [CS](ArgInfo &Arg) {
+                 unsigned ArgNo = Arg.Formal->getArgNo();
+                 return CS->getArgOperand(ArgNo) == Arg.Actual; })) {
         CS->setCalledFunction(Clone);
         Solver.markOverdefined(CS);
       }
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -452,7 +452,7 @@
     return TrackingIncomingArguments;
   }
 
-  void markArgInFuncSpecialization(Function *F, Argument *A, Constant *C);
+  void markArgInFuncSpecialization(Function *F, SmallVectorImpl<ArgInfo> &Args);
 
   void markFunctionUnreachable(Function *F) {
     for (auto &BB : *F)
@@ -526,24 +526,29 @@
   return nullptr;
 }
 
-void SCCPInstVisitor::markArgInFuncSpecialization(Function *F, Argument *A,
-                                                  Constant *C) {
-  assert(F->arg_size() == A->getParent()->arg_size() &&
+void
+SCCPInstVisitor::markArgInFuncSpecialization(Function *F,
+                                             SmallVectorImpl<ArgInfo> &Args) {
+  assert(!Args.empty() && "Specialization without arguments");
+  assert(F->arg_size() == Args[0].Formal->getParent()->arg_size() &&
          "Functions should have the same number of arguments");
 
-  // Mark the argument constant in the new function.
-  markConstant(A, C);
-
-  // For the remaining arguments in the new function, copy the lattice state
-  // over from the old function.
-  for (Argument *OldArg = F->arg_begin(), *NewArg = A->getParent()->arg_begin(),
-                *End = F->arg_end();
-       OldArg != End; ++OldArg, ++NewArg) {
+  auto Iter = Args.begin();
+  Argument *NewArg = F->arg_begin();
+  Argument *OldArg = Args[0].Formal->getParent()->arg_begin();
+  for (auto End = F->arg_end(); NewArg != End; ++NewArg, ++OldArg) {
 
     LLVM_DEBUG(dbgs() << "SCCP: Marking argument "
                       << NewArg->getNameOrAsOperand() << "\n");
 
-    if (NewArg != A && ValueState.count(OldArg)) {
+    if (OldArg == (*Iter).Formal) {
+      // Mark the argument constants in the new function.
+      markConstant(NewArg, (*Iter).Actual);
+      ++Iter;
+    } else if (ValueState.count(OldArg)) {
+      // For the remaining arguments in the new function, copy the lattice state
+      // over from the old function.
+      //
       // Note: This previously looked like this:
       // ValueState[NewArg] = ValueState[OldArg];
       // This is incorrect because the DenseMap class may resize the underlying
@@ -1718,9 +1723,9 @@
   return Visitor->getArgumentTrackedFunctions();
 }
 
-void SCCPSolver::markArgInFuncSpecialization(Function *F, Argument *A,
-                                             Constant *C) {
-  Visitor->markArgInFuncSpecialization(F, A, C);
+void SCCPSolver::markArgInFuncSpecialization(Function *F,
+                                             SmallVectorImpl<ArgInfo> &Args) {
+  Visitor->markArgInFuncSpecialization(F, Args);
 }
 
 void SCCPSolver::markFunctionUnreachable(Function *F) {
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll
@@ -1,4 +1,4 @@
-; RUN: opt -function-specialization -function-specialization-for-literal-constant=true -func-specialization-size-threshold=10 -S < %s | FileCheck %s
+; RUN: opt -function-specialization -func-specialization-min-gain=50 -function-specialization-for-literal-constant=true -func-specialization-size-threshold=10 -S < %s | FileCheck %s
 
 ; Check that the literal constant parameter could be specialized.
 ; CHECK: @foo.1(
@@ -41,4 +41,4 @@
   %retval.2 = call i32 @foo(i1 0)
   %retval = add nsw i32 %retval.1, %retval.2
   ret i32 %retval
-}
\ No newline at end of file
+}
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll
@@ -1,8 +1,12 @@
-; RUN: opt -function-specialization -func-specialization-avg-iters-cost=3 -func-specialization-size-threshold=10 -S < %s | FileCheck %s
+; RUN: opt -function-specialization -func-specialization-min-gain=20000 -func-specialization-avg-iters-cost=10 -func-specialization-size-threshold=10 -S < %s | FileCheck %s --check-prefix=HIGH_ITER_COST
+; RUN: opt -function-specialization -func-specialization-min-gain=20000 -func-specialization-avg-iters-cost=3 -func-specialization-size-threshold=10 -S < %s | FileCheck %s --check-prefix=LOW_ITER_COST
 
 ; Check that the loop depth results in a larger specialization bonus.
-; CHECK: @foo.1(
-; CHECK: @foo.2(
+; HIGH_ITER_COST: @foo.1(
+; HIGH_ITER_COST: @foo.2(
+
+; LOW_ITER_COST-NOT: @foo.1(
+; LOW_ITER_COST-NOT: @foo.2(
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -60,4 +64,4 @@
 return:
   %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ]
   ret i32 %retval.0
-}
\ No newline at end of file
+}
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll
@@ -1,4 +1,4 @@
-; RUN: opt -function-specialization -func-specialization-size-threshold=3 -S < %s | FileCheck %s
+; RUN: opt -function-specialization -func-specialization-min-gain=500 -func-specialization-size-threshold=3 -S < %s | FileCheck %s
 
 ; Checks for callsites that have been annotated with MinSize. We only expect
 ; specialisation for the call that does not have the attribute:
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll
@@ -1,4 +1,4 @@
-; RUN: opt -function-specialization -func-specialization-size-threshold=3 -S < %s | FileCheck %s
+; RUN: opt -function-specialization -func-specialization-min-gain=500 -func-specialization-size-threshold=3 -S < %s | FileCheck %s
 
 define i64 @main(i64 %x, i1 %flag) {
 ;
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll
@@ -46,7 +46,7 @@
 ; CHECK-NEXT:   entry:
 ; CHECK-NEXT:     %0 = load i32, i32* @A, align 4
 ; CHECK-NEXT:     %add = add nsw i32 %x, %0
-; CHECK-NEXT:     %1 = load i32, i32* %c, align 4
+; CHECK-NEXT:     %1 = load i32, i32* @C, align 4
 ; CHECK-NEXT:     %add1 = add nsw i32 %add, %1
 ; CHECK-NEXT:     ret i32 %add1
 ; CHECK-NEXT:   }
@@ -55,7 +55,7 @@
 ; CHECK-NEXT:   entry:
 ; CHECK-NEXT:     %0 = load i32, i32* @B, align 4
 ; CHECK-NEXT:     %add = add nsw i32 %x, %0
-; CHECK-NEXT:     %1 = load i32, i32* %c, align 4
+; CHECK-NEXT:     %1 = load i32, i32* @D, align 4
 ; CHECK-NEXT:     %add1 = add nsw i32 %add, %1
 ; CHECK-NEXT:     ret i32 %add1
 ; CHECK-NEXT:   }
diff --git a/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll b/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll
--- a/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll
@@ -1,4 +1,4 @@
-; RUN: opt -function-specialization -func-specialization-size-threshold=3 -S < %s | FileCheck %s
+; RUN: opt -function-specialization -func-specialization-min-gain=400 -func-specialization-size-threshold=3 -S < %s | FileCheck %s
 
 define i64 @main(i64 %x, i1 %flag) {
 entry:
diff --git a/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll
@@ -0,0 +1,172 @@
+; RUN: opt -function-specialization -func-specialization-min-gain=1000 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=NONE
+; RUN: opt -function-specialization -func-specialization-min-gain=900 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefixes=ONE
+; RUN: opt -function-specialization -func-specialization-min-gain=700 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=TWO
+; RUN: opt -function-specialization -func-specialization-min-gain=600 -func-specialization-size-threshold=14 -S < %s | FileCheck %s --check-prefix=THREE
+
+define i64 @main(i64 %x, i64 %y, i1 %flag) {
+; NONE-LABEL: @main(
+; NONE-NEXT:  entry:
+; NONE-NEXT:    br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
+; NONE:       plus:
+; NONE-NEXT:    [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
+; NONE-NEXT:    br label [[MERGE:%.*]]
+; NONE:       minus:
+; NONE-NEXT:    [[TMP1:%.*]] = call i64 @compute(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
+; NONE-NEXT:    br label [[MERGE]]
+; NONE:       merge:
+; NONE-NEXT:    [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
+; NONE-NEXT:    [[TMP3:%.*]] = call i64 @compute(i64 [[TMP2]], i64 42, i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
+; NONE-NEXT:    ret i64 [[TMP3]]
+;
+; ONE-LABEL: @main(
+; ONE-NEXT:  entry:
+; ONE-NEXT:    br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
+; ONE:       plus:
+; ONE-NEXT:    [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
+; ONE-NEXT:    br label [[MERGE:%.*]]
+; ONE:       minus:
+; ONE-NEXT:    [[TMP1:%.*]] = call i64 @compute(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
+; ONE-NEXT:    br label [[MERGE]]
+; ONE:       merge:
+; ONE-NEXT:    [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
+; ONE-NEXT:    [[TMP3:%.*]] = call i64 @compute.1(i64 [[TMP2]], i64 42, i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
+; ONE-NEXT:    ret i64 [[TMP3]]
+;
+; TWO-LABEL: @main(
+; TWO-NEXT:  entry:
+; TWO-NEXT:    br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
+; TWO:       plus:
+; TWO-NEXT:    [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
+; TWO-NEXT:    br label [[MERGE:%.*]]
+; TWO:       minus:
+; TWO-NEXT:    [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
+; TWO-NEXT:    br label [[MERGE]]
+; TWO:       merge:
+; TWO-NEXT:    [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
+; TWO-NEXT:    [[TMP3:%.*]] = call i64 @compute.2(i64 [[TMP2]], i64 42, i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
+; TWO-NEXT:    ret i64 [[TMP3]]
+;
+; THREE-LABEL: @main(
+; THREE-NEXT:  entry:
+; THREE-NEXT:    br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
+; THREE:       plus:
+; THREE-NEXT:    [[TMP0:%.*]] = call i64 @compute.1(i64 [[X:%.*]], i64 [[Y:%.*]], i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
+; THREE-NEXT:    br label [[MERGE:%.*]]
+; THREE:       minus:
+; THREE-NEXT:    [[TMP1:%.*]] = call i64 @compute.2(i64 [[X]], i64 [[Y]], i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
+; THREE-NEXT:    br label [[MERGE]]
+; THREE:       merge:
+; THREE-NEXT:    [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ]
+; THREE-NEXT:    [[TMP3:%.*]] = call i64 @compute.3(i64 [[TMP2]], i64 42, i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
+; THREE-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  br i1 %flag, label %plus, label %minus
+
+plus:
+  %tmp0 = call i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* @power, i64 (i64, i64)* @mul)
+  br label %merge
+
+minus:
+  %tmp1 = call i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* @minus, i64 (i64, i64)* @power)
+  br label %merge
+
+merge:
+  %tmp2 = phi i64 [ %tmp0, %plus ], [ %tmp1, %minus]
+  %tmp3 = call i64 @compute(i64 %tmp2, i64 42, i64 (i64, i64)* @plus, i64 (i64, i64)* @minus)
+  ret i64 %tmp3
+}
+
+; THREE-NOT: define internal i64 @compute
+;
+; THREE-LABEL: define internal i64 @compute.1(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
+; THREE-NEXT:  entry:
+; THREE-NEXT:    [[TMP0:%.+]] = call i64 @power(i64 %x, i64 %y)
+; THREE-NEXT:    [[TMP1:%.+]] = call i64 @mul(i64 %x, i64 %y)
+; THREE-NEXT:    [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]]
+; THREE-NEXT:    [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x
+; THREE-NEXT:    [[TMP4:%.+]] = sub i64 [[TMP3]], %y
+; THREE-NEXT:    [[TMP5:%.+]] = mul i64 [[TMP4]], 2
+; THREE-NEXT:    ret i64 [[TMP5]]
+; THREE-NEXT:  }
+;
+; THREE-LABEL: define internal i64 @compute.2(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
+; THREE-NEXT:  entry:
+; THREE-NEXT:    [[TMP0:%.+]] = call i64 @minus(i64 %x, i64 %y)
+; THREE-NEXT:    [[TMP1:%.+]] = call i64 @power(i64 %x, i64 %y)
+; THREE-NEXT:    [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]]
+; THREE-NEXT:    [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x
+; THREE-NEXT:    [[TMP4:%.+]] = sub i64 [[TMP3]], %y
+; THREE-NEXT:    [[TMP5:%.+]] = mul i64 [[TMP4]], 2
+; THREE-NEXT:    ret i64 [[TMP5]]
+; THREE-NEXT:  }
+;
+; THREE-LABEL: define internal i64 @compute.3(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
+; THREE-NEXT:  entry:
+; THREE-NEXT:    [[TMP0:%.+]] = call i64 @plus(i64 %x, i64 %y)
+; THREE-NEXT:    [[TMP1:%.+]] = call i64 @minus(i64 %x, i64 %y)
+; THREE-NEXT:    [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]]
+; THREE-NEXT:    [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x
+; THREE-NEXT:    [[TMP4:%.+]] = sub i64 [[TMP3]], %y
+; THREE-NEXT:    [[TMP5:%.+]] = mul i64 [[TMP4]], 2
+; THREE-NEXT:    ret i64 [[TMP5]]
+; THREE-NEXT:  }
+;
+define internal i64 @compute(i64 %x, i64 %y, i64 (i64, i64)* %binop1, i64 (i64, i64)* %binop2) {
+entry:
+  %tmp0 = call i64 %binop1(i64 %x, i64 %y)
+  %tmp1 = call i64 %binop2(i64 %x, i64 %y)
+  %add = add i64 %tmp0, %tmp1
+  %div = sdiv i64 %add, %x
+  %sub = sub i64 %div, %y
+  %mul = mul i64 %sub, 2
+  ret i64 %mul
+}
+
+define internal i64 @plus(i64 %x, i64 %y) {
+entry:
+  %tmp0 = add i64 %x, %y
+  ret i64 %tmp0
+}
+
+define internal i64 @minus(i64 %x, i64 %y) {
+entry:
+  %tmp0 = sub i64 %x, %y
+  ret i64 %tmp0
+}
+
+define internal i64 @mul(i64 %x, i64 %n) {
+entry:
+  %cmp6 = icmp sgt i64 %n, 1
+  br i1 %cmp6, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %x.addr.0.lcssa = phi i64 [ %x, %entry ], [ %add, %for.body ]
+  ret i64 %x.addr.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %entry ]
+  %x.addr.07 = phi i64 [ %add, %for.body ], [ %x, %entry ]
+  %add = shl nsw i64 %x.addr.07, 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define internal i64 @power(i64 %x, i64 %n) {
+entry:
+  %cmp6 = icmp sgt i64 %n, 1
+  br i1 %cmp6, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %x.addr.0.lcssa = phi i64 [ %x, %entry ], [ %mul, %for.body ]
+  ret i64 %x.addr.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %entry ]
+  %x.addr.07 = phi i64 [ %mul, %for.body ], [ %x, %entry ]
+  %mul = mul nsw i64 %x.addr.07, %x.addr.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}