Index: include/llvm/Transforms/IPO/InlinerPass.h
===================================================================
--- include/llvm/Transforms/IPO/InlinerPass.h
+++ include/llvm/Transforms/IPO/InlinerPass.h
@@ -57,7 +57,18 @@
   /// given on the comand line. It is higher if the callee is marked with the
   /// inlinehint attribute.
   ///
-  unsigned getInlineThreshold(CallSite CS) const;
+  unsigned getInlineThreshold(CallSite CS);
+
+  /// The inline threshold can be dynamically tuned according to the control
+  /// flow context around the callsite.
+  virtual unsigned TuneThreshold(unsigned threshold, CallSite CS) {
+    return threshold;
+  }
+
+  /// Analyze the callee by computing loop info and other information.
+  virtual void
+  AnalyzeFunc(Function *F,
+              SmallVector<std::pair<CallSite, int>, 16> &CallSites) {}
 
   /// getInlineCost - This method must be implemented by the subclass to
   /// determine the cost of inlining the specified call site. If the cost
@@ -83,7 +94,10 @@
 
   /// shouldInline - Return true if the inliner should attempt to
   /// inline at the given CallSite.
-  bool shouldInline(CallSite CS);
+  /// Set ABC to true if the A->B->C case can pass the inlining threshold
+  /// check, and to false otherwise.
+  /// Pass true for Remark to emit analysis messages.
+  bool shouldInline(CallSite CS, bool &ABC, bool Remark);
 };
 
 } // End llvm namespace
Index: lib/Transforms/IPO/InlineSimple.cpp
===================================================================
--- lib/Transforms/IPO/InlineSimple.cpp
+++ lib/Transforms/IPO/InlineSimple.cpp
@@ -12,25 +12,86 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "inline"
 
+STATISTIC(NumCallsAnalyzedForLoopInfo, "Number of calls analyzed for loop info");
+
+static cl::opt<bool>
+    AgressiveInline("inline-aggressive", cl::Hidden, cl::Optional,
+                    cl::desc("Enable aggressive inlining or not"));
+
+static cl::opt<unsigned>
+    SmallLoopSize("inline-small-loop-size", cl::Hidden, cl::init(2),
+                  cl::ZeroOrMore,
+                  cl::desc("The maximum number of BBs in a small loop"));
+
+static cl::opt<unsigned> SmallLoopBonus(
+    "inline-small-loop-bonus", cl::Hidden, cl::init(400), cl::ZeroOrMore,
+    cl::desc("The percent of threshold enlarged for a simple loop"));
+
+static cl::opt<unsigned>
+    LoopBonus("inline-loop-bonus", cl::Hidden, cl::init(200), cl::ZeroOrMore,
+              cl::desc("The percent of threshold enlarged for a loop"));
+
+static cl::opt<unsigned>
+    CalleeNum("inline-callee-num", cl::Hidden, cl::init(10), cl::ZeroOrMore,
+              cl::desc("The maximum number of callees inside a caller"));
+
+static cl::opt<unsigned> ConstArgBonus(
+    "inline-const-arg-bonus", cl::Hidden, cl::init(200), cl::ZeroOrMore,
+    cl::desc("The percent of threshold enlarged for a callee with const args"));
+
 namespace {
 
+/// A small loop analyzer to help inlining context analysis.
+class LoopAnalyzer {
+public:
+  LoopAnalyzer() {}
+  ~LoopAnalyzer();
+
+  /// Analyze the loops of function F and record the result in the
+  /// LoopStatusMap cache. Always recompute the loop info if update is true.
+  void Analyze(Function *F, bool update);
+
+  /// Return the depth of the loop to which instruction I belongs.
+  unsigned GetLoopDepth(Instruction *I);
+
+  /// Return true if function F contains a loop, and false otherwise.
+  bool HasLoop(Function *F);
+
+  /// Return the loop to which instruction I belongs.
+  Loop *GetLoop(Instruction *I);
+
+private:
+  struct LoopS {
+    DominatorTree DT;
+    LoopInfoBase<BasicBlock, Loop> LI;
+  };
+
+  /// The cache of loop info status for any function.
+  DenseMap<Function *, LoopS *> LoopStatusMap;
+};
+
 /// \brief Actual inliner pass implementation.
 ///
 /// The common implementation of the inlining logic is shared between this
@@ -39,13 +100,29 @@
 class SimpleInliner : public Inliner {
   InlineCostAnalysis *ICA;
 
+  /// A loop analyzer to analyze/record loops for all functions in current SCC.
+  LoopAnalyzer LA;
+
+  /// Optimization level like -O1, -O2, -O3, ...
+  unsigned OptLevel;
+
+  /// Record all call sites visited by the inliner.
+  SmallVector<std::pair<CallSite, int>, 16> *CallSites;
+
 public:
-  SimpleInliner() : Inliner(ID), ICA(nullptr) {
+  SimpleInliner() : Inliner(ID), ICA(nullptr), OptLevel(0), CallSites(nullptr) {
     initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
   }
 
   SimpleInliner(int Threshold)
-      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) {
+      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr),
+        OptLevel(0), CallSites(nullptr) {
+    initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  SimpleInliner(int Threshold, unsigned OptLevel)
+      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr),
+        OptLevel(OptLevel), CallSites(nullptr) {
     initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
   }
 
@@ -57,12 +134,20 @@
 
   bool runOnSCC(CallGraphSCC &SCC) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Use the call site context to tune the original threshold and return it.
+  unsigned TuneThreshold(unsigned thres, CallSite CS) override;
+
+  /// Analyze the callee by computing loop info and other information.
+  void
+  AnalyzeFunc(Function *F,
+              SmallVector<std::pair<CallSite, int>, 16> &CallSites) override;
 };
 
 static int computeThresholdFromOptLevels(unsigned OptLevel,
                                          unsigned SizeOptLevel) {
   if (OptLevel > 2)
-    return 275;
+    return 250;
   if (SizeOptLevel == 1) // -Os
     return 75;
   if (SizeOptLevel == 2) // -Oz
@@ -91,7 +176,67 @@
 Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
                                        unsigned SizeOptLevel) {
   return new SimpleInliner(
-      computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
+      computeThresholdFromOptLevels(OptLevel, SizeOptLevel), OptLevel);
+}
+
+LoopAnalyzer::~LoopAnalyzer() {
+  for (DenseMap<Function *, LoopS *>::const_iterator I = LoopStatusMap.begin(),
+                                                     E = LoopStatusMap.end();
+       I != E; ++I) {
+    const LoopS *LS = I->second;
+    assert(LS && "Loop info doesn't exist for function!");
+    delete LS;
+  }
+}
+
+/// Analyze loops of function F.
+/// Do nothing if update is false and F already has loop information.
+void LoopAnalyzer::Analyze(Function *F, bool update) {
+  LoopS *LS;
+
+  if (LoopStatusMap.count(F) && !update)
+    return;
+
+  if (LoopStatusMap.count(F) && update)
+    delete LoopStatusMap[F];
+
+  LS = new LoopS();
+  LoopStatusMap[F] = LS;
+  LS->DT.recalculate(*F);
+  LS->LI.Analyze(LS->DT);
+
+  ++NumCallsAnalyzedForLoopInfo;
+}
+
+/// Return the loop to which instruction I belongs.
+Loop *LoopAnalyzer::GetLoop(Instruction *I) {
+  Function *F = I->getParent()->getParent();
+
+  // F could be the caller's caller, and if we have never analyzed it before,
+  // we must analyze it here to get loop information. If we already have F's
+  // loop info, there is no need to update it at this moment, because F cannot
+  // have changed since the last inlining. We only update loop info whenever
+  // inlining actually happens.
+  Analyze(F, false);
+
+  const LoopS *LS = LoopStatusMap.find(F)->second;
+  return LS->LI.getLoopFor(I->getParent());
+}
+
+/// Return the depth of the loop to which instruction I belongs.
+unsigned LoopAnalyzer::GetLoopDepth(Instruction *I) {
+  Function *F = I->getParent()->getParent();
+
+  Analyze(F, false);
+  const LoopS *LS = LoopStatusMap.find(F)->second;
+  return LS->LI.getLoopDepth(I->getParent());
+}
+
+/// Return true if function F contains a loop, and false otherwise.
+bool LoopAnalyzer::HasLoop(Function *F) {
+  Analyze(F, false);
+  const LoopS *LS = LoopStatusMap.find(F)->second;
+  return !LS->LI.empty();
 }
 
 bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
@@ -103,3 +248,86 @@
   AU.addRequired<InlineCostAnalysis>();
   Inliner::getAnalysisUsage(AU);
 }
+
+/// The inline threshold can be dynamically tuned according to the control
+/// flow context around the callsite.
+unsigned SimpleInliner::TuneThreshold(unsigned thres, CallSite CS) {
+  // We only tune the threshold if the opt level is larger than 2 or aggressive
+  // inlining is specified on the command line.
+  if (!AgressiveInline && OptLevel <= 2)
+    return thres;
+
+  Function *Caller = CS.getCaller();
+  Function *Callee = CS.getCalledFunction();
+  StringRef CalleeName = Callee->getName();
+
+  // The following are constant factors used to tune the threshold.
+
+  // The maximum number of calls to the same callee inside a caller.
+  const unsigned MaxNumOfCallee = CalleeNum;
+  // The maximum number of BBs in a simple loop.
+  const unsigned MaxNumOfBlocksInSimpleLoop = SmallLoopSize;
+  // The percent of threshold enlarged for a simple loop.
+  const unsigned SimpleLoopThresholdBonus = SmallLoopBonus;
+  // The percent of threshold enlarged for a loop.
+  const unsigned LoopThresholdBonus = LoopBonus;
+  // The percent of threshold enlarged for callees with constant arguments.
+  const unsigned ConstantArgThresholdBonus = ConstArgBonus;
+
+  // The heuristic rules are defined below.
+
+  // Avoid inlining the same callee too many times, to limit code size growth.
+  unsigned NumOfCallee = 0;
+  assert(CallSites && "Call sites don't exist!");
+  for (unsigned CSi = 0; CSi != CallSites->size(); ++CSi) {
+    CallSite CS = (*CallSites)[CSi].first;
+
+    if (Caller != CS.getCaller() || Callee != CS.getCalledFunction())
+      continue;
+
+    NumOfCallee++;
+  }
+  if (NumOfCallee > MaxNumOfCallee) {
+    DEBUG(dbgs() << " > " << CalleeName << " is called " << NumOfCallee
+                 << " times, skip.\n");
+    return thres;
+  }
+
+  // Give a threshold bonus if the call site is inside a loop body.
+  unsigned LoopDepth = LA.GetLoopDepth(CS.getInstruction());
+  if (LoopDepth) {
+    Loop *l = LA.GetLoop(CS.getInstruction());
+
+    // Give a further threshold bonus if the loop body is very small.
+    if (l->getNumBlocks() <= MaxNumOfBlocksInSimpleLoop) {
+      DEBUG(dbgs() << " > " << l->getNumBlocks() << " BBs found in loop.\n");
+      return thres * SimpleLoopThresholdBonus / 100;
+    } else {
+      DEBUG(dbgs() << " > " << CalleeName << " is called in loop.\n");
+      return thres * LoopThresholdBonus / 100;
+    }
+  }
+
+  // Give a threshold bonus if the callee has constant actual arguments and
+  // contains a loop. Constants can significantly help loop optimization;
+  // e.g., a loop bound could become a constant after inlining.
+  bool FindConstArg = false;
+  for (CallSite::arg_iterator CAI = CS.arg_begin(), CAE = CS.arg_end();
+       CAI != CAE; ++CAI) {
+    if (isa<Constant>(*CAI))
+      FindConstArg = true;
+  }
+  if (FindConstArg && LA.HasLoop(Callee)) {
+    DEBUG(dbgs() << " > " << CalleeName << " has const arg.\n");
+    return thres * ConstantArgThresholdBonus / 100;
+  }
+
+  // TODO: add more heuristic rules here.
+  return thres;
+}
+
+void SimpleInliner::AnalyzeFunc(
+    Function *F, SmallVector<std::pair<CallSite, int>, 16> &CallSites) {
+  this->CallSites = &CallSites;
+  LA.Analyze(F, true);
+}
Index: lib/Transforms/IPO/Inliner.cpp
===================================================================
--- lib/Transforms/IPO/Inliner.cpp
+++ lib/Transforms/IPO/Inliner.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -251,7 +252,7 @@
   return true;
 }
 
-unsigned Inliner::getInlineThreshold(CallSite CS) const {
+unsigned Inliner::getInlineThreshold(CallSite CS) {
   int thres = InlineThreshold; // -inline-threshold or else selected by
                                // overall opt level
 
@@ -284,7 +285,8 @@
       ColdThreshold < thres)
     thres = ColdThreshold;
 
-  return thres;
+  // We need to tune the threshold according to the control flow context.
+  return TuneThreshold(thres, CS);
 }
 
 static void emitAnalysis(CallSite CS, const Twine &Msg) {
@@ -295,34 +297,45 @@
 }
 
 /// Return true if the inliner should attempt to inline at the given CallSite.
-bool Inliner::shouldInline(CallSite CS) {
+/// Set ABC to true if the A->B->C case can pass the inlining threshold
+/// check, and to false otherwise.
+/// Pass true for Remark to emit analysis messages.
+bool Inliner::shouldInline(CallSite CS, bool &ABC, bool Remark) {
+  ABC = false;
+
   InlineCost IC = getInlineCost(CS);
 
   if (IC.isAlways()) {
-    DEBUG(dbgs() << "    Inlining: cost=always"
-                 << ", Call: " << *CS.getInstruction() << "\n");
-    emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) +
-                         " should always be inlined (cost=always)");
+    if (Remark) {
+      DEBUG(dbgs() << "    Inlining: cost=always"
+                   << ", Call: " << *CS.getInstruction() << "\n");
+      emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) +
+                           " should always be inlined (cost=always)");
+    }
     return true;
   }
 
   if (IC.isNever()) {
-    DEBUG(dbgs() << "    NOT Inlining: cost=never"
-                 << ", Call: " << *CS.getInstruction() << "\n");
-    emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
-                           " should never be inlined (cost=never)"));
+    if (Remark) {
+      DEBUG(dbgs() << "    NOT Inlining: cost=never"
+                   << ", Call: " << *CS.getInstruction() << "\n");
+      emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
+                             " should never be inlined (cost=never)"));
+    }
     return false;
   }
 
   Function *Caller = CS.getCaller();
 
   if (!IC) {
-    DEBUG(dbgs() << "    NOT Inlining: cost=" << IC.getCost()
-                 << ", thres=" << (IC.getCostDelta() + IC.getCost())
-                 << ", Call: " << *CS.getInstruction() << "\n");
-    emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
-                           " too costly to inline (cost=") +
-                     Twine(IC.getCost()) + ", threshold=" +
-                     Twine(IC.getCostDelta() + IC.getCost()) + ")");
+    if (Remark) {
+      DEBUG(dbgs() << "    NOT Inlining: cost=" << IC.getCost()
+                   << ", thres=" << (IC.getCostDelta() + IC.getCost())
+                   << ", Call: " << *CS.getInstruction() << "\n");
+      emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
+                             " too costly to inline (cost=") +
+                       Twine(IC.getCost()) + ", threshold=" +
+                       Twine(IC.getCostDelta() + IC.getCost()) + ")");
+    }
     return false;
   }
 
@@ -386,25 +399,31 @@
       TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
 
     if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) {
-      DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction() <<
-           " Cost = " << IC.getCost() <<
-           ", outer Cost = " << TotalSecondaryCost << '\n');
-      emitAnalysis(
-          CS, Twine("Not inlining. Cost of inlining " +
-                    CS.getCalledFunction()->getName() +
-                    " increases the cost of inlining " +
-                    CS.getCaller()->getName() + " in other contexts"));
+      if (Remark) {
+        DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction()
+                     << " Cost = " << IC.getCost()
+                     << ", outer Cost = " << TotalSecondaryCost << '\n');
+        emitAnalysis(CS,
+                     Twine("Not inlining. Cost of inlining " +
+                           CS.getCalledFunction()->getName() +
+                           " increases the cost of inlining " +
+                           CS.getCaller()->getName() + " in other contexts"));
+      }
+      ABC = true;
       return false;
     }
   }
 
-  DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
-        << ", thres=" << (IC.getCostDelta() + IC.getCost())
-        << ", Call: " << *CS.getInstruction() << '\n');
-  emitAnalysis(
-      CS, CS.getCalledFunction()->getName() + Twine(" can be inlined into ") +
-              CS.getCaller()->getName() + " with cost=" + Twine(IC.getCost()) +
-              " (threshold=" + Twine(IC.getCostDelta() + IC.getCost()) + ")");
+  if (Remark) {
+    DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
+                 << ", thres=" << (IC.getCostDelta() + IC.getCost())
+                 << ", Call: " << *CS.getInstruction() << '\n');
+    emitAnalysis(CS, CS.getCalledFunction()->getName() +
+                         Twine(" can be inlined into ") +
+                         CS.getCaller()->getName() + " with cost=" +
+                         Twine(IC.getCost()) + " (threshold=" +
+                         Twine(IC.getCostDelta() + IC.getCost()) + ")");
+  }
   return true;
 }
 
@@ -488,12 +507,76 @@
   InlinedArrayAllocasTy InlinedArrayAllocas;
   InlineFunctionInfo InlineInfo(&CG, AA, ACT);
 
+  // Solve the A->B->C issue (the ABC case for short). We check all call sites
+  // once before doing the real inlining transformation. If any positive
+  // A->B->C case is captured, we exit the analysis of the current function
+  // immediately, because C can still be inlined into A after B has been
+  // inlined into A. Likewise, call sites such as B->X, where X can be inlined
+  // into B but the A->B->X situation is negative, may still get an inlining
+  // opportunity after B is inlined into A. The early exit for the ABC case
+  // can significantly reduce compile time for C++ programs because all the
+  // analysis of late-called callees is skipped.
+  //
+  // FIXME: This may not be accurate enough, because inlining an early-called
+  // function could simplify the arguments of a late-called function, so some
+  // positive A->B->C cases could eventually be missed.
+
+  // Collect all callers and call sites for the ABC case check.
+  SmallSetVector<Function *, 16> Callers;
+  SmallSetVector<CallSite, 16> CallSitesForABC;
+  for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
+    CallSite CS = CallSites[CSi].first;
+    Function *Caller = CS.getCaller();
+    Function *Callee = CS.getCalledFunction();
+
+    // If this call site is dead and it is to a readonly function, we should
+    // just delete the call instead of trying to inline it, regardless of
+    // size. This happens because IPSCCP propagates the result out of the
+    // call and then we're left with the dead call.
+    if (isInstructionTriviallyDead(CS.getInstruction(), TLI))
+      continue;
+
+    // We can only inline direct calls to non-declarations.
+    if (!Callee || Callee->isDeclaration())
+      continue;
+
+    // If this call site was obtained by inlining another function, verify
+    // that the include path for the function did not include the callee
+    // itself. If so, we'd be recursively inlining the same function,
+    // which would provide the same callsites, which would cause us to
+    // infinitely inline.
+    int InlineHistoryID = CallSites[CSi].second;
+    if (InlineHistoryID != -1 &&
+        InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
+      continue;
+
+    Callers.insert(Caller);
+    CallSitesForABC.insert(CS);
+  }
+
+  // Analyze all callers once.
+  for (unsigned i = 0; i < Callers.size(); i++)
+    AnalyzeFunc(Callers[i], CallSites);
+
+  // If a positive ABC case is found among the call sites, exit immediately.
+  bool ABC = false;
+
+  for (unsigned i = 0; i < CallSitesForABC.size(); i++)
+    if (!shouldInline(CallSitesForABC[i], ABC, false))
+      if (ABC)
+        return false;
+
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
   bool Changed = false;
   bool LocalChange;
+  Function *LastCaller = nullptr;
+  CallSite LastCS;
   do {
     LocalChange = false;
+    if (CallSites.size())
+      LastCS = CallSites[CallSites.size() - 1].first;
+
     // Iterate over the outer loop because inlining functions can cause indirect
     // calls to become direct calls.
     for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
@@ -532,9 +615,26 @@
       // Get DebugLoc to report. CS will be invalid after Inliner.
       DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
 
+      if (LastCaller != Caller) {
+        // We don't recompute loop info until the caller changes. Inlining
+        // any single call site should not significantly change the loop info
+        // for the other call sites in the same caller. This limits the number
+        // of loop info computations, because they are expensive.
+        AnalyzeFunc(Caller, CallSites);
+        LastCaller = Caller;
+      } else if (LastCS == CS) {
+        // Once we reach the end of the current level of call sites, the call
+        // sites newly introduced by inlining form the next level and have to
+        // be analyzed again to get accurate loop info.
+        LastCaller = nullptr;
+
+        // Mark the end of the next level of call sites.
+        LastCS = CallSites[CallSites.size() - 1].first;
+      }
+
       // If the policy determines that we should inline this function,
       // try to do so.
-      if (!shouldInline(CS)) {
+      if (!shouldInline(CS, ABC, true)) {
         emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc,
                                      Twine(Callee->getName() +
                                            " will not be inlined into " +
@@ -596,16 +696,8 @@
         ++NumDeleted;
       }
 
-      // Remove this call site from the list. If possible, use
-      // swap/pop_back for efficiency, but do not use it if doing so would
-      // move a call site to a function in this SCC before the
-      // 'FirstCallInSCC' barrier.
-      if (SCC.isSingular()) {
-        CallSites[CSi] = CallSites.back();
-        CallSites.pop_back();
-      } else {
-        CallSites.erase(CallSites.begin()+CSi);
-      }
+      // Remove this call site from the list.
+      CallSites.erase(CallSites.begin() + CSi);
       --CSi;
 
       Changed = true;
Index: test/Transforms/Inline/inline-loop.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/inline-loop.ll
@@ -0,0 +1,376 @@
+; Check the default behavior of aggressive inlining.
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 < %s | FileCheck %s
+
+; Check the default behavior without aggressive inlining.
+; RUN: opt -S -inline -inline-threshold=225 < %s | FileCheck %s -check-prefix=NO-AGGRESSIVE
+
+; For aggressive inlining, the default small loop bonus is 400, so we check
+; that a big function is not inlined in a small loop when the bonus is 200.
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 -inline-small-loop-bonus=200 < %s | FileCheck %s -check-prefix=SMALL-LOOP
+
+; For aggressive inlining, the default small loop size is 2, i.e. 2 BBs inside
+; the loop body, so we check that a big function can still be inlined when the
+; loop body has 3 BBs and the limit is raised to 3.
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 -inline-small-loop-size=3 < %s | FileCheck %s -check-prefix=SMALL-LOOP-SIZE
+
+; For aggressive inlining, the default loop bonus is 200, i.e. the threshold
+; is doubled, so we check that a big function can be inlined into a big loop
+; only if we enlarge the loop bonus (here to 400).
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 -inline-loop-bonus=400 < %s | FileCheck %s -check-prefix=BIG-LOOP-BONUS + +@a = global i32 4 + +; This function should be larger than threshold 225 and called inside a loop. +define i32 @HotFunction1(i32 %a) { +entry: + %a1 = load volatile i32, i32* @a + %x1 = add i32 %a1, %a1 + %a2 = load volatile i32, i32* @a + %x2 = add i32 %x1, %a2 + %a3 = load volatile i32, i32* @a + %x3 = add i32 %x2, %a3 + %a4 = load volatile i32, i32* @a + %x4 = add i32 %x3, %a4 + %a5 = load volatile i32, i32* @a + %x5 = add i32 %x4, %a5 + %a6 = load volatile i32, i32* @a + %x6 = add i32 %x5, %a6 + %a7 = load volatile i32, i32* @a + %x7 = add i32 %x6, %a7 + %a8 = load volatile i32, i32* @a + %x8 = add i32 %x7, %a8 + %a9 = load volatile i32, i32* @a + %x9 = add i32 %x8, %a9 + %a10 = load volatile i32, i32* @a + %x10 = add i32 %x9, %a10 + %a11 = load volatile i32, i32* @a + %x11 = add i32 %x10, %a11 + %a12 = load volatile i32, i32* @a + %x12 = add i32 %x11, %a12 + + %a21 = load volatile i32, i32* @a + %x21 = add i32 %x12, %a21 + %a22 = load volatile i32, i32* @a + %x22 = add i32 %x21, %a22 + %a23 = load volatile i32, i32* @a + %x23 = add i32 %x22, %a23 + %a24 = load volatile i32, i32* @a + %x24 = add i32 %x23, %a24 + %a25 = load volatile i32, i32* @a + %x25 = add i32 %x24, %a25 + %a26 = load volatile i32, i32* @a + %x26 = add i32 %x25, %a26 + %a27 = load volatile i32, i32* @a + %x27 = add i32 %x26, %a27 + %a28 = load volatile i32, i32* @a + %x28 = add i32 %x27, %a28 + %a29 = load volatile i32, i32* @a + %x29 = add i32 %x28, %a29 + %a30 = load volatile i32, i32* @a + %x30 = add i32 %x29, %a30 + %a31 = load volatile i32, i32* @a + %x31 = add i32 %x30, %a31 + %a32 = load volatile i32, i32* @a + %x32 = add i32 %x31, %a32 + + %a41 = load volatile i32, i32* @a + %x41 = add i32 %x32, %a41 + %a42 = load volatile i32, i32* @a + %x42 = add i32 %x41, %a42 + %a43 = load volatile i32, i32* @a + %x43 = add i32 %x42, %a43 + %a44 = load volatile i32, i32* @a + %x44 = add i32 %x43, %a44 + %a45 = load volatile i32, i32* @a + %x45 = add i32 %x44, %a45 + %a46 = load volatile i32, i32* @a + %x46 = add i32 %x45, %a46 + %a47 = load volatile i32, i32* @a + %x47 = add i32 %x46, %a47 + %a48 = load volatile i32, i32* @a + %x48 = add i32 %x47, %a48 + %a49 = load volatile i32, i32* @a + %x49 = add i32 %x48, %a49 + %a50 = load volatile i32, i32* @a + %x50 = add i32 %x49, %a50 + %a51 = load volatile i32, i32* @a + %x51 = add i32 %x50, %a51 + %a52 = load volatile i32, i32* @a + %x52 = add i32 %x51, %a52 + + %add = add i32 %x52, %a + ret i32 %add +} + +define i32 @bar1(i32 %n) { +; CHECK-LABEL: @bar1 +; CHECK-NOT: call i32 @HotFunction1(i32 +; NO-AGGRESSIVE-LABEL: @bar1 +; NO-AGGRESSIV: call i32 @HotFunction1(i32 +entry: + %cmp5 = icmp sgt i32 %n, 1 + br i1 %cmp5, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.07 = phi i32 [ %s.1, %for.inc ], [ 0, %for.body.preheader ] + %i.06 = phi i32 [ %inc, %for.inc ], [ 1, %for.body.preheader ] + %rem = srem i32 %i.06, 3 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: + %call = tail call i32 @HotFunction1(i32 %i.06) + %add = add nsw i32 %call, %s.07 + br label %for.inc + +for.inc: + %s.1 = phi i32 [ %add, %if.then ], [ %s.07, %for.body ] + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %s.1.lcssa = phi i32 [ %s.1, %for.inc ] + br label %for.end + 
+for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %s.1.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} + +; This function should be larger than threshold 450 and called inside a small +; loop. +define i32 @HotFunction2(i32 %a) { +entry: + %a1 = load volatile i32, i32* @a + %x1 = add i32 %a1, %a1 + %a2 = load volatile i32, i32* @a + %x2 = add i32 %x1, %a2 + %a3 = load volatile i32, i32* @a + %x3 = add i32 %x2, %a3 + %a4 = load volatile i32, i32* @a + %x4 = add i32 %x3, %a4 + %a5 = load volatile i32, i32* @a + %x5 = add i32 %x4, %a5 + %a6 = load volatile i32, i32* @a + %x6 = add i32 %x5, %a6 + %a7 = load volatile i32, i32* @a + %x7 = add i32 %x6, %a7 + %a8 = load volatile i32, i32* @a + %x8 = add i32 %x7, %a8 + %a9 = load volatile i32, i32* @a + %x9 = add i32 %x8, %a9 + %a10 = load volatile i32, i32* @a + %x10 = add i32 %x9, %a10 + %a11 = load volatile i32, i32* @a + %x11 = add i32 %x10, %a11 + %a12 = load volatile i32, i32* @a + %x12 = add i32 %x11, %a12 + + %a21 = load volatile i32, i32* @a + %x21 = add i32 %x12, %a21 + %a22 = load volatile i32, i32* @a + %x22 = add i32 %x21, %a22 + %a23 = load volatile i32, i32* @a + %x23 = add i32 %x22, %a23 + %a24 = load volatile i32, i32* @a + %x24 = add i32 %x23, %a24 + %a25 = load volatile i32, i32* @a + %x25 = add i32 %x24, %a25 + %a26 = load volatile i32, i32* @a + %x26 = add i32 %x25, %a26 + %a27 = load volatile i32, i32* @a + %x27 = add i32 %x26, %a27 + %a28 = load volatile i32, i32* @a + %x28 = add i32 %x27, %a28 + %a29 = load volatile i32, i32* @a + %x29 = add i32 %x28, %a29 + %a30 = load volatile i32, i32* @a + %x30 = add i32 %x29, %a30 + %a31 = load volatile i32, i32* @a + %x31 = add i32 %x30, %a31 + %a32 = load volatile i32, i32* @a + %x32 = add i32 %x31, %a32 + + %a41 = load volatile i32, i32* @a + %x41 = add i32 %x32, %a41 + %a42 = load volatile i32, i32* @a + %x42 = add i32 %x41, %a42 + %a43 = load volatile i32, i32* @a + %x43 = add i32 %x42, %a43 + %a44 = load volatile i32, i32* @a + %x44 = add i32 %x43, %a44 + %a45 = load volatile i32, i32* @a + %x45 = add i32 %x44, %a45 + %a46 = load volatile i32, i32* @a + %x46 = add i32 %x45, %a46 + %a47 = load volatile i32, i32* @a + %x47 = add i32 %x46, %a47 + %a48 = load volatile i32, i32* @a + %x48 = add i32 %x47, %a48 + %a49 = load volatile i32, i32* @a + %x49 = add i32 %x48, %a49 + %a50 = load volatile i32, i32* @a + %x50 = add i32 %x49, %a50 + %a51 = load volatile i32, i32* @a + %x51 = add i32 %x50, %a51 + %a52 = load volatile i32, i32* @a + %x52 = add i32 %x51, %a52 + + %a61 = load volatile i32, i32* @a + %x61 = add i32 %a52, %a61 + %a62 = load volatile i32, i32* @a + %x62 = add i32 %x61, %a62 + %a63 = load volatile i32, i32* @a + %x63 = add i32 %x62, %a63 + %a64 = load volatile i32, i32* @a + %x64 = add i32 %x63, %a64 + %a65 = load volatile i32, i32* @a + %x65 = add i32 %x64, %a65 + %a66 = load volatile i32, i32* @a + %x66 = add i32 %x65, %a66 + %a67 = load volatile i32, i32* @a + %x67 = add i32 %x66, %a67 + %a68 = load volatile i32, i32* @a + %x68 = add i32 %x67, %a68 + %a69 = load volatile i32, i32* @a + %x69 = add i32 %x68, %a69 + %a610 = load volatile i32, i32* @a + %x610 = add i32 %x69, %a610 + %a611 = load volatile i32, i32* @a + %x611 = add i32 %x610, %a611 + %a612 = load volatile i32, i32* @a + %x612 = add i32 %x611, %a612 + + %a621 = load volatile i32, i32* @a + %x621 = add i32 %x612, %a621 + %a622 = load volatile i32, i32* @a + %x622 = add i32 %x621, %a622 + %a623 = load volatile i32, i32* @a + %x623 = add i32 %x622, %a623 + %a624 = load volatile i32, i32* @a + %x624 = add i32 
%x623, %a624 + %a625 = load volatile i32, i32* @a + %x625 = add i32 %x624, %a625 + %a626 = load volatile i32, i32* @a + %x626 = add i32 %x625, %a626 + %a627 = load volatile i32, i32* @a + %x627 = add i32 %x626, %a627 + %a628 = load volatile i32, i32* @a + %x628 = add i32 %x627, %a628 + %a629 = load volatile i32, i32* @a + %x629 = add i32 %x628, %a629 + %a630 = load volatile i32, i32* @a + %x630 = add i32 %x629, %a630 + %a631 = load volatile i32, i32* @a + %x631 = add i32 %x630, %a631 + %a632 = load volatile i32, i32* @a + %x632 = add i32 %x631, %a632 + + %a641 = load volatile i32, i32* @a + %x641 = add i32 %x632, %a641 + %a642 = load volatile i32, i32* @a + %x642 = add i32 %x641, %a642 + %a643 = load volatile i32, i32* @a + %x643 = add i32 %x642, %a643 + %a644 = load volatile i32, i32* @a + %x644 = add i32 %x643, %a644 + %a645 = load volatile i32, i32* @a + %x645 = add i32 %x644, %a645 + %a646 = load volatile i32, i32* @a + %x646 = add i32 %x645, %a646 + %a647 = load volatile i32, i32* @a + %x647 = add i32 %x646, %a647 + %a648 = load volatile i32, i32* @a + %x648 = add i32 %x647, %a648 + %a649 = load volatile i32, i32* @a + %x649 = add i32 %x648, %a649 + %a650 = load volatile i32, i32* @a + %x650 = add i32 %x649, %a650 + %a651 = load volatile i32, i32* @a + %x651 = add i32 %x650, %a651 + %a652 = load volatile i32, i32* @a + %x652 = add i32 %x651, %a652 + + %add = add i32 %x652, %a + ret i32 %add +} + +define i32 @bar2(i32 %n) { +; CHECK-LABEL: @bar +; CHECK-NOT: call i32 @HotFunction2(i32 +; NO-AGGRESSIVE-LABEL: @bar2 +; NO-AGGRESSIV: call i32 @HotFunction2(i32 +; SMALL-LOOP-LABEL: @bar2 +; SMALL-LOOP: call i32 @HotFunction2(i32 +entry: + %cmp4 = icmp sgt i32 %n, 1 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %i.05 = phi i32 [ %inc, %for.body ], [ 1, %for.body.preheader ] + %call = tail call i32 @HotFunction2(i32 %i.05) + %add = add nsw i32 %call, %s.06 + %inc = add nuw nsw i32 %i.05, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %add.lcssa = phi i32 [ %add, %for.body ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} + +; This function has loop with > 2BB in loop body, so it can't get enough +; bonus to inline big callee HotFunction2. 
+define i32 @bar3(i32 %n) { +; CHECK-LABEL: @bar3 +; CHECK: call i32 @HotFunction2(i32 +; SMALL-LOOP-SIZE-LABEL: @bar3 +; SMALL-LOOP-SIZE-NOT: call i32 @HotFunction2(i32 +; BIG-LOOP-BONUS-LABEL: @bar3 +; BIG-LOOP-BONUS-NOT: call i32 @HotFunction2(i32 +entry: + %cmp5 = icmp sgt i32 %n, 1 + br i1 %cmp5, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.07 = phi i32 [ %s.1, %for.inc ], [ 0, %for.body.preheader ] + %i.06 = phi i32 [ %inc, %for.inc ], [ 1, %for.body.preheader ] + %rem = srem i32 %i.06, 3 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: + %call = tail call i32 @HotFunction2(i32 %i.06) + %add = add nsw i32 %call, %s.07 + br label %for.inc + +for.inc: + %s.1 = phi i32 [ %add, %if.then ], [ %s.07, %for.body ] + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %s.1.lcssa = phi i32 [ %s.1, %for.inc ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %s.1.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} + Index: test/Transforms/Inline/inline-misc.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/inline-misc.ll @@ -0,0 +1,128 @@ +; For aggressive inline, by default +; (1) The threshold bonus for callee with constant argument is 200, i.e. +; enlarge threshold 2 times. +; (2) The max number of same callees is 10. i.e. the callees called >10 times +; can't be inlined. +; RUN: opt -inline -S -inline-threshold=75 -inline-aggressive < %s | FileCheck %s +; RUN: opt -inline -S -inline-threshold=75 < %s | FileCheck %s -check-prefix=NO-AGGRESSIVE + +; Negatively check big function can't be inlined without enlarging threshold +; for function with constant argument. +; RUN: opt -inline -S -inline-threshold=75 -inline-aggressive -inline-const-arg-bonus=100 < %s | FileCheck %s -check-prefix=LOW-CONST-ARG-BONUS + +; Lower the max number of same callees to check if -inline-callee-num can work. 
+; RUN: opt -inline -S -inline-threshold=75 -inline-aggressive -inline-callee-num=9 < %s | FileCheck %s -check-prefix=CALLEE-NUM + +declare i32 @HotFunction(i32) + +define i32 @bar(i32 %n) { +entry: + %cmp29 = icmp sgt i32 %n, 1 + br i1 %cmp29, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.031 = phi i32 [ %s.1, %for.inc ], [ 0, %for.body.preheader ] + %i.030 = phi i32 [ %inc, %for.inc ], [ 1, %for.body.preheader ] + %rem = srem i32 %i.030, 3 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: + %call = tail call i32 @HotFunction(i32 %i.030) + %add = add nsw i32 %call, %s.031 + br label %for.inc + +for.inc: + %s.1 = phi i32 [ %add, %if.then ], [ %s.031, %for.body ] + %inc = add nuw nsw i32 %i.030, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %s.1.lcssa = phi i32 [ %s.1, %for.inc ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %s.1.lcssa, %for.end.loopexit ] + %mul = shl nsw i32 %n, 1 + %call1 = tail call i32 @HotFunction(i32 %mul) + %add2 = add nsw i32 %call1, %s.0.lcssa + %cmp426 = icmp slt i32 %n, 1 + br i1 %cmp426, label %for.body5.preheader, label %for.end13 + +for.body5.preheader: + br label %for.body5 + +for.body5: + %s.228 = phi i32 [ %s.3, %for.inc12 ], [ %add2, %for.body5.preheader ] + %i.127 = phi i32 [ %sub, %for.inc12 ], [ %n, %for.body5.preheader ] + %rem6 = srem i32 %i.127, 3 + %tobool7 = icmp eq i32 %rem6, 0 + br i1 %tobool7, label %for.inc12, label %if.then8 + +if.then8: + %call9 = tail call i32 @HotFunction(i32 %i.127) + %add10 = add nsw i32 %call9, %s.228 + br label %for.inc12 + +for.inc12: + %s.3 = phi i32 [ %add10, %if.then8 ], [ %s.228, %for.body5 ] + %sub = add nsw i32 %i.127, -2 + %cmp4 = icmp slt i32 %sub, 1 + br i1 %cmp4, label %for.body5, label %for.end13.loopexit + +for.end13.loopexit: + %s.3.lcssa = phi i32 [ %s.3, %for.inc12 ] + br label %for.end13 + +for.end13: + %s.2.lcssa = phi i32 [ %add2, %for.end ], [ %s.3.lcssa, %for.end13.loopexit ] + ret i32 %s.2.lcssa +} + +define i32 @foo(i32 %x) { +; CHECK-LABEL: @foo +; CHECK-NOT: call i32 @bar( +; NO-AGGRESSIVE-LABEL: @foo +; NO-AGGRESSIVE: call i32 @bar( +; LOW-CONST-ARG-BONUS-LABEL: @foo +; LOW-CONST-ARG-BONUS: call i32 @bar( +entry: + %call = tail call i32 @bar(i32 1000) + ret i32 %call +} + +define i32 @goo(i32 %x) { +; CHECK-LABEL: @goo +; CHECK-NOT: call i32 @bar( +; NO-AGGRESSIVE-LABEL: @goo +; NO-AGGRESSIVE: call i32 @bar( +; CALLEE-NUM-LABEL: @goo +; CALLEE-NUM: call i32 @bar( +entry: + %a1 = tail call i32 @bar(i32 1000) + %x1 = add i32 %a1, %a1 + %a2 = tail call i32 @bar(i32 1001) + %x2 = add i32 %x1, %a2 + %a3 = tail call i32 @bar(i32 1002) + %x3 = add i32 %x2, %a3 + %a4 = tail call i32 @bar(i32 1003) + %x4 = add i32 %x3, %a4 + %a5 = tail call i32 @bar(i32 1004) + %x5 = add i32 %x4, %a5 + %a6 = tail call i32 @bar(i32 1005) + %x6 = add i32 %x5, %a6 + %a7 = tail call i32 @bar(i32 1006) + %x7 = add i32 %x6, %a7 + %a8 = tail call i32 @bar(i32 1007) + %x8 = add i32 %x7, %a8 + %a9 = tail call i32 @bar(i32 1008) + %x9 = add i32 %x8, %a9 + %a10 = tail call i32 @bar(i32 1009) + %x10 = add i32 %x9, %a10 + + ret i32 %x10 +}
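
Note on the threshold arithmetic introduced in SimpleInliner::TuneThreshold above: at most one bonus applies per call site, the bonuses are percentages of the base threshold, and the loop bonuses take precedence over the constant-argument bonus. The following standalone C++ sketch is not part of the patch; it merely mirrors that arithmetic under the default flag values defined in InlineSimple.cpp (-inline-small-loop-size=2, -inline-small-loop-bonus=400, -inline-loop-bonus=200, -inline-const-arg-bonus=200). The function name tuneThresholdSketch and its parameters are made up for illustration, and the per-callee call-count cap and debug output are omitted.

// Standalone sketch (not part of the patch) of the bonus arithmetic in
// SimpleInliner::TuneThreshold, assuming the default flag values above.
#include <cstdio>

static unsigned tuneThresholdSketch(unsigned Thres, unsigned LoopDepth,
                                    unsigned BlocksInLoop,
                                    bool ConstArgAndCalleeHasLoop) {
  const unsigned SmallLoopSize = 2;    // -inline-small-loop-size
  const unsigned SmallLoopBonus = 400; // -inline-small-loop-bonus (percent)
  const unsigned LoopBonus = 200;      // -inline-loop-bonus (percent)
  const unsigned ConstArgBonus = 200;  // -inline-const-arg-bonus (percent)

  if (LoopDepth) // the call site sits inside a loop in the caller
    return BlocksInLoop <= SmallLoopSize ? Thres * SmallLoopBonus / 100
                                         : Thres * LoopBonus / 100;
  if (ConstArgAndCalleeHasLoop) // constant actual argument, callee has a loop
    return Thres * ConstArgBonus / 100;
  return Thres; // no bonus applies
}

int main() {
  // With the -O2 base threshold of 225:
  printf("small loop: %u\n", tuneThresholdSketch(225, 1, 2, false)); // 900
  printf("big loop:   %u\n", tuneThresholdSketch(225, 1, 8, false)); // 450
  printf("const arg:  %u\n", tuneThresholdSketch(225, 0, 0, true));  // 450
  return 0;
}

For example, with the base threshold of 225 used in inline-loop.ll, a call site inside a two-block loop is checked against 900, while a call site in a larger loop is checked against 450.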