Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -640,6 +640,9 @@
   /// \brief Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
 
+  /// \return True if the target can execute instructions out of order.
+  bool isOutOfOrder() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
@@ -1017,6 +1020,7 @@
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
+  virtual bool isOutOfOrder() const = 0; // Implementations must provide it.
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
@@ -1290,6 +1294,9 @@
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
+  bool isOutOfOrder() const override {
+    return Impl.isOutOfOrder(); // Forward to the wrapped implementation.
+  }
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -335,6 +335,8 @@
     return TTI::TCC_Free;
   }
 
+  bool isOutOfOrder() const { return false; } // Default answer: false.
+
   unsigned getNumberOfRegisters(bool Vector) { return 8; }
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -394,6 +394,10 @@
     return BaseT::getInstructionLatency(I);
   }
 
+  bool isOutOfOrder() const {
+    // The answer comes from the scheduling model reported by getST().
+    return getST()->getSchedModel().isOutOfOrder();
+  }
+
   /// @}
 
   /// \name Vector TTI Implementations
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -315,6 +315,10 @@
   return Cost;
 }
 
+bool TargetTransformInfo::isOutOfOrder() const {
+  return TTIImpl->isOutOfOrder(); // Delegate to the implementation object.
+}
+
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
   return TTIImpl->getNumberOfRegisters(Vector);
 }
Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -127,6 +127,11 @@
     cl::desc("Limit maximum recursion depth when calculating costs of "
              "speculatively executed instructions"));
 
+static cl::opt<unsigned> DependenceChainLatency(
+    "dependence-chain-latency", cl::Hidden, cl::init(8), // TCK_Latency units
+    cl::desc("Limit the maximum latency of dependence chain containing cmp "
+             "for if conversion"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -395,6 +400,148 @@
   return true;
 }
 
+/// Sum the TCK_CodeSize cost of the instructions preceding \p BB's terminator.
+static unsigned CountBBCodeSize(BasicBlock *BB,
+                                const TargetTransformInfo &TTI) {
+  unsigned Total = 0;
+  for (BasicBlock::iterator It = BB->begin(); !isa<TerminatorInst>(It); ++It)
+    Total += TTI.getInstructionCost(&*It, TargetTransformInfo::TCK_CodeSize);
+  return Total;
+}
+
+/// Record an accumulated latency for every non-terminator instruction of BB
+/// in Instructions: its own TCK_Latency cost plus the largest latency already
+/// recorded for one of its operands. If LongestChain is true, return the
+/// largest value recorded; otherwise return the value recorded for the branch
+/// condition of BB's terminator, or 0 if that is not a recorded instruction.
+static unsigned FindDependenceChainLatency(BasicBlock *BB,
+                            DenseMap<Instruction *, unsigned> &Instructions,
+                            const TargetTransformInfo &TTI,
+                            bool LongestChain) {
+  unsigned LongestLatency = 0;
+  BasicBlock::iterator It = BB->begin();
+  for (; !isa<TerminatorInst>(It); ++It) {
+    Instruction *Inst = &*It;
+    // Begin with the slowest operand whose latency is already in the map.
+    unsigned Accum = 0;
+    for (const Use &Operand : Inst->operands())
+      if (auto *Def = dyn_cast<Instruction>(Operand.get())) {
+        unsigned DefLatency = Instructions.lookup(Def);
+        if (DefLatency > Accum)
+          Accum = DefLatency;
+      }
+    Accum += TTI.getInstructionCost(Inst, TargetTransformInfo::TCK_Latency);
+    Instructions[Inst] = Accum;
+
+    if (Accum > LongestLatency)
+      LongestLatency = Accum;
+  }
+
+  if (LongestChain)
+    return LongestLatency;
+
+  // The caller wants the chain feeding the compare, so the terminator the
+  // loop stopped at has to be a BranchInst.
+  assert(isa<BranchInst>(It));
+  auto *Branch = cast<BranchInst>(It);
+  auto *Cond = dyn_cast<Instruction>(Branch->getCondition());
+  // Yields 0 when the condition is not an instruction recorded in the map.
+  return Cond ? Instructions.lookup(Cond) : 0;
+}
+
+/// Instructions in BB2 may depend on instructions in BB1, and instructions
+/// in BB1 may have users in BB2. If the last (in terms of latency) such kind
+/// of instruction in BB1 is I, then the instructions after I can be executed
+/// in parallel with instructions in BB2.
+/// This function returns the latency of I, or 0 if no such I is found.
+static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2,
+                        BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                        DenseMap<Instruction *, unsigned> &BB1_instructions) {
+  unsigned LastLatency = 0;
+  SmallVector<Instruction *, 16> Worklist;
+  for (auto II = BB2->begin(); !isa<TerminatorInst>(II); ++II) {
+    if (PHINode *PN = dyn_cast<PHINode>(II)) {
+      bool InBBUser = false;
+      for (User *U : PN->users())
+        if (cast<Instruction>(U)->getParent() == BB2) {
+          InBBUser = true;
+          break;
+        }
+      // No such user, we don't care about this PHI and its operands. Skip
+      // only this PHI: a 'break' would also drop every later instruction.
+      if (!InBBUser)
+        continue;
+    }
+    Worklist.push_back(&(*II));
+  }
+
+  // Trace operands back through IfBlock1/IfBlock2 until they reach BB1.
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) {
+      if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) {
+        if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2)
+          Worklist.push_back(Op);
+        else if (Op->getParent() == BB1) {
+          unsigned OpLatency = BB1_instructions.lookup(Op);
+          if (OpLatency > LastLatency)
+            LastLatency = OpLatency;
+        }
+      }
+    }
+  }
+
+  return LastLatency;
+}
+
+/// Decide whether if-converting would create a long, slow dependence chain.
+/// With a branch, the code after it depends on the compare only through
+/// control flow, so an out-of-order core can run the chain feeding the compare
+/// in parallel with it; a select turns that into a data dependence, which can
+/// be slower even if the branch has a high miss rate.
+/// \returns true if such a chain is found (callers then skip if conversion).
+static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2,
+                             BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                             unsigned Speculation_size,
+                             const TargetTransformInfo &TTI) {
+  if (!TTI.isOutOfOrder())
+    return false;
+
+  unsigned New_BB_size = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI)
+                         + Speculation_size;
+
+  // We check small BB only since it is more difficult to find unrelated
+  // instructions to fill functional units in small BB.
+  if (New_BB_size > 40)
+    return false;
+
+  // Accumulated latency of each instruction in their BBs.
+  DenseMap<Instruction *, unsigned> BB1_instructions;
+  DenseMap<Instruction *, unsigned> BB2_instructions;
+  unsigned BB1_chain =
+      FindDependenceChainLatency(BB1, BB1_instructions, TTI, false);
+  unsigned BB2_chain =
+      FindDependenceChainLatency(BB2, BB2_instructions, TTI, true);
+
+  // If we have a good ILP (IPC>=2) in new BB, then we don't care about the
+  // latency of the dependence chain.
+  if ((BB1_chain + BB2_chain) * 2 <= New_BB_size)
+    return false;
+
+  // We only care about part of the dependence chain in BB1 that can be
+  // executed in parallel with BB2, so adjust the latency. The adjustment can
+  // exceed the compare's chain (e.g. BB2 uses a slower BB1 instruction), so
+  // clamp at zero instead of letting the unsigned subtraction wrap around.
+  unsigned Adjustment =
+      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1_instructions);
+  BB1_chain = BB1_chain > Adjustment ? BB1_chain - Adjustment : 0;
+
+  // Correctly predicted branch instruction can skip the dependence chain in
+  // BB1, but misprediction has a penalty, so only when the dependence chain is
+  // longer than DependenceChainLatency, then branch is better than select.
+  return BB1_chain >= DependenceChainLatency;
+}
+
 /// Extract ConstantInt from value, looking through IntToPtr
 /// and PointerNullValue. Return NULL if value is not a constant int.
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
@@ -2048,6 +2195,11 @@
   if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
     return false;
 
+  // Don't do if conversion for long dependence chain.
+  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr,
+                              CountBBCodeSize(ThenBB, TTI), TTI))
+    return false;
+
   // If we get here, we can hoist the instruction and if-convert.
   DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
@@ -2355,6 +2507,10 @@
       }
   }
 
+  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2,
+                              AggressiveInsts.size(), TTI))
+    return false;
+
   DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond << "  T: "
                << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n");
 
Index: test/Transforms/SimplifyCFG/X86/if-conversion.ll
===================================================================
--- test/Transforms/SimplifyCFG/X86/if-conversion.ll
+++ test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i64 @foo(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+; CHECK-LABEL: @foo
+; CHECK-NOT: select
+}
+
+define i64 @bar(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = add i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: select
+}
+