Index: lib/Transforms/Utils/BypassSlowDivision.cpp
===================================================================
--- lib/Transforms/Utils/BypassSlowDivision.cpp
+++ lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -17,6 +17,7 @@
 
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -74,6 +75,15 @@
 }
 
 namespace {
+enum ValueRange {
+  /// Operand definitely fits BypassType. No runtime checks are needed.
+  VALRNG_SHORT,
+  /// A runtime check is required, as value range is unknown.
+  VALRNG_UNKNOWN,
+  /// Operand is unlikely to fit BypassType. The bypassing should be disabled.
+  VALRNG_LONG
+};
+
 class FastDivInsertionTask {
   // These fields are set during initialization.
   unsigned Opcode;
@@ -95,7 +105,7 @@
   BasicBlock *createFastBB(BasicBlock *Successor);
   void createDivRemPhiNodes(BasicBlock *ShortBB, BasicBlock *LongBB,
                             BasicBlock *PhiBB, DivCacheTy &PerBBDivCache);
-  Value *createDivRunTimeCheck();
+  Value *createDivRunTimeCheck(Value *Op1, Value *Op2);
 
 public:
   /// Sets up the object to work with instruction \p I. Returns false if the
@@ -115,6 +125,9 @@
     return Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
   }
 
+  /// Check if an integer value fits into a smaller type.
+  ValueRange getValueRange(Value *Op, const DataLayout &DL);
+
   /// Tries to replace a division with a faster code and returns true if
   /// succeeds.
   bool insertFastDiv(DivCacheTy &PerBBDivCache);
@@ -138,6 +151,25 @@
 };
 }
 
+ValueRange FastDivInsertionTask::getValueRange(Value *Op,
+                                               const DataLayout &DL) {
+  unsigned ShortLen = BypassType->getBitWidth();
+  unsigned LongLen = SlowType->getBitWidth();
+  unsigned HiBits = LongLen - ShortLen;
+
+  APInt Zeros(LongLen, 0), Ones(LongLen, 0);
+
+  computeKnownBits(Op, Zeros, Ones, DL);
+
+  if (Zeros.countLeadingOnes() >= HiBits)
+    return VALRNG_SHORT;
+
+  if (Ones.countLeadingZeros() < HiBits)
+    return VALRNG_LONG;
+
+  return VALRNG_UNKNOWN;
+}
+
 bool FastDivInsertionTask::setScalarIDivOp(Instruction *I) {
   SlowDiv = I;
   Opcode = I->getOpcode();
@@ -236,19 +268,17 @@
 // Creates a run-time check to test whether both operands fit the shorter type.
 // The check is inserted at the end of MainBB.
 // True return value means that the operands fit.
-Value *FastDivInsertionTask::createDivRunTimeCheck() {
+// Either of the operands may be NULL if it doesn't need a runtime check.
+Value *FastDivInsertionTask::createDivRunTimeCheck(Value *Op1, Value *Op2) {
   IRBuilder<> Builder(MainBB, MainBB->end());
 
-  // We should have bailed out above if the divisor is a constant, but the
-  // dividend may still be a constant.  Set OrV to our non-constant operands
-  // OR'ed together.
-  assert(!isa<ConstantInt>(Divisor));
-
-  Value *OrV;
-  if (!isa<ConstantInt>(Dividend))
-    OrV = Builder.CreateOr(Dividend, Divisor);
+  Value *OrV = nullptr;
+  if (Op1 && Op2)
+    OrV = Builder.CreateOr(Op1, Op2);
   else
-    OrV = Divisor;
+    OrV = Op1 ? Op1 : Op2;
+
+  assert(OrV && "Nothing to check");
 
   // BitMask is inverted to check if the operands are
   // larger than the bypass type
@@ -265,25 +295,67 @@
 // possible and the longer-slower div/rem instruction otherwise.
 bool FastDivInsertionTask::insertFastDiv(DivCacheTy &PerBBDivCache) {
   if (isa<ConstantInt>(Divisor)) {
-    // Division by a constant should have been been solved and replaced earlier
-    // in the pipeline.
+    // Keep division by a constant for DAGCombiner.
     return false;
   }
 
-  // If the numerator is a constant, bail if it doesn't fit into BypassType.
-  if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
-    if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
-      return false;
-
-  // Basic Block is split before divide
-  BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDiv);
-  MainBB->getInstList().back().eraseFromParent();
-  BasicBlock *FastBB = createFastBB(SuccessorBB);
-  BasicBlock *SlowBB = createSlowBB(SuccessorBB);
-  createDivRemPhiNodes(FastBB, SlowBB, SuccessorBB, PerBBDivCache);
-  Value *CmpV = createDivRunTimeCheck();
-  IRBuilder<> Builder(MainBB, MainBB->end());
-  Builder.CreateCondBr(CmpV, FastBB, SlowBB);
+  const DataLayout &DL = SlowDiv->getModule()->getDataLayout();
+
+  ValueRange DividendRange = getValueRange(Dividend, DL);
+  if (DividendRange == VALRNG_LONG)
+    return false;
+
+  ValueRange DivisorRange = getValueRange(Divisor, DL);
+  if (DivisorRange == VALRNG_LONG)
+    return false;
+
+  bool DividendShort = (DividendRange == VALRNG_SHORT);
+  bool DivisorShort = (DivisorRange == VALRNG_SHORT);
+
+  if (DividendShort && DivisorShort) {
+    // If both operands are known to be short then just replace the long
+    // division with a short one.
+    IRBuilder<> Builder(SlowDiv);
+    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+    Value *TruncResult = isDivisionOp()
+                             ? Builder.CreateUDiv(TruncDividend, TruncDivisor)
+                             : Builder.CreateURem(TruncDividend, TruncDivisor);
+    Value *Result = Builder.CreateZExt(TruncResult, SlowType);
+    SlowDiv->replaceAllUsesWith(Result);
+    SlowDiv->eraseFromParent();
+    SlowDiv = nullptr;
+  } else if (DividendShort && !isSignedDiv()) {
+    // If the division is unsigned and Dividend is known to be short, then
+    // either
+    // 1) Divisor is less or equal to Dividend, and the result can be computed
+    //    with a short division.
+    // 2) Divisor is greater than Dividend. In this case, no division is needed
+    //    at all: The quotient is 0 and the remainder is equal to Dividend.
+    //
+    // So instead of checking at runtime whether Divisor fits into BypassType,
+    // we emit a runtime check to differentiate between these two cases. This
+    // lets us entirely avoid a long div.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDiv);
+    MainBB->getInstList().back().eraseFromParent();
+    LongQuotientV = ConstantInt::get(SlowType, 0);
+    LongRemainderV = Dividend;
+    BasicBlock *DivBB = createFastBB(SuccessorBB);
+    createDivRemPhiNodes(DivBB, MainBB, SuccessorBB, PerBBDivCache);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+    Builder.CreateCondBr(CmpV, DivBB, SuccessorBB);
+  } else {
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDiv);
+    MainBB->getInstList().back().eraseFromParent();
+    BasicBlock *FastBB = createFastBB(SuccessorBB);
+    BasicBlock *SlowBB = createSlowBB(SuccessorBB);
+    createDivRemPhiNodes(FastBB, SlowBB, SuccessorBB, PerBBDivCache);
+    Value *CmpV = createDivRunTimeCheck(DividendShort ? nullptr : Dividend,
+                                        DivisorShort ? nullptr : Divisor);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Builder.CreateCondBr(CmpV, FastBB, SlowBB);
+  }
 
   return true;
 }
Index: test/CodeGen/X86/bypass-slow-division-64.ll
===================================================================
--- test/CodeGen/X86/bypass-slow-division-64.ll
+++ test/CodeGen/X86/bypass-slow-division-64.ll
@@ -76,3 +76,88 @@
   %result = add i64 %resultdiv, %resultrem
   ret i64 %result
 }
+
+; No bypassing should be done in apparently unsuitable cases.
+define i64 @Test_no_bypassing(i32 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_no_bypassing:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notq %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rsi
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    retq
+  %a.1 = zext i32 %a to i64
+  ; %a.2 is always negative so the division cannot be bypassed.
+  %a.2 = sub i64 -1, %a.1
+  %res = srem i64 %a.2, %b
+  ret i64 %res
+}
+
+; No OR instruction is needed if one of the operands (divisor) is known
+; to fit into 32 bits.
+define i64 @Test_check_one_operand(i64 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_check_one_operand:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    je .LBB4_1
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rcx
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB4_1:
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    divl %ecx
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT:    retq
+  %b.1 = zext i32 %b to i64
+  %res = sdiv i64 %a, %b.1
+  ret i64 %res
+}
+
+; If both operands are known to fit into 32 bits, then replace the division
+; in-place without CFG modification.
+define i64 @Test_check_none(i64 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_check_none:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    divl %esi
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT:    retq
+  %a.1 = and i64 %a, 4294967295
+  %b.1 = zext i32 %b to i64
+  %res = udiv i64 %a.1, %b.1
+  ret i64 %res
+}
+
+; In case of unsigned long division with a short dividend,
+; the long division is not needed any more.
+define i64 @Test_special_case(i32 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_special_case:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    cmpq %rsi, %rcx
+; CHECK-NEXT:    jb .LBB6_1
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    divl %esi
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB6_1:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    retq
+  %a.1 = zext i32 %a to i64
+  %div = udiv i64 %a.1, %b
+  %rem = urem i64 %a.1, %b
+  %res = add i64 %div, %rem
+  ret i64 %res
+}