Index: llvm/trunk/lib/Transforms/Utils/BypassSlowDivision.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ llvm/trunk/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -17,6 +17,7 @@
 
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -83,17 +84,28 @@
 }
 
 namespace {
+enum ValueRange {
+  /// Operand definitely fits into BypassType. No runtime checks are needed.
+  VALRNG_SHORT,
+  /// A runtime check is required, as value range is unknown.
+  VALRNG_UNKNOWN,
+  /// Operand is unlikely to fit into BypassType. The bypassing should be
+  /// disabled.
+  VALRNG_LONG
+};
+
 class FastDivInsertionTask {
   bool IsValidTask = false;
   Instruction *SlowDivOrRem = nullptr;
   IntegerType *BypassType = nullptr;
   BasicBlock *MainBB = nullptr;
 
+  ValueRange getValueRange(Value *Op);
   QuotRemWithBB createSlowBB(BasicBlock *Successor);
   QuotRemWithBB createFastBB(BasicBlock *Successor);
   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
                                    BasicBlock *PhiBB);
-  Value *insertOperandRuntimeCheck();
+  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
   Optional<QuotRemPair> insertFastDivAndRem();
 
   bool isSignedOp() {
@@ -175,6 +187,28 @@
   return isDivisionOp() ? Value.Quotient : Value.Remainder;
 }
 
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V) {
+  unsigned ShortLen = BypassType->getBitWidth();
+  unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+  unsigned HiBits = LongLen - ShortLen;
+
+  const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+  APInt Zeros(LongLen, 0), Ones(LongLen, 0);
+
+  computeKnownBits(V, Zeros, Ones, DL);
+
+  if (Zeros.countLeadingOnes() >= HiBits)
+    return VALRNG_SHORT;
+
+  if (Ones.countLeadingZeros() < HiBits)
+    return VALRNG_LONG;
+
+  return VALRNG_UNKNOWN;
+}
+
 /// Add new basic block for slow div and rem operations and put it before
 /// SuccessorBB.
 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
@@ -241,22 +275,17 @@
 
 /// Creates a runtime check to test whether both the divisor and dividend fit
 /// into BypassType. The check is inserted at the end of MainBB. True return
-/// value means that the operands fit.
-Value *FastDivInsertionTask::insertOperandRuntimeCheck() {
+/// value means that the operands fit. Either of the operands may be NULL if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+  assert((Op1 || Op2) && "Nothing to check");
   IRBuilder<> Builder(MainBB, MainBB->end());
-  Value *Dividend = SlowDivOrRem->getOperand(0);
-  Value *Divisor = SlowDivOrRem->getOperand(1);
-
-  // We should have bailed out above if the divisor is a constant, but the
-  // dividend may still be a constant.  Set OrV to our non-constant operands
-  // OR'ed together.
-  assert(!isa<ConstantInt>(Divisor));
 
   Value *OrV;
-  if (!isa<ConstantInt>(Dividend))
-    OrV = Builder.CreateOr(Dividend, Divisor);
+  if (Op1 && Op2)
+    OrV = Builder.CreateOr(Op1, Op2);
   else
-    OrV = Divisor;
+    OrV = Op1 ? Op1 : Op2;
 
   // BitMask is inverted to check if the operands are
   // larger than the bypass type
@@ -279,22 +308,72 @@
     return None;
   }
 
-  // If the numerator is a constant, bail if it doesn't fit into BypassType.
-  if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
-    if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
-      return None;
-
-  // Split the basic block before the div/rem.
-  BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
-  // Remove the unconditional branch from MainBB to SuccessorBB.
-  MainBB->getInstList().back().eraseFromParent();
-  QuotRemWithBB Fast = createFastBB(SuccessorBB);
-  QuotRemWithBB Slow = createSlowBB(SuccessorBB);
-  QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
-  Value *CmpV = insertOperandRuntimeCheck();
-  IRBuilder<> Builder(MainBB, MainBB->end());
-  Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
-  return Result;
+  ValueRange DividendRange = getValueRange(Dividend);
+  if (DividendRange == VALRNG_LONG)
+    return None;
+
+  ValueRange DivisorRange = getValueRange(Divisor);
+  if (DivisorRange == VALRNG_LONG)
+    return None;
+
+  bool DividendShort = (DividendRange == VALRNG_SHORT);
+  bool DivisorShort = (DivisorRange == VALRNG_SHORT);
+
+  if (DividendShort && DivisorShort) {
+    // If both operands are known to be short then just replace the long
+    // division with a short one in-place.
+
+    IRBuilder<> Builder(SlowDivOrRem);
+    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+    return QuotRemPair(ExtDiv, ExtRem);
+  } else if (DividendShort && !isSignedOp()) {
+    // If the division is unsigned and Dividend is known to be short, then
+    // either
+    // 1) Divisor is less or equal to Dividend, and the result can be computed
+    //    with a short division.
+    // 2) Divisor is greater than Dividend. In this case, no division is needed
+    //    at all: The quotient is 0 and the remainder is equal to Dividend.
+    //
+    // So instead of checking at runtime whether Divisor fits into BypassType,
+    // we emit a runtime check to differentiate between these two cases. This
+    // lets us entirely avoid a long div.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Long;
+    Long.BB = MainBB;
+    Long.Quotient = ConstantInt::get(getSlowType(), 0);
+    Long.Remainder = Dividend;
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+    return Result;
+  } else {
+    // General case. Create both slow and fast div/rem pairs and choose one of
+    // them at runtime.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+                                            DivisorShort ? nullptr : Divisor);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+    return Result;
+  }
 }
 
 /// This optimization identifies DIV/REM instructions in a BB that can be
Index: llvm/trunk/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; No bypassing should be done in apparently unsuitable cases.
+define void @Test_no_bypassing(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_no_bypassing(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[A_2:%.*]] = sub i64 -1, [[A_1]]
+; CHECK-NEXT:    [[RES:%.*]] = srem i64 [[A_2]], [[B:%.*]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  ; %a.2 is always negative so the division cannot be bypassed.
+  %a.2 = sub i64 -1, %a.1
+  %res = srem i64 %a.2, %b
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; No OR instruction is needed if one of the operands (divisor) is known
+; to fit into 32 bits.
+define void @Test_check_one_operand(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_one_operand(
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[A:%.*]], -4294967296
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK:         [[TMP4:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP10:%.*]]
+; CHECK:         [[TMP9:%.*]] = sdiv i64 [[A]], [[B_1]]
+; CHECK-NEXT:    br label [[TMP10]]
+; CHECK:         [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT:    store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %b.1 = zext i32 %b to i64
+  %res = sdiv i64 %a, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; If both operands are known to fit into 32 bits, then replace the division
+; in-place without CFG modification.
+define void @Test_check_none(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_none(
+; CHECK-NEXT:    [[A_1:%.*]] = and i64 [[A:%.*]], 4294967295
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = and i64 %a, 4294967295
+  %b.1 = zext i32 %b to i64
+  %res = udiv i64 %a.1, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; In case of unsigned long division with a short dividend,
+; the long division is not needed any more.
+define void @Test_special_case(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_special_case(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[A_1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP9:%.*]]
+; CHECK:         [[TMP3:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = urem i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP9]]
+; CHECK:         [[TMP10:%.*]] = phi i64 [ [[TMP7]], [[TMP2]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP2]] ], [ [[A_1]], [[TMP0]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  %div = udiv i64 %a.1, %b
+  %rem = urem i64 %a.1, %b
+  %res = add i64 %div, %rem
+  store i64 %res, i64* %retptr
+  ret void
+}