diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -687,6 +687,9 @@
   /// would typically be allowed using throughput or size cost models.
   bool hasDivRemOp(Type *DataType, bool IsSigned) const;
 
+  /// Returns the maximum bitwidth of legal div and rem instructions.
+  unsigned maxLegalDivRemBitWidth() const;
+
   /// Return true if the given instruction (assumed to be a memory access
   /// instruction) has a volatile variant. If that's the case then we can avoid
   /// addrspacecast to generic AS for volatile loads/stores. Default
@@ -1641,6 +1644,7 @@
                                const SmallBitVector &OpcodeMask) const = 0;
   virtual bool enableOrderedReductions() = 0;
   virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
+  virtual unsigned maxLegalDivRemBitWidth() = 0;
   virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
   virtual bool prefersVectorizedAddressing() = 0;
   virtual InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
@@ -2088,6 +2092,9 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned) override {
     return Impl.hasDivRemOp(DataType, IsSigned);
   }
+  unsigned maxLegalDivRemBitWidth() override {
+    return Impl.maxLegalDivRemBitWidth();
+  }
   bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
     return Impl.hasVolatileVariant(I, AddrSpace);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -291,6 +291,10 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
 
+  unsigned maxLegalDivRemBitWidth() const {
+    return llvm::IntegerType::MAX_INT_BITS;
+  }
+
   bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const {
     return false;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -451,6 +451,10 @@
   return TTIImpl->hasDivRemOp(DataType, IsSigned);
 }
 
+unsigned TargetTransformInfo::maxLegalDivRemBitWidth() const {
+  return TTIImpl->maxLegalDivRemBitWidth();
+}
+
 bool TargetTransformInfo::hasVolatileVariant(Instruction *I,
                                              unsigned AddrSpace) const {
   return TTIImpl->hasVolatileVariant(I, AddrSpace);
diff --git a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
--- a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
@@ -30,14 +31,37 @@
 using namespace llvm;
 
 static cl::opt<unsigned>
-    ExpandDivRemBits("expand-div-rem-bits", cl::Hidden, cl::init(128),
+    ExpandDivRemBits("expand-div-rem-bits", cl::Hidden,
+                     cl::init(llvm::IntegerType::MAX_INT_BITS),
                      cl::desc("div and rem instructions on integers with "
                               "more than <N> bits are expanded."));
 
-static bool runImpl(Function &F) {
+static bool isConstantPowerOfTwo(llvm::Value *V, bool SignedOp) {
+  auto *C = dyn_cast<ConstantInt>(V);
+  if (!C)
+    return false;
+
+  APInt Val = C->getValue();
+  if (SignedOp && Val.isNegative())
+    Val = -Val;
+  return Val.isPowerOf2();
+}
+
+static bool isSigned(unsigned int Opcode) {
+  return Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+}
+
+static bool runImpl(Function &F, const TargetTransformInfo &TTI) {
   SmallVector<BinaryOperator *, 4> Replace;
   bool Modified = false;
 
+  unsigned MaxLegalDivRemBitWidth = TTI.maxLegalDivRemBitWidth();
+  if (ExpandDivRemBits != llvm::IntegerType::MAX_INT_BITS)
+    MaxLegalDivRemBitWidth = ExpandDivRemBits;
+
+  if (MaxLegalDivRemBitWidth >= llvm::IntegerType::MAX_INT_BITS)
+    return false;
+
   for (auto &I : instructions(F)) {
     switch (I.getOpcode()) {
     case Instruction::UDiv:
@@ -46,7 +70,11 @@
     case Instruction::SRem: {
       // TODO: This doesn't handle vectors.
       auto *IntTy = dyn_cast<IntegerType>(I.getType());
-      if (!IntTy || IntTy->getIntegerBitWidth() <= ExpandDivRemBits)
+      if (!IntTy || IntTy->getIntegerBitWidth() <= MaxLegalDivRemBitWidth)
+        continue;
+
+      // The backend has peephole optimizations for powers of two.
+      if (isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode())))
         continue;
 
       Replace.push_back(&cast<BinaryOperator>(I));
@@ -77,7 +105,8 @@
 
 PreservedAnalyses ExpandLargeDivRemPass::run(Function &F,
                                              FunctionAnalysisManager &AM) {
-  bool Changed = runImpl(F);
+  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  bool Changed = runImpl(F, TTI);
 
   if (Changed)
     return PreservedAnalyses::none();
@@ -93,9 +122,13 @@
     initializeExpandLargeDivRemLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
-  bool runOnFunction(Function &F) override { return runImpl(F); }
+  bool runOnFunction(Function &F) override {
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return runImpl(F, TTI);
+  }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }
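For illustration, a minimal IR-level sketch (not part of the patch; the function names are made up) of what runImpl() now selects on a target whose maxLegalDivRemBitWidth() is 128: the first udiv below is queued in Replace and expanded into the bit-by-bit division loop (the udiv-do-while blocks the updated tests match), while the second is skipped because isConstantPowerOfTwo() recognizes its divisor and the backend's shift peepholes handle it.

; Wider than 128 bits with a non-constant divisor: expanded before
; instruction selection instead of becoming an unavailable libcall.
define i129 @expanded(i129 %a, i129 %b) {
  %q = udiv i129 %a, %b
  ret i129 %q
}

; Power-of-two divisor: left alone for the backend to lower to shifts.
define i129 @skipped(i129 %a) {
  %q = udiv i129 %a, 16
  ret i129 %q
}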
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1113,6 +1113,7 @@
   addPass(createPreISelIntrinsicLoweringPass());
   PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+  addPass(createExpandLargeDivRemPass());
   addIRPasses();
   addCodeGenPrepare();
   addPassesToHandleExceptions();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -319,6 +319,8 @@
   bool enableOrderedReductions() const { return true; }
 
+  unsigned maxLegalDivRemBitWidth() const { return 128; }
+
   InstructionCost getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -207,6 +207,8 @@
     return isLegalMaskedGather(Ty, Alignment);
   }
 
+  unsigned maxLegalDivRemBitWidth() const { return 64; }
+
   InstructionCost getMemcpyCost(const Instruction *I);
 
   int getNumMemOps(const IntrinsicInst *I) const;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -255,6 +255,7 @@
                          const SmallBitVector &OpcodeMask) const;
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool isExpensiveToSpeculativelyExecute(const Instruction *I);
+  unsigned maxLegalDivRemBitWidth() const;
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5734,6 +5734,10 @@
   return BaseT::isExpensiveToSpeculativelyExecute(I);
 }
 
+unsigned X86TTIImpl::maxLegalDivRemBitWidth() const {
+  return ST->is64Bit() ? 128 : 64;
+}
+
 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
   return false;
 }
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -15,6 +15,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -18,6 +18,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     SVE intrinsics optimizations
 ; CHECK-NEXT:   FunctionPass Manager
diff --git a/llvm/test/CodeGen/AArch64/udivmodei5.ll b/llvm/test/CodeGen/AArch64/udivmodei5.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/udivmodei5.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+  %res = udiv i65 %a, %b
+  ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+  %res = udiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+  %res = urem i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+  %res = sdiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+  %res = srem i129 %a, %b
+  ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+  %res = sdiv i257 %a, %b
+  ret i257 %res
+}
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -5,6 +5,7 @@
 ; CHECK: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     Simplify the CFG
 ; CHECK-NEXT:     Dominator Tree Construction
diff --git a/llvm/test/CodeGen/ARM/udivmodei5.ll b/llvm/test/CodeGen/ARM/udivmodei5.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/udivmodei5.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=arm-eabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+  %res = udiv i65 %a, %b
+  ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+  %res = udiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+  %res = urem i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+  %res = sdiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+  %res = srem i129 %a, %b
+  ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+  %res = sdiv i257 %a, %b
+  ret i257 %res
+}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -17,6 +17,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     Lower AMX intrinsics
 ; CHECK-NEXT:     Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -171,101 +171,8 @@
 define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl 28(%ebp)
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %ecx, 12(%edi)
-; X86-NEXT:    movl %esi, 8(%edi)
-; X86-NEXT:    movl %eax, 4(%edi)
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %ebx, (%edx)
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull 32(%ebp), %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -171,101 +171,8 @@
 define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl 28(%ebp)
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %ecx, 12(%edi)
-; X86-NEXT:    movl %esi, 8(%edi)
-; X86-NEXT:    movl %eax, 4(%edi)
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %ebx, (%edx)
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull 32(%ebp), %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -107,40 +107,8 @@
 define i128 @test3(i128 %x) nounwind {
 ; X86-LABEL: test3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-5
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-3
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%esi)
-; X86-NEXT:    movl %edx, 8(%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -31,40 +31,8 @@
 define i128 @test2(i128 %x) nounwind {
 ; X86-LABEL: test2:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-4
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%esi)
-; X86-NEXT:    movl %edx, 8(%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0:
@@ -80,40 +48,8 @@
 define i128 @test3(i128 %x) nounwind {
 ; X86-LABEL: test3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-5
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-3
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%esi)
-; X86-NEXT:    movl %edx, 8(%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/libcall-sret.ll b/llvm/test/CodeGen/X86/libcall-sret.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/X86/libcall-sret.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=i686-linux-gnu -o - %s | FileCheck %s
-
-@var = global i128 0
-
-; We were trying to convert the i128 operation into a libcall, but failing to
-; perform sret demotion when we couldn't return the result in registers. Make
-; sure we marshal the return properly:
-
-define void @test_sret_libcall(i128 %l, i128 %r) {
-; CHECK-LABEL: test_sret_libcall:
-
-  ; Stack for call: 4(sret ptr), 16(i128 %l), 16(128 %r). So next logical
-  ; (aligned) place for the actual sret data is %esp + 20.
-; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl [[SRET_ADDR]]
-
-; CHECK: calll __udivti3
-
-; CHECK: addl $44, %esp
-; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
-; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
-; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
-; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
-; CHECK-DAG: movl [[RES0]], var
-; CHECK-DAG: movl [[RES1]], var+4
-; CHECK-DAG: movl [[RES2]], var+8
-; CHECK-DAG: movl [[RES3]], var+12
-  %quot = udiv i128 %l, %r
-  store i128 %quot, ptr @var
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -26,6 +26,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     Lower AMX intrinsics
 ; CHECK-NEXT:     Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -13,26 +13,6 @@
 ; X64-NEXT:    movq %rax, (%rax)
 ; X64-NEXT:    movb $0, (%rax)
 ; X64-NEXT:    retq
-;
-; X86-LABEL: f:
-; X86:       # %bb.0: # %BB
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movzbl (%eax), %eax
-; X86-NEXT:    cmpb $0, (%eax)
-; X86-NEXT:    setne (%eax)
-; X86-NEXT:    leal -{{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, (%eax)
-; X86-NEXT:    movb $0, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
-; X86-NEXT:    retl
 BB:
   %A30 = alloca i66
   %L17 = load i66, ptr %A30
diff --git a/llvm/test/CodeGen/X86/udivmodei5.ll b/llvm/test/CodeGen/X86/udivmodei5.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udivmodei5.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+; On i686, this is expanded into a loop. On x86_64, this calls __udivti3.
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; X86-LABEL: udiv65:
+; X86-NOT: call
+;
+; X64-LABEL: udiv65:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    callq __udivti3@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %res = udiv i65 %a, %b
+  ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: udiv129:
+; X86-NOT: call
+;
+; X64-LABEL: udiv129:
+; X64-NOT: call
+  %res = udiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: urem129:
+; X86-NOT: call
+;
+; X64-LABEL: urem129:
+; X64-NOT: call
+  %res = urem i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: sdiv129:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv129:
+; X64-NOT: call
+  %res = sdiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: srem129:
+; X86-NOT: call
+;
+; X64-LABEL: srem129:
+; X64-NOT: call
+  %res = srem i129 %a, %b
+  ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; X86-LABEL: sdiv257:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv257:
+; X64-NOT: call
+  %res = sdiv i257 %a, %b
+  ret i257 %res
+}
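As a usage sketch (the invocation is an assumption; -expand-div-rem-bits is the hidden cl::opt added above, and when set it overrides the TTI value): lowering the threshold forces the IR expansion even where a libcall would otherwise be emitted, which is useful for exercising the expansion loop on any target.

; Hypothetical check: with -expand-div-rem-bits=64, even this i128 udiv is
; expanded in IR on x86-64 instead of being lowered to a __udivti3 call.
;   llc -mtriple=x86_64-unknown-unknown -expand-div-rem-bits=64 < %s
define i128 @force_expand(i128 %a, i128 %b) nounwind {
  %q = udiv i128 %a, %b
  ret i128 %q
}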