diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -687,6 +687,9 @@
   /// would typically be allowed using throughput or size cost models.
   bool hasDivRemOp(Type *DataType, bool IsSigned) const;
 
+  /// Returns the maximum bitwidth of legal div and rem instructions.
+  unsigned maxLegalDivRemBitWidth() const;
+
   /// Return true if the given instruction (assumed to be a memory access
   /// instruction) has a volatile variant. If that's the case then we can avoid
   /// addrspacecast to generic AS for volatile loads/stores. Default
@@ -1641,6 +1644,7 @@
                                const SmallBitVector &OpcodeMask) const = 0;
   virtual bool enableOrderedReductions() = 0;
   virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
+  virtual unsigned maxLegalDivRemBitWidth() = 0;
   virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
   virtual bool prefersVectorizedAddressing() = 0;
   virtual InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
@@ -2088,6 +2092,9 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned) override {
     return Impl.hasDivRemOp(DataType, IsSigned);
   }
+  unsigned maxLegalDivRemBitWidth() override {
+    return Impl.maxLegalDivRemBitWidth();
+  }
   bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
     return Impl.hasVolatileVariant(I, AddrSpace);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -291,6 +291,10 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
 
+  unsigned maxLegalDivRemBitWidth() const {
+    return llvm::IntegerType::MAX_INT_BITS;
+  }
+
   bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const {
     return false;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -451,6 +451,10 @@
   return TTIImpl->hasDivRemOp(DataType, IsSigned);
 }
 
+unsigned TargetTransformInfo::maxLegalDivRemBitWidth() const {
+  return TTIImpl->maxLegalDivRemBitWidth();
+}
+
 bool TargetTransformInfo::hasVolatileVariant(Instruction *I,
                                              unsigned AddrSpace) const {
   return TTIImpl->hasVolatileVariant(I, AddrSpace);
diff --git a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
--- a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
@@ -30,14 +31,37 @@
 using namespace llvm;
 
 static cl::opt<unsigned>
-    ExpandDivRemBits("expand-div-rem-bits", cl::Hidden, cl::init(128),
+    ExpandDivRemBits("expand-div-rem-bits", cl::Hidden,
+                     cl::init(llvm::IntegerType::MAX_INT_BITS),
                      cl::desc("div and rem instructions on integers with "
                               "more than <N> bits are expanded."));
 
-static bool runImpl(Function &F) {
+static bool isConstantPowerOfTwo(llvm::Value *V, bool SignedOp) {
+  auto *C = dyn_cast<ConstantInt>(V);
+  if (!C)
+    return false;
+
+  APInt Val = C->getValue();
+  if (SignedOp && Val.isNegative())
+    Val = -Val;
+  return Val.isPowerOf2();
+}
+
+static bool isSigned(unsigned int Opcode) {
+  return Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+}
+
+static bool runImpl(Function &F, const TargetTransformInfo &TTI) {
   SmallVector<BinaryOperator *, 4> Replace;
   bool Modified = false;
 
+  unsigned MaxLegalDivRemBitWidth = TTI.maxLegalDivRemBitWidth();
+  if (ExpandDivRemBits != llvm::IntegerType::MAX_INT_BITS)
+    MaxLegalDivRemBitWidth = ExpandDivRemBits;
+
+  if (MaxLegalDivRemBitWidth >= llvm::IntegerType::MAX_INT_BITS)
+    return false;
+
   for (auto &I : instructions(F)) {
     switch (I.getOpcode()) {
     case Instruction::UDiv:
@@ -46,7 +70,11 @@
     case Instruction::SRem: {
       // TODO: This doesn't handle vectors.
       auto *IntTy = dyn_cast<IntegerType>(I.getType());
-      if (!IntTy || IntTy->getIntegerBitWidth() <= ExpandDivRemBits)
+      if (!IntTy || IntTy->getIntegerBitWidth() <= MaxLegalDivRemBitWidth)
+        continue;
+
+      // The backend has peephole optimizations for powers of two.
+      if (isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode())))
         continue;
 
       Replace.push_back(&cast<BinaryOperator>(I));
@@ -77,7 +105,8 @@
 
 PreservedAnalyses ExpandLargeDivRemPass::run(Function &F,
                                              FunctionAnalysisManager &AM) {
-  bool Changed = runImpl(F);
+  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  bool Changed = runImpl(F, TTI);
 
   if (Changed)
     return PreservedAnalyses::none();
@@ -93,9 +122,13 @@
     initializeExpandLargeDivRemLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
-  bool runOnFunction(Function &F) override { return runImpl(F); }
+  bool runOnFunction(Function &F) override {
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return runImpl(F, TTI);
+  }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }
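For illustration, a minimal IR-level sketch (not part of the patch; the function names are made up) of what runImpl() now selects on a target whose maxLegalDivRemBitWidth() is 128: the first udiv below is queued in Replace and expanded into the bit-by-bit division loop (the udiv-do-while blocks the updated tests match), while the second is skipped because isConstantPowerOfTwo() recognizes its divisor and the backend's shift peepholes handle it.

; Wider than 128 bits with a non-constant divisor: expanded before
; instruction selection instead of becoming an unavailable libcall.
define i129 @expanded(i129 %a, i129 %b) {
  %q = udiv i129 %a, %b
  ret i129 %q
}

; Power-of-two divisor: left alone for the backend to lower to shifts.
define i129 @skipped(i129 %a) {
  %q = udiv i129 %a, 16
  ret i129 %q
}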
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1113,6 +1113,7 @@
   addPass(createPreISelIntrinsicLoweringPass());
   PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+  addPass(createExpandLargeDivRemPass());
   addIRPasses();
   addCodeGenPrepare();
   addPassesToHandleExceptions();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -319,6 +319,8 @@
   bool enableOrderedReductions() const { return true; }
 
+  unsigned maxLegalDivRemBitWidth() const { return 128; }
+
   InstructionCost getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -207,6 +207,8 @@
     return isLegalMaskedGather(Ty, Alignment);
   }
 
+  unsigned maxLegalDivRemBitWidth() const { return 64; }
+
   InstructionCost getMemcpyCost(const Instruction *I);
 
   int getNumMemOps(const IntrinsicInst *I) const;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -255,6 +255,7 @@
                          const SmallBitVector &OpcodeMask) const;
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool isExpensiveToSpeculativelyExecute(const Instruction *I);
+  unsigned maxLegalDivRemBitWidth() const;
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5734,6 +5734,10 @@
   return BaseT::isExpensiveToSpeculativelyExecute(I);
 }
 
+unsigned X86TTIImpl::maxLegalDivRemBitWidth() const {
+  return ST->is64Bit() ? 128 : 64;
+}
+
 bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
   return false;
 }
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -15,6 +15,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:   Module Verifier
 ; CHECK-NEXT:   Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -18,6 +18,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     SVE intrinsics optimizations
 ; CHECK-NEXT:   FunctionPass Manager
diff --git a/llvm/test/CodeGen/AArch64/udivmodei5.ll b/llvm/test/CodeGen/AArch64/udivmodei5.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/udivmodei5.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+  %res = udiv i65 %a, %b
+  ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+  %res = udiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+  %res = urem i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+  %res = sdiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+  %res = srem i129 %a, %b
+  ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+  %res = sdiv i257 %a, %b
+  ret i257 %res
+}
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -5,6 +5,7 @@
 ; CHECK: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     Simplify the CFG
 ; CHECK-NEXT:     Dominator Tree Construction
diff --git a/llvm/test/CodeGen/ARM/udivmodei5.ll b/llvm/test/CodeGen/ARM/udivmodei5.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/udivmodei5.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=arm-eabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+  %res = udiv i65 %a, %b
+  ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+  %res = udiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+  %res = urem i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+  %res = sdiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+  %res = srem i129 %a, %b
+  ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+  %res = sdiv i257 %a, %b
+  ret i257 %res
+}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -17,6 +17,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     Lower AMX intrinsics
 ; CHECK-NEXT:     Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -171,101 +171,8 @@
 define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl 28(%ebp)
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %ecx, 12(%edi)
-; X86-NEXT:    movl %esi, 8(%edi)
-; X86-NEXT:    movl %eax, 4(%edi)
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %ebx, (%edx)
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull 32(%ebp), %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -171,101 +171,8 @@
 define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl 28(%ebp)
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %ecx, 12(%edi)
-; X86-NEXT:    movl %esi, 8(%edi)
-; X86-NEXT:    movl %eax, 4(%edi)
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %ebx, (%edx)
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull 32(%ebp), %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -107,40 +107,8 @@
 define i128 @test3(i128 %x) nounwind {
 ; X86-LABEL: test3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-5
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-3
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%esi)
-; X86-NEXT:    movl %edx, 8(%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -31,40 +31,8 @@
 define i128 @test2(i128 %x) nounwind {
 ; X86-LABEL: test2:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-4
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%esi)
-; X86-NEXT:    movl %edx, 8(%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0:
@@ -80,40 +48,8 @@
 define i128 @test3(i128 %x) nounwind {
 ; X86-LABEL: test3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl %esp, %eax
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-5
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl $-3
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%esi)
-; X86-NEXT:    movl %edx, 8(%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/libcall-sret.ll b/llvm/test/CodeGen/X86/libcall-sret.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/X86/libcall-sret.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=i686-linux-gnu -o - %s | FileCheck %s
-
-@var = global i128 0
-
-; We were trying to convert the i128 operation into a libcall, but failing to
-; perform sret demotion when we couldn't return the result in registers. Make
-; sure we marshal the return properly:
-
-define void @test_sret_libcall(i128 %l, i128 %r) {
-; CHECK-LABEL: test_sret_libcall:
-
-  ; Stack for call: 4(sret ptr), 16(i128 %l), 16(128 %r). So next logical
-  ; (aligned) place for the actual sret data is %esp + 20.
-; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl [[SRET_ADDR]]
-
-; CHECK: calll __udivti3
-
-; CHECK: addl $44, %esp
-; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
-; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
-; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
-; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
-; CHECK-DAG: movl [[RES0]], var
-; CHECK-DAG: movl [[RES1]], var+4
-; CHECK-DAG: movl [[RES2]], var+8
-; CHECK-DAG: movl [[RES3]], var+12
-  %quot = udiv i128 %l, %r
-  store i128 %quot, ptr @var
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -26,6 +26,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT:   Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Expand large div/rem
 ; CHECK-NEXT:     Expand Atomic instructions
 ; CHECK-NEXT:     Lower AMX intrinsics
 ; CHECK-NEXT:     Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -13,26 +13,6 @@
 ; X64-NEXT:    movq %rax, (%rax)
 ; X64-NEXT:    movb $0, (%rax)
 ; X64-NEXT:    retq
-;
-; X86-LABEL: f:
-; X86:       # %bb.0: # %BB
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movzbl (%eax), %eax
-; X86-NEXT:    cmpb $0, (%eax)
-; X86-NEXT:    setne (%eax)
-; X86-NEXT:    leal -{{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, (%eax)
-; X86-NEXT:    movb $0, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa %esp, 4
-; X86-NEXT:    retl
 BB:
   %A30 = alloca i66
   %L17 = load i66, ptr %A30
diff --git a/llvm/test/CodeGen/X86/udivmodei5.ll b/llvm/test/CodeGen/X86/udivmodei5.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udivmodei5.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+; On i686, this is expanded into a loop. On x86_64, this calls __udivti3.
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; X86-LABEL: udiv65:
+; X86-NOT: call
+;
+; X64-LABEL: udiv65:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    callq __udivti3@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %res = udiv i65 %a, %b
+  ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: udiv129:
+; X86-NOT: call
+;
+; X64-LABEL: udiv129:
+; X64-NOT: call
+  %res = udiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: urem129:
+; X86-NOT: call
+;
+; X64-LABEL: urem129:
+; X64-NOT: call
+  %res = urem i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: sdiv129:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv129:
+; X64-NOT: call
+  %res = sdiv i129 %a, %b
+  ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: srem129:
+; X86-NOT: call
+;
+; X64-LABEL: srem129:
+; X64-NOT: call
+  %res = srem i129 %a, %b
+  ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; X86-LABEL: sdiv257:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv257:
+; X64-NOT: call
+  %res = sdiv i257 %a, %b
+  ret i257 %res
+}
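As a usage sketch (the invocation is an assumption; -expand-div-rem-bits is the hidden cl::opt added above, and when set it overrides the TTI value): lowering the threshold forces the IR expansion even where a libcall would otherwise be emitted, which is useful for exercising the expansion loop on any target.

; Hypothetical check: with -expand-div-rem-bits=64, even this i128 udiv is
; expanded in IR on x86-64 instead of being lowered to a __udivti3 call.
;   llc -mtriple=x86_64-unknown-unknown -expand-div-rem-bits=64 < %s
define i128 @force_expand(i128 %a, i128 %b) nounwind {
  %q = udiv i128 %a, %b
  ret i128 %q
}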