Index: lib/Transforms/Utils/BypassSlowDivision.cpp =================================================================== --- lib/Transforms/Utils/BypassSlowDivision.cpp +++ lib/Transforms/Utils/BypassSlowDivision.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -74,6 +75,15 @@ } namespace { +enum ValueRange { + /// Operand definitely fits BypassType. No runtime checks are needed. + VALRNG_SHORT, + /// A runtime check is required, as value range is unknown. + VALRNG_UNKNOWN, + /// Operand is unlikely to fit BypassType. The bypassing should be disabled. + VALRNG_LONG +}; + class FastDivInsertionTask { // These fields are set during initialization. unsigned Opcode; @@ -95,7 +105,7 @@ BasicBlock *createFastBB(BasicBlock *Successor); void createDivRemPhiNodes(BasicBlock *ShortBB, BasicBlock *LongBB, BasicBlock *PhiBB, DivCacheTy &PerBBDivCache); - Value *createDivRunTimeCheck(); + Value *createDivRunTimeCheck(Value *Op1, Value *Op2); public: /// Sets up the object to work with instruction \p I. Returns false if the @@ -115,6 +125,9 @@ return Opcode == Instruction::SDiv || Opcode == Instruction::UDiv; } + /// Check if an integer value fits into a smaller type. + ValueRange getValueRange(Value *Op, const DataLayout &DL); + /// Tries to replace a division with a faster code and returns true if /// succeeds. bool insertFastDiv(DivCacheTy &PerBBDivCache); @@ -138,6 +151,25 @@ }; } +ValueRange FastDivInsertionTask::getValueRange(Value *Op, + const DataLayout &DL) { + unsigned ShortLen = BypassType->getBitWidth(); + unsigned LongLen = SlowType->getBitWidth(); + unsigned HiBits = LongLen - ShortLen; + + APInt Zeros(LongLen, 0), Ones(LongLen, 0); + + computeKnownBits(Op, Zeros, Ones, DL); + + if (Zeros.countLeadingOnes() >= HiBits) + return VALRNG_SHORT; + + if (Ones.countLeadingZeros() < HiBits) + return VALRNG_LONG; + + return VALRNG_UNKNOWN; +} + bool FastDivInsertionTask::setScalarIDivOp(Instruction *I) { SlowDiv = I; Opcode = I->getOpcode(); @@ -236,19 +268,17 @@ // Creates a run-time check to test whether both operands fit the shorter type. // The check is inserted at the end of MainBB. // True return value means that the operands fit. -Value *FastDivInsertionTask::createDivRunTimeCheck() { +// Either of the operands may be NULL if it doesn't need a runtime check. +Value *FastDivInsertionTask::createDivRunTimeCheck(Value *Op1, Value *Op2) { IRBuilder<> Builder(MainBB, MainBB->end()); - // We should have bailed out above if the divisor is a constant, but the - // dividend may still be a constant. Set OrV to our non-constant operands - // OR'ed together. - assert(!isa(Divisor)); - - Value *OrV; - if (!isa(Dividend)) - OrV = Builder.CreateOr(Dividend, Divisor); + Value *OrV = nullptr; + if (Op1 && Op2) + OrV = Builder.CreateOr(Op1, Op2); else - OrV = Divisor; + OrV = Op1 ? Op1 : Op2; + + assert(OrV && "Nothing to check"); // BitMask is inverted to check if the operands are // larger than the bypass type @@ -265,25 +295,67 @@ // possible and the longer-slower div/rem instruction otherwise. bool FastDivInsertionTask::insertFastDiv(DivCacheTy &PerBBDivCache) { if (isa(Divisor)) { - // Division by a constant should have been been solved and replaced earlier - // in the pipeline. + // Keep division by a constant for DAGCombiner. return false; } - // If the numerator is a constant, bail if it doesn't fit into BypassType. - if (ConstantInt *ConstDividend = dyn_cast(Dividend)) - if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth()) - return false; - - // Basic Block is split before divide - BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDiv); - MainBB->getInstList().back().eraseFromParent(); - BasicBlock *FastBB = createFastBB(SuccessorBB); - BasicBlock *SlowBB = createSlowBB(SuccessorBB); - createDivRemPhiNodes(FastBB, SlowBB, SuccessorBB, PerBBDivCache); - Value *CmpV = createDivRunTimeCheck(); - IRBuilder<> Builder(MainBB, MainBB->end()); - Builder.CreateCondBr(CmpV, FastBB, SlowBB); + const DataLayout &DL = SlowDiv->getModule()->getDataLayout(); + + ValueRange DividendRange = getValueRange(Dividend, DL); + if (DividendRange == VALRNG_LONG) + return false; + + ValueRange DivisorRange = getValueRange(Divisor, DL); + if (DivisorRange == VALRNG_LONG) + return false; + + bool DividendShort = (DividendRange == VALRNG_SHORT); + bool DivisorShort = (DivisorRange == VALRNG_SHORT); + + if (DividendShort && DivisorShort) { + // If both operands are known to be short then just replace the long + // division with a short one. + IRBuilder<> Builder(SlowDiv); + Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); + Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); + Value *TruncResult = isDivisionOp() + ? Builder.CreateUDiv(TruncDividend, TruncDivisor) + : Builder.CreateURem(TruncDividend, TruncDivisor); + Value *Result = Builder.CreateZExt(TruncResult, SlowType); + SlowDiv->replaceAllUsesWith(Result); + SlowDiv->eraseFromParent(); + SlowDiv = nullptr; + } else if (DividendShort && !isSignedDiv()) { + // If the division is unsigned and Dividend is known to be short, then + // either + // 1) Divisor is less or equal to Dividend, and the result can be computed + // with a short division. + // 2) Divisor is greater than Dividend. In this case, no division is needed + // at all: The quotient is 0 and the remainder is equal to Dividend. + // + // So instead of checking at runtime whether Divisor fits into BypassType, + // we emit a runtime check to differentiate between these two cases. This + // lets us entirely avoid a long div. + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDiv); + MainBB->getInstList().back().eraseFromParent(); + LongQuotientV = ConstantInt::get(SlowType, 0); + LongRemainderV = Dividend; + BasicBlock *DivBB = createFastBB(SuccessorBB); + createDivRemPhiNodes(DivBB, MainBB, SuccessorBB, PerBBDivCache); + IRBuilder<> Builder(MainBB, MainBB->end()); + Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); + Builder.CreateCondBr(CmpV, DivBB, SuccessorBB); + } else { + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDiv); + MainBB->getInstList().back().eraseFromParent(); + BasicBlock *FastBB = createFastBB(SuccessorBB); + BasicBlock *SlowBB = createSlowBB(SuccessorBB); + createDivRemPhiNodes(FastBB, SlowBB, SuccessorBB, PerBBDivCache); + Value *CmpV = createDivRunTimeCheck(DividendShort ? nullptr : Dividend, + DivisorShort ? nullptr : Divisor); + IRBuilder<> Builder(MainBB, MainBB->end()); + Builder.CreateCondBr(CmpV, FastBB, SlowBB); + } return true; } Index: test/CodeGen/X86/bypass-slow-division-64.ll =================================================================== --- test/CodeGen/X86/bypass-slow-division-64.ll +++ test/CodeGen/X86/bypass-slow-division-64.ll @@ -76,3 +76,88 @@ %result = add i64 %resultdiv, %resultrem ret i64 %result } + +; No bypassing should be done in apparently unsuitable cases. +define i64 @Test_no_bypassing(i32 %a, i64 %b) nounwind { +; CHECK-LABEL: Test_no_bypassing: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: notq %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: retq + %a.1 = zext i32 %a to i64 + ; %a.2 is always negative so the division cannot be bypassed. + %a.2 = sub i64 -1, %a.1 + %res = srem i64 %a.2, %b + ret i64 %res +} + +; No OR instruction is needed if one of the operands (divisor) is known +; to fit into 32 bits. +define i64 @Test_check_one_operand(i64 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_check_one_operand: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: je .LBB4_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rcx +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB4_1: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: divl %ecx +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: retq + %b.1 = zext i32 %b to i64 + %res = sdiv i64 %a, %b.1 + ret i64 %res +} + +; If both operands are known to fit into 32 bits, then replace the division +; in-place without CFG modification. +define i64 @Test_check_none(i64 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_check_none: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: divl %esi +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: retq + %a.1 = and i64 %a, 4294967295 + %b.1 = zext i32 %b to i64 + %res = udiv i64 %a.1, %b.1 + ret i64 %res +} + +; In case of unsigned long division with a short dividend, +; the long division is not needed any more. +define i64 @Test_special_case(i32 %a, i64 %b) nounwind { +; CHECK-LABEL: Test_special_case: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: cmpq %rsi, %rcx +; CHECK-NEXT: jb .LBB6_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: divl %esi +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB6_1: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq + %a.1 = zext i32 %a to i64 + %div = udiv i64 %a.1, %b + %rem = urem i64 %a.1, %b + %res = add i64 %div, %rem + ret i64 %res +}