Index: lib/Transforms/Utils/BypassSlowDivision.cpp =================================================================== --- lib/Transforms/Utils/BypassSlowDivision.cpp +++ lib/Transforms/Utils/BypassSlowDivision.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -83,17 +84,28 @@ } namespace { +enum ValueRange { + /// Operand definitely fits into BypassType. No runtime checks are needed. + VALRNG_SHORT, + /// A runtime check is required, as value range is unknown. + VALRNG_UNKNOWN, + /// Operand is unlikely to fit into BypassType. The bypassing should be + /// disabled. + VALRNG_LONG +}; + class FastDivInsertionTask { bool IsValidTask = false; Instruction *SlowDivOrRem = nullptr; IntegerType *BypassType = nullptr; BasicBlock *MainBB = nullptr; + ValueRange getValueRange(Value *Op); QuotRemWithBB createSlowBB(BasicBlock *Successor); QuotRemWithBB createFastBB(BasicBlock *Successor); QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, BasicBlock *PhiBB); - Value *insertOperandRuntimeCheck(); + Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); Optional insertFastDivAndRem(); bool isSignedOp() { @@ -175,6 +187,28 @@ return isDivisionOp() ? Value.Quotient : Value.Remainder; } +/// Check if an integer value fits into our bypass type. +ValueRange FastDivInsertionTask::getValueRange(Value *V) { + unsigned ShortLen = BypassType->getBitWidth(); + unsigned LongLen = V->getType()->getIntegerBitWidth(); + + assert(LongLen > ShortLen && "Value type must be wider than BypassType"); + unsigned HiBits = LongLen - ShortLen; + + const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); + APInt Zeros(LongLen, 0), Ones(LongLen, 0); + + computeKnownBits(V, Zeros, Ones, DL); + + if (Zeros.countLeadingOnes() >= HiBits) + return VALRNG_SHORT; + + if (Ones.countLeadingZeros() < HiBits) + return VALRNG_LONG; + + return VALRNG_UNKNOWN; +} + /// Add new basic block for slow div and rem operations and put it before /// SuccessorBB. QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { @@ -241,22 +275,17 @@ /// Creates a runtime check to test whether both the divisor and dividend fit /// into BypassType. The check is inserted at the end of MainBB. True return -/// value means that the operands fit. -Value *FastDivInsertionTask::insertOperandRuntimeCheck() { +/// value means that the operands fit. Either of the operands may be NULL if it +/// doesn't need a runtime check. +Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { + assert((Op1 || Op2) && "Nothing to check"); IRBuilder<> Builder(MainBB, MainBB->end()); - Value *Dividend = SlowDivOrRem->getOperand(0); - Value *Divisor = SlowDivOrRem->getOperand(1); - - // We should have bailed out above if the divisor is a constant, but the - // dividend may still be a constant. Set OrV to our non-constant operands - // OR'ed together. - assert(!isa(Divisor)); Value *OrV; - if (!isa(Dividend)) - OrV = Builder.CreateOr(Dividend, Divisor); + if (Op1 && Op2) + OrV = Builder.CreateOr(Op1, Op2); else - OrV = Divisor; + OrV = Op1 ? Op1 : Op2; // BitMask is inverted to check if the operands are // larger than the bypass type @@ -279,22 +308,72 @@ return None; } - // If the numerator is a constant, bail if it doesn't fit into BypassType. - if (ConstantInt *ConstDividend = dyn_cast(Dividend)) - if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth()) - return None; - - // Split the basic block before the div/rem. - BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); - // Remove the unconditional branch from MainBB to SuccessorBB. - MainBB->getInstList().back().eraseFromParent(); - QuotRemWithBB Fast = createFastBB(SuccessorBB); - QuotRemWithBB Slow = createSlowBB(SuccessorBB); - QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); - Value *CmpV = insertOperandRuntimeCheck(); - IRBuilder<> Builder(MainBB, MainBB->end()); - Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); - return Result; + ValueRange DividendRange = getValueRange(Dividend); + if (DividendRange == VALRNG_LONG) + return None; + + ValueRange DivisorRange = getValueRange(Divisor); + if (DivisorRange == VALRNG_LONG) + return None; + + bool DividendShort = (DividendRange == VALRNG_SHORT); + bool DivisorShort = (DivisorRange == VALRNG_SHORT); + + if (DividendShort && DivisorShort) { + // If both operands are known to be short then just replace the long + // division with a short one in-place. + + IRBuilder<> Builder(SlowDivOrRem); + Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); + Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); + Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); + Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); + Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); + Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); + return QuotRemPair(ExtDiv, ExtRem); + } else if (DividendShort && !isSignedOp()) { + // If the division is unsigned and Dividend is known to be short, then + // either + // 1) Divisor is less or equal to Dividend, and the result can be computed + // with a short division. + // 2) Divisor is greater than Dividend. In this case, no division is needed + // at all: The quotient is 0 and the remainder is equal to Dividend. + // + // So instead of checking at runtime whether Divisor fits into BypassType, + // we emit a runtime check to differentiate between these two cases. This + // lets us entirely avoid a long div. + + // Split the basic block before the div/rem. + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); + // Remove the unconditional branch from MainBB to SuccessorBB. + MainBB->getInstList().back().eraseFromParent(); + QuotRemWithBB Long; + Long.BB = MainBB; + Long.Quotient = ConstantInt::get(getSlowType(), 0); + Long.Remainder = Dividend; + QuotRemWithBB Fast = createFastBB(SuccessorBB); + QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); + IRBuilder<> Builder(MainBB, MainBB->end()); + Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); + Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); + return Result; + } else { + // General case. Create both slow and fast div/rem pairs and choose one of + // them at runtime. + + // Split the basic block before the div/rem. + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); + // Remove the unconditional branch from MainBB to SuccessorBB. + MainBB->getInstList().back().eraseFromParent(); + QuotRemWithBB Fast = createFastBB(SuccessorBB); + QuotRemWithBB Slow = createSlowBB(SuccessorBB); + QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); + Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, + DivisorShort ? nullptr : Divisor); + IRBuilder<> Builder(MainBB, MainBB->end()); + Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); + return Result; + } } /// This optimization identifies DIV/REM instructions in a BB that can be Index: test/CodeGen/X86/bypass-slow-division-64.ll =================================================================== --- test/CodeGen/X86/bypass-slow-division-64.ll +++ test/CodeGen/X86/bypass-slow-division-64.ll @@ -76,3 +76,88 @@ %result = add i64 %resultdiv, %resultrem ret i64 %result } + +; No bypassing should be done in apparently unsuitable cases. +define i64 @Test_no_bypassing(i32 %a, i64 %b) nounwind { +; CHECK-LABEL: Test_no_bypassing: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: notq %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: retq + %a.1 = zext i32 %a to i64 + ; %a.2 is always negative so the division cannot be bypassed. + %a.2 = sub i64 -1, %a.1 + %res = srem i64 %a.2, %b + ret i64 %res +} + +; No OR instruction is needed if one of the operands (divisor) is known +; to fit into 32 bits. +define i64 @Test_check_one_operand(i64 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_check_one_operand: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: je .LBB4_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rcx +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB4_1: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: divl %ecx +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: retq + %b.1 = zext i32 %b to i64 + %res = sdiv i64 %a, %b.1 + ret i64 %res +} + +; If both operands are known to fit into 32 bits, then replace the division +; in-place without CFG modification. +define i64 @Test_check_none(i64 %a, i32 %b) nounwind { +; CHECK-LABEL: Test_check_none: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: divl %esi +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: retq + %a.1 = and i64 %a, 4294967295 + %b.1 = zext i32 %b to i64 + %res = udiv i64 %a.1, %b.1 + ret i64 %res +} + +; In case of unsigned long division with a short dividend, +; the long division is not needed any more. +define i64 @Test_special_case(i32 %a, i64 %b) nounwind { +; CHECK-LABEL: Test_special_case: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: cmpq %rsi, %rcx +; CHECK-NEXT: jb .LBB6_1 +; CHECK-NEXT: # BB#2: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: divl %esi +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB6_1: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq + %a.1 = zext i32 %a to i64 + %div = udiv i64 %a.1, %b + %rem = urem i64 %a.1, %b + %res = add i64 %div, %rem + ret i64 %res +}