Index: lib/Transforms/Utils/BypassSlowDivision.cpp =================================================================== --- lib/Transforms/Utils/BypassSlowDivision.cpp +++ lib/Transforms/Utils/BypassSlowDivision.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -83,17 +84,28 @@ } namespace { +enum ValueRange { + /// Operand definitely fits into BypassType. No runtime checks are needed. + VALRNG_SHORT, + /// A runtime check is required, as value range is unknown. + VALRNG_UNKNOWN, + /// Operand is unlikely to fit into BypassType. The bypassing should be + /// disabled. + VALRNG_LONG +}; + class FastDivInsertionTask { bool IsValidTask = false; Instruction *SlowDivOrRem = nullptr; IntegerType *BypassType = nullptr; BasicBlock *MainBB = nullptr; + ValueRange getValueRange(Value *Op); QuotRemWithBB createSlowBB(BasicBlock *Successor); QuotRemWithBB createFastBB(BasicBlock *Successor); QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, BasicBlock *PhiBB); - Value *insertOperandRuntimeCheck(); + Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); Optional insertFastDivAndRem(); bool isSignedOp() { @@ -175,6 +187,28 @@ return isDivisionOp() ? Value.Quotient : Value.Remainder; } +/// Check if an integer value fits into our bypass type. +ValueRange FastDivInsertionTask::getValueRange(Value *V) { + unsigned ShortLen = BypassType->getBitWidth(); + unsigned LongLen = V->getType()->getIntegerBitWidth(); + + assert(LongLen > ShortLen && "Value type must be wider than BypassType"); + unsigned HiBits = LongLen - ShortLen; + + const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); + APInt Zeros(LongLen, 0), Ones(LongLen, 0); + + computeKnownBits(V, Zeros, Ones, DL); + + if (Zeros.countLeadingOnes() >= HiBits) + return VALRNG_SHORT; + + if (Ones.countLeadingZeros() < HiBits) + return VALRNG_LONG; + + return VALRNG_UNKNOWN; +} + /// Add new basic block for slow div and rem operations and put it before /// SuccessorBB. QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { @@ -241,22 +275,17 @@ /// Creates a runtime check to test whether both the divisor and dividend fit /// into BypassType. The check is inserted at the end of MainBB. True return -/// value means that the operands fit. -Value *FastDivInsertionTask::insertOperandRuntimeCheck() { +/// value means that the operands fit. Either of the operands may be NULL if it +/// doesn't need a runtime check. +Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { + assert((Op1 || Op2) && "Nothing to check"); IRBuilder<> Builder(MainBB, MainBB->end()); - Value *Dividend = SlowDivOrRem->getOperand(0); - Value *Divisor = SlowDivOrRem->getOperand(1); - - // We should have bailed out above if the divisor is a constant, but the - // dividend may still be a constant. Set OrV to our non-constant operands - // OR'ed together. - assert(!isa(Divisor)); Value *OrV; - if (!isa(Dividend)) - OrV = Builder.CreateOr(Dividend, Divisor); + if (Op1 && Op2) + OrV = Builder.CreateOr(Op1, Op2); else - OrV = Divisor; + OrV = Op1 ? Op1 : Op2; // BitMask is inverted to check if the operands are // larger than the bypass type @@ -279,22 +308,72 @@ return None; } - // If the numerator is a constant, bail if it doesn't fit into BypassType. - if (ConstantInt *ConstDividend = dyn_cast(Dividend)) - if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth()) - return None; - - // Split the basic block before the div/rem. - BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); - // Remove the unconditional branch from MainBB to SuccessorBB. - MainBB->getInstList().back().eraseFromParent(); - QuotRemWithBB Fast = createFastBB(SuccessorBB); - QuotRemWithBB Slow = createSlowBB(SuccessorBB); - QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); - Value *CmpV = insertOperandRuntimeCheck(); - IRBuilder<> Builder(MainBB, MainBB->end()); - Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); - return Result; + ValueRange DividendRange = getValueRange(Dividend); + if (DividendRange == VALRNG_LONG) + return None; + + ValueRange DivisorRange = getValueRange(Divisor); + if (DivisorRange == VALRNG_LONG) + return None; + + bool DividendShort = (DividendRange == VALRNG_SHORT); + bool DivisorShort = (DivisorRange == VALRNG_SHORT); + + if (DividendShort && DivisorShort) { + // If both operands are known to be short then just replace the long + // division with a short one in-place. + + IRBuilder<> Builder(SlowDivOrRem); + Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); + Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); + Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); + Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); + Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); + Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); + return QuotRemPair(ExtDiv, ExtRem); + } else if (DividendShort && !isSignedOp()) { + // If the division is unsigned and Dividend is known to be short, then + // either + // 1) Divisor is less or equal to Dividend, and the result can be computed + // with a short division. + // 2) Divisor is greater than Dividend. In this case, no division is needed + // at all: The quotient is 0 and the remainder is equal to Dividend. + // + // So instead of checking at runtime whether Divisor fits into BypassType, + // we emit a runtime check to differentiate between these two cases. This + // lets us entirely avoid a long div. + + // Split the basic block before the div/rem. + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); + // Remove the unconditional branch from MainBB to SuccessorBB. + MainBB->getInstList().back().eraseFromParent(); + QuotRemWithBB Long; + Long.BB = MainBB; + Long.Quotient = ConstantInt::get(getSlowType(), 0); + Long.Remainder = Dividend; + QuotRemWithBB Fast = createFastBB(SuccessorBB); + QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); + IRBuilder<> Builder(MainBB, MainBB->end()); + Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); + Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); + return Result; + } else { + // General case. Create both slow and fast div/rem pairs and choose one of + // them at runtime. + + // Split the basic block before the div/rem. + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); + // Remove the unconditional branch from MainBB to SuccessorBB. + MainBB->getInstList().back().eraseFromParent(); + QuotRemWithBB Fast = createFastBB(SuccessorBB); + QuotRemWithBB Slow = createSlowBB(SuccessorBB); + QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); + Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, + DivisorShort ? nullptr : Divisor); + IRBuilder<> Builder(MainBB, MainBB->end()); + Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); + return Result; + } } /// This optimization identifies DIV/REM instructions in a BB that can be Index: test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll =================================================================== --- /dev/null +++ test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -codegenprepare < %s | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; No bypassing should be done in apparently unsuitable cases. +define void @Test_no_bypassing(i32 %a, i64 %b, i64* %retptr) { +; CHECK-LABEL: @Test_no_bypassing( +; CHECK-NEXT: [[A_1:%.*]] = zext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[A_2:%.*]] = sub i64 -1, [[A_1]] +; CHECK-NEXT: [[RES:%.*]] = srem i64 [[A_2]], [[B:%.*]] +; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]] +; CHECK-NEXT: ret void +; + %a.1 = zext i32 %a to i64 + ; %a.2 is always negative so the division cannot be bypassed. + %a.2 = sub i64 -1, %a.1 + %res = srem i64 %a.2, %b + store i64 %res, i64* %retptr + ret void +} + +; No OR instruction is needed if one of the operands (divisor) is known +; to fit into 32 bits. +define void @Test_check_one_operand(i64 %a, i32 %b, i64* %retptr) { +; CHECK-LABEL: @Test_check_one_operand( +; CHECK-NEXT: [[B_1:%.*]] = zext i32 [[B:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[A:%.*]], -4294967296 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]] +; CHECK: [[TMP4:%.*]] = trunc i64 [[B_1]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[A]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: br label [[TMP10:%.*]] +; CHECK: [[TMP9:%.*]] = sdiv i64 [[A]], [[B_1]] +; CHECK-NEXT: br label [[TMP10]] +; CHECK: [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ] +; CHECK-NEXT: store i64 [[TMP11]], i64* [[RETPTR:%.*]] +; CHECK-NEXT: ret void +; + %b.1 = zext i32 %b to i64 + %res = sdiv i64 %a, %b.1 + store i64 %res, i64* %retptr + ret void +} + +; If both operands are known to fit into 32 bits, then replace the division +; in-place without CFG modification. +define void @Test_check_none(i64 %a, i32 %b, i64* %retptr) { +; CHECK-LABEL: @Test_check_none( +; CHECK-NEXT: [[A_1:%.*]] = and i64 [[A:%.*]], 4294967295 +; CHECK-NEXT: [[B_1:%.*]] = zext i32 [[B:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A_1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[B_1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: store i64 [[TMP4]], i64* [[RETPTR:%.*]] +; CHECK-NEXT: ret void +; + %a.1 = and i64 %a, 4294967295 + %b.1 = zext i32 %b to i64 + %res = udiv i64 %a.1, %b.1 + store i64 %res, i64* %retptr + ret void +} + +; In case of unsigned long division with a short dividend, +; the long division is not needed any more. +define void @Test_special_case(i32 %a, i64 %b, i64* %retptr) { +; CHECK-LABEL: @Test_special_case( +; CHECK-NEXT: [[A_1:%.*]] = zext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = icmp uge i64 [[A_1]], [[B:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP9:%.*]] +; CHECK: [[TMP3:%.*]] = trunc i64 [[B]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[A_1]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = urem i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: br label [[TMP9]] +; CHECK: [[TMP10:%.*]] = phi i64 [ [[TMP7]], [[TMP2]] ], [ 0, [[TMP0:%.*]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP2]] ], [ [[A_1]], [[TMP0]] ] +; CHECK-NEXT: [[RES:%.*]] = add i64 [[TMP10]], [[TMP11]] +; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]] +; CHECK-NEXT: ret void +; + %a.1 = zext i32 %a to i64 + %div = udiv i64 %a.1, %b + %rem = urem i64 %a.1, %b + %res = add i64 %div, %rem + store i64 %res, i64* %retptr + ret void +}