diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -2013,6 +2013,7 @@ SmallVector ScalarEvolutionIVs; void OptimizeShadowIV(); + void optimizeDiv(); bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse); ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); void OptimizeLoopTermCond(); @@ -2243,6 +2244,140 @@ } } +/// Transform divide instruction to add instruction. +/// for (int k = 0; k < m; ++k) { +/// int div = k / KW; +/// } +/// --> +/// for (int k = 0, cnt = 0, div = 0; k < m; ++k, ++i) { +/// if (i == KW) { +/// ++div; +/// cnt = 0; +/// } +/// } +void LSRInstance::optimizeDiv() { + const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); + if (isa(BackedgeTakenCount)) + return; + + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; + /* empty */) { + IVUsers::const_iterator CandidateUI = UI; + ++UI; + Instruction *Div = CandidateUI->getUser(); + bool IsSigned = false; + Type *Ty = Div->getType(); + + if (Div->getOpcode() == Instruction::UDiv) + IsSigned = false; + else if (Div->getOpcode() == Instruction::SDiv) + IsSigned = true; + else + continue; + + PHINode *PH = dyn_cast(Div->getOperand(0)); + if (!PH) + continue; + if (PH->getNumIncomingValues() != 2) + continue; + + const SCEVAddRecExpr *AR = dyn_cast(SE.getSCEV(PH)); + if (!AR) + continue; + // TODO, support negative and non-one positive step + if (!AR->getStepRecurrence(SE)->isOne()) + continue; + // Drop the transform if there is overflow. + if (IsSigned && !AR->hasNoSignedWrap()) + continue; + if (!IsSigned && !AR->hasNoUnsignedWrap()) + continue; + + if (TTI.hasDivRemOp(Ty, IsSigned)) + continue; + + const SCEV *Divisor = SE.getSCEV(Div->getOperand(1)); + if (!SE.isLoopInvariant(Divisor, L)) + continue; + + unsigned Entry, Latch; + if (PH->getIncomingBlock(0) == L->getLoopPreheader()) { + Entry = 0; + Latch = 1; + } else { + Entry = 1; + Latch = 0; + } + + ConstantInt *Init = dyn_cast(PH->getIncomingValue(Entry)); + if (!Init) + continue; + // TODO: suppport any constant initial value. + if (!Init->isZero() && !Init->isOne()) + continue; + + // 7: + // %8 = phi i32 [ %12, %7 ], [ 0, %3 ] + // %div = sdiv i32 %8, %1 + // latch + // %12 = add nuw nsw i32 %8, 1 + // %13 = icmp eq i32 %12, %4 + // br i1 %13, label %6, label %7 + // --> + // 6: + // %divph = phi i32 [ %div, %6 ], [ 0, %3 ] + // %cntph = phi i32 [ %cnt, %6 ], [ 0, %3 ] + // %k = phi i32 [ %15, %6 ], [ 0, %3 ] + // %10 = icmp eq i32 %cntph, %1 + // %11 = zext i1 %10 to i32 # step 1 + // %div = add nuw nsw i32 %divph, %11 # div++ + // %15 = add nuw nsw i32 %k, 1 + // %16 = add nsw i32 %cntph, 1 + // %cnt = select i1 %10, i32 1, i32 %16 + // latch + // %18 = icmp eq i32 %15, %0 + // br i1 %18, label %5, label %6 + + // %7 = phi i32 [ %div, %6 ], [ 0, %3 ] + PHINode *DivPH = PHINode::Create(Ty, 2, "div", PH); + DivPH->addIncoming(Init, PH->getIncomingBlock(Entry)); + + // %cntph = phi i32 [ %cnt, %6 ], [ 0, %3 ] + PHINode *CountPH = PHINode::Create(Ty, 2, "cnt", PH); + CountPH->addIncoming(Init, PH->getIncomingBlock(Entry)); + + // %10 = icmp eq i32 %cnt, %1 + ICmpInst *Cond = new ICmpInst(Div, ICmpInst::ICMP_EQ, CountPH, + Div->getOperand(1), "icmp"); + // %11 = zext i1 %10 to i32 + Instruction *ZExt = + CastInst::Create(Instruction::ZExt, Cond, Ty, "zext", Div); + // %div = add nuw nsw i32 %divph, %11 + Instruction *DivRes = + BinaryOperator::Create(Instruction::Add, DivPH, ZExt, "add1", Div); + DivPH->addIncoming(DivRes, PH->getIncomingBlock(Latch)); + // %15 = add nuw nsw i32 %k, 1 + Instruction *CntAdd = BinaryOperator::Create( + Instruction::Add, CountPH, ConstantInt::get(Ty, 1), "add2", Div); + if (IsSigned) { + DivRes->setHasNoSignedWrap(); + CntAdd->setHasNoSignedWrap(); + } else { + DivRes->setHasNoUnsignedWrap(); + CntAdd->setHasNoUnsignedWrap(); + } + // %cnt = select i1 %10, i32 1, i32 %16 + SelectInst *Sel = + SelectInst::Create(Cond, ConstantInt::get(Ty, 1), CntAdd, "sel", Div); + CountPH->addIncoming(Sel, PH->getIncomingBlock(Latch)); + + Div->replaceAllUsesWith(DivRes); + Div->eraseFromParent(); + + Changed = true; + } +} + /// If Cond has an operand that is an expression of an IV, set the IV user and /// stride information and return true, otherwise return false. bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { @@ -5911,6 +6046,7 @@ // First, perform some low-level loop optimizations. OptimizeShadowIV(); + optimizeDiv(); OptimizeLoopTermCond(); // If loop preparation eliminates all interesting IV users, bail. diff --git a/llvm/test/Transforms/LoopStrengthReduce/div.ll b/llvm/test/Transforms/LoopStrengthReduce/div.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/div.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +; Provide legal integer types. +target datalayout = "n8:16:32:64" + +define void @sdiv(i32 %0, i32 %1, ptr %2) { +; CHECK-LABEL: define void @sdiv +; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], ptr [[TMP2:%.*]]) { +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[DOTPREHEADER:%.*]], label [[TMP6:%.*]] +; CHECK: .preheader: +; CHECK-NEXT: br label [[TMP7:%.*]] +; CHECK: .loopexit: +; CHECK-NEXT: br label [[TMP6]] +; CHECK: 6: +; CHECK-NEXT: ret void +; CHECK: 7: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP4]], [[DOTPREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[TMP7]] ] +; CHECK-NEXT: [[DIV:%.*]] = phi i32 [ 0, [[DOTPREHEADER]] ], [ [[ADD1:%.*]], [[TMP7]] ] +; CHECK-NEXT: [[CNT:%.*]] = phi i32 [ 0, [[DOTPREHEADER]] ], [ [[SEL:%.*]], [[TMP7]] ] +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[CNT]], [[TMP1]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[ICMP]] to i32 +; CHECK-NEXT: [[ADD1]] = add nsw i32 [[DIV]], [[ZEXT]] +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[CNT]], 1 +; CHECK-NEXT: [[SEL]] = select i1 [[ICMP]], i32 1, i32 [[ADD2]] +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[ADD1]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] +; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label [[DOTLOOPEXIT:%.*]], label [[TMP7]] +; + %4 = mul nsw i32 %1, %0 + %5 = icmp sgt i32 %4, 0 + br i1 %5, label %7, label %6 + +6: ; preds = %7, %3 + ret void + +7: ; preds = %7, %3 + %8 = phi i32 [ %12, %7 ], [ 0, %3 ] + %9 = sdiv i32 %8, %1 + %10 = sext i32 %9 to i64 + %11 = getelementptr inbounds i32, ptr %2, i64 %10 + store i32 0, ptr %11, align 4 + %12 = add nuw nsw i32 %8, 1 + %13 = icmp eq i32 %12, %4 + br i1 %13, label %6, label %7 +} + +define void @udiv(i32 %0, i32 %1, ptr %2) { +; CHECK-LABEL: define void @udiv +; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], ptr [[TMP2:%.*]]) { +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[DOTPREHEADER:%.*]], label [[TMP6:%.*]] +; CHECK: .preheader: +; CHECK-NEXT: br label [[TMP7:%.*]] +; CHECK: .loopexit: +; CHECK-NEXT: br label [[TMP6]] +; CHECK: 6: +; CHECK-NEXT: ret void +; CHECK: 7: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[TMP4]], [[DOTPREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[TMP7]] ] +; CHECK-NEXT: [[DIV:%.*]] = phi i32 [ 0, [[DOTPREHEADER]] ], [ [[ADD1:%.*]], [[TMP7]] ] +; CHECK-NEXT: [[CNT:%.*]] = phi i32 [ 0, [[DOTPREHEADER]] ], [ [[SEL:%.*]], [[TMP7]] ] +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[CNT]], [[TMP1]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[ICMP]] to i32 +; CHECK-NEXT: [[ADD1]] = add nuw i32 [[DIV]], [[ZEXT]] +; CHECK-NEXT: [[ADD2:%.*]] = add nuw i32 [[CNT]], 1 +; CHECK-NEXT: [[SEL]] = select i1 [[ICMP]], i32 1, i32 [[ADD2]] +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[ADD1]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] +; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label [[DOTLOOPEXIT:%.*]], label [[TMP7]] +; + %4 = mul nsw i32 %1, %0 + %5 = icmp sgt i32 %4, 0 + br i1 %5, label %7, label %6 + +6: ; preds = %7, %3 + ret void + +7: ; preds = %7, %3 + %8 = phi i32 [ %12, %7 ], [ 0, %3 ] + %9 = udiv i32 %8, %1 + %10 = sext i32 %9 to i64 + %11 = getelementptr inbounds i32, ptr %2, i64 %10 + store i32 0, ptr %11, align 4 + %12 = add nuw nsw i32 %8, 1 + %13 = icmp eq i32 %12, %4 + br i1 %13, label %6, label %7 +}