Index: lib/Transforms/Scalar/IndVarSimplify.cpp
===================================================================
--- lib/Transforms/Scalar/IndVarSimplify.cpp
+++ lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -762,6 +762,8 @@
 
   Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
 
+  bool WidenLoopCompare(NarrowIVDefUse DU);
+
   void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
 };
 } // anonymous namespace
@@ -926,6 +928,35 @@
   DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
 }
 
+/// If the narrow use is a compare instruction and the compare has a single
+/// use, widen the compare instruction (and possibly the other operand).
+bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) {
+  ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+  if (!Cmp || !Cmp->hasOneUse())
+    return false;
+
+  // FIXME: Add support for unsigned compare.
+  bool IsSigned = CmpInst::isSigned(Cmp->getPredicate());
+  if (!IsSigned)
+    return false;
+
+  Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
+  unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+  unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+  assert(CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+  // Widen the compare instruction.
+  IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
+  DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+  // Widen the other operand of the compare, if necessary.
+  if (CastWidth < IVWidth) {
+    Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp);
+    DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+  }
+  return true;
+}
+
 /// WidenIVUse - Determine whether an individual user of the narrow IV can be
 /// widened. If so, return the wide clone of the user.
 Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
@@ -993,10 +1024,15 @@
 
   // Does this user itself evaluate to a recurrence after widening?
   const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse);
+  if (!WideAddRec)
+    WideAddRec = GetExtendedOperandRecurrence(DU);
+
   if (!WideAddRec) {
-    WideAddRec = GetExtendedOperandRecurrence(DU);
-  }
-  if (!WideAddRec) {
+    // If the use is a loop condition, try to promote the condition instead of
+    // truncating the IV first.
+    if (WidenLoopCompare(DU))
+      return nullptr;
+
     // This user does not evaluate to a recurrence after widening, so don't
     // follow it. Instead insert a Trunc to kill off the original use,
     // eventually isolating the original narrow IV so it can be removed.
Index: test/Transforms/IndVarSimplify/2011-09-27-hoistsext.ll
===================================================================
--- test/Transforms/IndVarSimplify/2011-09-27-hoistsext.ll
+++ test/Transforms/IndVarSimplify/2011-09-27-hoistsext.ll
@@ -12,6 +12,7 @@
 
 ; CHECK: for.body:
 ; CHECK-NOT: sext
+; CHECK: indvars.iv.next
 ; CHECK: br
 for.body:
   %i2.115 = phi i32 [ 0, %entry ], [ %add249, %for.body ]
Index: test/Transforms/IndVarSimplify/elim-extend.ll
===================================================================
--- test/Transforms/IndVarSimplify/elim-extend.ll
+++ test/Transforms/IndVarSimplify/elim-extend.ll
@@ -7,6 +7,7 @@
 define void @postincConstIV(i8* %base, i32 %limit) nounwind {
 entry:
   br label %loop
+; CHECK: sext i32
 ; CHECK: loop:
 ; CHECK-NOT: sext
 ; CHECK: exit:
Index: test/Transforms/IndVarSimplify/no-iv-rewrite.ll
===================================================================
--- test/Transforms/IndVarSimplify/no-iv-rewrite.ll
+++ test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -229,10 +229,10 @@
 ; loop and the OR instruction is replaced by an ADD keeping the result
 ; equivalent.
 ;
+; CHECK: sext
 ; CHECK: loop:
 ; CHECK: phi i64
 ; CHECK-NOT: sext
-; CHECK: icmp slt i32
 ; CHECK: exit:
 ; CHECK: add i64
 loop:
Index: test/Transforms/IndVarSimplify/verify-scev.ll
===================================================================
--- test/Transforms/IndVarSimplify/verify-scev.ll
+++ test/Transforms/IndVarSimplify/verify-scev.ll
@@ -380,11 +380,11 @@
 
 for.body65.lr.ph:                                 ; preds = %for.body48
   %0 = load i32* undef, align 4
+  %1 = sext i32 %0 to i64
   br label %for.body65.us
 
 for.body65.us:                                    ; preds = %for.inc219.us, %for.body65.lr.ph
-  %k.09.us = phi i32 [ %inc.us, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
-  %idxprom66.us = sext i32 %k.09.us to i64
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
   br i1 undef, label %for.inc219.us, label %if.end72.us
 
 if.end72.us:                                      ; preds = %for.body65.us
@@ -406,8 +406,8 @@
   br i1 undef, label %for.cond139.loopexit.us, label %for.cond152.us
 
 for.inc219.us:                                    ; preds = %for.cond139.loopexit.us, %if.end110.us, %if.end93.us, %for.body65.us
-  %inc.us = add nsw i32 %k.09.us, 1
-  %cmp64.us = icmp sgt i32 %inc.us, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp64.us = icmp sgt i64 %indvars.iv.next, %1
   br i1 %cmp64.us, label %for.inc221, label %for.body65.us
 
 for.cond139.loopexit.us:                          ; preds = %for.cond152.us
Index: test/Transforms/IndVarSimplify/widen-loop-comp.ll
===================================================================
--- /dev/null
+++ test/Transforms/IndVarSimplify/widen-loop-comp.ll
@@ -0,0 +1,137 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+target triple = "aarch64--linux-gnu"
+
+; Check that the loop exit i32 compare instruction and operand are widened to
+; i64 instead of truncating the IV before its use in the i32 compare instruction.
+
+@idx = common global i32 0, align 4
+@e = common global i32 0, align 4
+@ptr = common global i32* null, align 8
+
+; CHECK-LABEL: @test1
+; CHECK: for.body.lr.ph:
+; CHECK: sext i32
+; CHECK: for.cond:
+; CHECK: icmp slt i64
+; CHECK: for.body:
+; CHECK: phi i64
+
+define i32 @test1() {
+entry:
+  store i32 -1, i32* @idx, align 4
+  %0 = load i32* @e, align 4
+  %cmp4 = icmp slt i32 %0, 0
+  br i1 %cmp4, label %for.end.loopexit, label %for.body.lr.ph
+
+for.body.lr.ph:
+  %1 = load i32** @ptr, align 8
+  %2 = load i32* @e, align 4
+  br label %for.body
+
+for.cond:
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %i.05, %2
+  br i1 %cmp, label %for.body, label %for.cond.for.end.loopexit_crit_edge
+
+for.body:
+  %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.cond ]
+  %idxprom = sext i32 %i.05 to i64
+  %arrayidx = getelementptr inbounds i32* %1, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %3, 0
+  br i1 %tobool, label %if.then, label %for.cond
+
+if.then:
+  %i.05.lcssa = phi i32 [ %i.05, %for.body ]
+  store i32 %i.05.lcssa, i32* @idx, align 4
+  br label %for.end
+
+for.cond.for.end.loopexit_crit_edge:
+  br label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  %4 = load i32* @idx, align 4
+  ret i32 %4
+}
+
+; CHECK-LABEL: @test2
+; CHECK: for.body4.us
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %cmp2.us = icmp slt i64
+; CHECK-NOT: %2 = trunc i64 %indvars.iv.next to i32
+; CHECK-NOT: %cmp2.us = icmp slt i32
+
+define void @test2([8 x i8]* %a, i8* %b, i8 %limit) {
+entry:
+  %conv = zext i8 %limit to i32
+  %cmp23 = icmp eq i8 %limit, 0
+  br i1 %cmp23, label %for.cond1.preheader, label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+  %storemerge5.us = phi i32 [ 0, %entry ], [ %inc14.us, %for.inc13.us ]
+  br i1 true, label %for.body4.lr.ph.us, label %for.inc13.us
+
+for.inc13.us:
+  %inc14.us = add nsw i32 %storemerge5.us, 1
+  %cmp.us = icmp slt i32 %inc14.us, 4
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.end
+
+for.body4.us:
+  %storemerge14.us = phi i32 [ 0, %for.body4.lr.ph.us ], [ %inc.us, %for.body4.us ]
+  %idxprom.us = sext i32 %storemerge14.us to i64
+  %arrayidx6.us = getelementptr inbounds [8 x i8]* %a, i64 %idxprom5.us, i64 %idxprom.us
+  %0 = load i8* %arrayidx6.us, align 1
+  %idxprom7.us = zext i8 %0 to i64
+  %arrayidx8.us = getelementptr inbounds i8* %b, i64 %idxprom7.us
+  %1 = load i8* %arrayidx8.us, align 1
+  store i8 %1, i8* %arrayidx6.us, align 1
+  %inc.us = add nsw i32 %storemerge14.us, 1
+  %cmp2.us = icmp slt i32 %inc.us, %conv
+  br i1 %cmp2.us, label %for.body4.us, label %for.inc13.us
+
+for.body4.lr.ph.us:
+  %idxprom5.us = sext i32 %storemerge5.us to i64
+  br label %for.body4.us
+
+for.cond1.preheader:
+  %storemerge5 = phi i32 [ 0, %entry ], [ %inc14, %for.inc13 ]
+  br i1 false, label %for.inc13, label %for.inc13
+
+for.inc13:
+  %inc14 = add nsw i32 %storemerge5, 1
+  %cmp = icmp slt i32 %inc14, 4
+  br i1 %cmp, label %for.cond1.preheader, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @test3
+; CHECK: for.cond:
+; CHECK: phi i64
+; CHECK: icmp ne i64
+
+define i32 @test3(i32* %a) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, 1000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32* %a, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %sum.0, %0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret i32 %sum.0
+}
Index: test/Transforms/LoopSimplify/merge-exits.ll
===================================================================
--- test/Transforms/LoopSimplify/merge-exits.ll
+++ test/Transforms/LoopSimplify/merge-exits.ll
@@ -1,6 +1,4 @@
-; RUN: opt < %s -loop-simplify -loop-rotate -instcombine -indvars -S -verify-loop-info -verify-dom-info > %t
-; RUN: not grep sext %t
-; RUN: grep "phi i64" %t | count 1
+; RUN: opt < %s -loop-simplify -loop-rotate -instcombine -indvars -S -verify-loop-info -verify-dom-info | FileCheck %s
 
 ; Loopsimplify should be able to merge the two loop exits
 ; into one, so that loop rotate can rotate the loop, so
@@ -9,36 +7,42 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n32:64"
 
-define float @t(float* %pTmp1, float* %peakWeight, i32 %bandEdgeIndex) nounwind {
+; CHECK-LABEL: @test1
+; CHECK: bb
+; CHECK: phi i64
+; CHECK-NOT: phi i64
+; CHECK-NOT: sext
+
+define float @test1(float* %pTmp1, float* %peakWeight, i32 %bandEdgeIndex) nounwind {
 entry:
-  %t0 = load float* %peakWeight, align 4          ; <float> [#uses=1]
+  %t0 = load float* %peakWeight, align 4
   br label %bb1
 
-bb:                                               ; preds = %bb2
-  %t1 = sext i32 %hiPart.0 to i64                 ; <i64> [#uses=1]
-  %t2 = getelementptr float* %pTmp1, i64 %t1      ; <float*> [#uses=1]
-  %t3 = load float* %t2, align 4                  ; <float> [#uses=1]
-  %t4 = fadd float %t3, %distERBhi.0              ; <float> [#uses=1]
-  %t5 = add i32 %hiPart.0, 1                      ; <i32> [#uses=2]
-  %t6 = sext i32 %t5 to i64                       ; <i64> [#uses=1]
-  %t7 = getelementptr float* %peakWeight, i64 %t6 ; <float*> [#uses=1]
-  %t8 = load float* %t7, align 4                  ; <float> [#uses=1]
-  %t9 = fadd float %t8, %peakCount.0              ; <float> [#uses=1]
+bb:
+  %t1 = sext i32 %hiPart.0 to i64
+  %t2 = getelementptr float* %pTmp1, i64 %t1
+  %t3 = load float* %t2, align 4
+  %t4 = fadd float %t3, %distERBhi.0
+  %t5 = add i32 %hiPart.0, 1
+  %t6 = sext i32 %t5 to i64
+  %t7 = getelementptr float* %peakWeight, i64 %t6
+  %t8 = load float* %t7, align 4
+  %t9 = fadd float %t8, %peakCount.0
   br label %bb1
 
-bb1:                                              ; preds = %bb, %entry
-  %peakCount.0 = phi float [ %t0, %entry ], [ %t9, %bb ]          ; <float> [#uses=2]
-  %hiPart.0 = phi i32 [ 0, %entry ], [ %t5, %bb ]                 ; <i32> [#uses=3]
-  %distERBhi.0 = phi float [ 0.000000e+00, %entry ], [ %t4, %bb ] ; <float> [#uses=3]
-  %t10 = fcmp uge float %distERBhi.0, 2.500000e+00                ; <i1> [#uses=1]
+bb1:
+  %peakCount.0 = phi float [ %t0, %entry ], [ %t9, %bb ]
+  %hiPart.0 = phi i32 [ 0, %entry ], [ %t5, %bb ]
+  %distERBhi.0 = phi float [ 0.000000e+00, %entry ], [ %t4, %bb ]
+  %t10 = fcmp uge float %distERBhi.0, 2.500000e+00
   br i1 %t10, label %bb3, label %bb2
 
-bb2:                                              ; preds = %bb1
-  %t11 = add i32 %bandEdgeIndex, -1               ; <i32> [#uses=1]
-  %t12 = icmp sgt i32 %t11, %hiPart.0             ; <i1> [#uses=1]
+bb2:
+  %t11 = add i32 %bandEdgeIndex, -1
+  %t12 = icmp sgt i32 %t11, %hiPart.0
   br i1 %t12, label %bb, label %bb3
 
-bb3:                                              ; preds = %bb2, %bb1
-  %t13 = fdiv float %peakCount.0, %distERBhi.0    ; <float> [#uses=1]
+bb3:
+  %t13 = fdiv float %peakCount.0, %distERBhi.0
   ret float %t13
 }
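
A minimal before/after sketch of what WidenLoopCompare enables, hand-written in
the style of the tests above (the names %i, %limit, and %limit.wide are
illustrative; this is not verbatim -indvars output):

  ; Before: the exit compare is i32, so once the IV is widened to i64 the
  ; pass would otherwise have to truncate the wide IV back for the compare:
  ;   %inc = add nsw i32 %i, 1
  ;   %cmp = icmp slt i32 %inc, %limit
  ;   br i1 %cmp, label %loop, label %exit
  ;
  ; After: when the signed compare is the narrow def's use, the compare is
  ; widened along with the IV and the other operand is sign-extended once,
  ; outside the loop (as in the verify-scev.ll change above):
  ;   %limit.wide = sext i32 %limit to i64
  ;   ...
  ;   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ;   %cmp = icmp slt i64 %indvars.iv.next, %limit.wide
  ;   br i1 %cmp, label %loop, label %exit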