Index: lib/Transforms/Scalar/LoopRerollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopRerollPass.cpp +++ lib/Transforms/Scalar/LoopRerollPass.cpp @@ -512,11 +512,11 @@ // Check if an IV is only used to control the loop. There are two cases: // 1. It only has one use which is loop increment, and the increment is only -// used by comparison and the PHI, and the comparison is only used by branch. +// used by comparison and the PHI (could has sext with nsw in between), and the +// comparison is only used by branch. // 2. It is used by loop increment and the comparison, the loop increment is // only used by the PHI, and the comparison is used only by the branch. bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { - unsigned IVUses = IV->getNumUses(); if (IVUses != 2 && IVUses != 1) return false; @@ -551,9 +551,17 @@ if (PN != IV) return false; } - // Must be a CMP - else if (!isCompareUsedByBranch(dyn_cast(UU))) - return false; + // Must be a CMP or an ext (of a value with nsw) then CMP + else { + Instruction *UUser = dyn_cast(UU); + // Skip SExt if we are extending an nsw value + // TODO: Allow ZExt too + if (BO->hasNoSignedWrap() && UUser && UUser->getNumUses() == 1 && + isa(UUser)) + UUser = dyn_cast(*(UUser->user_begin())); + if (!isCompareUsedByBranch(dyn_cast(UUser))) + return false; + } } } else return false; @@ -1161,6 +1169,11 @@ Instruction *UUser = dyn_cast(UU); // UUser could be compare, PHI or branch Uses[UUser].set(IL_All); + // Skip SExt + if (isa(UUser)) { + UUser = dyn_cast(*(UUser->user_begin())); + Uses[UUser].set(IL_All); + } // Is UUser a compare instruction? if (UU->hasOneUse()) { Instruction *BI = dyn_cast(*UUser->user_begin()); @@ -1502,8 +1515,6 @@ if (NeedNewIV) ICSCEV = SE->getMulExpr(IterCount, SE->getConstant(IterCount->getType(), Scale)); - else - ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); // Iteration count SCEV minus or plus 1 const SCEV *MinusPlus1SCEV = @@ -1515,17 +1526,25 @@ const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV); // Iteration count minus 1 - Value *ICMinusPlus1 = nullptr; + Instruction *InsertPtr = nullptr; if (isa(ICMinusPlus1SCEV)) { - ICMinusPlus1 = - Expander.expandCodeFor(ICMinusPlus1SCEV, NewIV->getType(), BI); + InsertPtr = BI; } else { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); - ICMinusPlus1 = Expander.expandCodeFor( - ICMinusPlus1SCEV, NewIV->getType(), Preheader->getTerminator()); + InsertPtr = Preheader->getTerminator(); + } + + if (!isa(NewIV->getType()) && NeedNewIV && + (SE->getTypeSizeInBits(NewIV->getType()) < + SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) { + IRBuilder<> Builder(BI); + Builder.SetCurrentDebugLocation(BI->getDebugLoc()); + NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType()); } + Value *ICMinusPlus1 = Expander.expandCodeFor( + ICMinusPlus1SCEV, NewIV->getType(), InsertPtr); Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond"); @@ -1705,6 +1724,7 @@ const SCEV *LIBETC = SE->getBackedgeTakenCount(L); const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType())); + DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); // First, we need to find the induction variable with respect to which we can @@ -1731,6 +1751,7 @@ Changed = true; break; } + DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); // Trip count of L has changed so SE must be re-evaluated. if (Changed) Index: test/Transforms/LoopReroll/indvar_with_ext.ll =================================================================== --- /dev/null +++ test/Transforms/LoopReroll/indvar_with_ext.ll @@ -0,0 +1,186 @@ +; RUN: opt -S -loop-reroll %s | FileCheck %s +target triple = "aarch64--linux-gnu" + +define void @test(i32 %n, float* %arrayidx200, float* %arrayidx164, float* %arrayidx172) { +entry: + %rem.i = srem i32 %n, 4 + %t22 = load float, float* %arrayidx172, align 4 + %cmp.9 = icmp eq i32 %n, 0 + %t7 = sext i32 %n to i64 + br i1 %cmp.9, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: +;CHECK-LABEL: while.body: +;CHECK-NEXT: %indvars.iv.i423 = phi i64 [ %indvars.iv.next.i424, %while.body ], [ 0, %while.body.preheader ] +;CHECK-NEXT: [[T1:%[0-9]+]] = trunc i64 %indvars.iv.i423 to i32 +;CHECK-NEXT: %arrayidx62.i = getelementptr inbounds float, float* %arrayidx200, i64 %indvars.iv.i423 +;CHECK-NEXT: %t1 = load float, float* %arrayidx62.i, align 4 +;CHECK-NEXT: %arrayidx64.i = getelementptr inbounds float, float* %arrayidx164, i64 %indvars.iv.i423 +;CHECK-NEXT: %t2 = load float, float* %arrayidx64.i, align 4 +;CHECK-NEXT: %mul65.i = fmul fast float %t2, %t22 +;CHECK-NEXT: %add66.i = fadd fast float %mul65.i, %t1 +;CHECK-NEXT: store float %add66.i, float* %arrayidx62.i, align 4 +;CHECK-NEXT: %indvars.iv.next.i424 = add i64 %indvars.iv.i423, 1 +;CHECK-NEXT: [[T2:%[0-9]+]] = sext i32 [[T1]] to i64 +;CHECK-NEXT: %exitcond = icmp eq i64 [[T2]], %{{[0-9]+}} +;CHECK-NEXT: br i1 %exitcond, label %while.end.loopexit, label %while.body + + %indvars.iv.i423 = phi i64 [ %indvars.iv.next.i424, %while.body ], [ 0, %while.body.preheader ] + %i.22.i = phi i32 [ %add103.i, %while.body ], [ %rem.i, %while.body.preheader ] + %arrayidx62.i = getelementptr inbounds float, float* %arrayidx200, i64 %indvars.iv.i423 + %t1 = load float, float* %arrayidx62.i, align 4 + %arrayidx64.i = getelementptr inbounds float, float* %arrayidx164, i64 %indvars.iv.i423 + %t2 = load float, float* %arrayidx64.i, align 4 + %mul65.i = fmul fast float %t2, %t22 + %add66.i = fadd fast float %mul65.i, %t1 + store float %add66.i, float* %arrayidx62.i, align 4 + %t3 = add nsw i64 %indvars.iv.i423, 1 + %arrayidx71.i = getelementptr inbounds float, float* %arrayidx200, i64 %t3 + %t4 = load float, float* %arrayidx71.i, align 4 + %arrayidx74.i = getelementptr inbounds float, float* %arrayidx164, i64 %t3 + %t5 = load float, float* %arrayidx74.i, align 4 + %mul75.i = fmul fast float %t5, %t22 + %add76.i = fadd fast float %mul75.i, %t4 + store float %add76.i, float* %arrayidx71.i, align 4 + %add103.i = add nsw i32 %i.22.i, 2 + %t6 = sext i32 %add103.i to i64 + %cmp58.i = icmp slt i64 %t6, %t7 + %indvars.iv.next.i424 = add i64 %indvars.iv.i423, 2 + br i1 %cmp58.i, label %while.body, label %while.end.loopexit + +while.end.loopexit: + br label %while.end + +while.end: + ret void +} + +; Function Attrs: noinline norecurse nounwind +define i32 @test2(i64 %n, i32* nocapture %x, i32* nocapture readonly %y) { +entry: + %cmp18 = icmp sgt i64 %n, 0 + br i1 %cmp18, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + +;CHECK: for.body: +;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ] +;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvar +;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, i32* %arrayidx, align 4 +;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, i32* %x, i64 %indvar +;CHECK-NEXT: store i32 [[T1]], i32* %arrayidx3, align 4 +;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 +;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} +;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body + + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %x, i64 %indvars.iv + store i32 %0, i32* %arrayidx3, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, i32* %y, i64 %1 + %2 = load i32, i32* %arrayidx5, align 4 + %arrayidx8 = getelementptr inbounds i32, i32* %x, i64 %1 + store i32 %2, i32* %arrayidx8, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, %n + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32 0 +} + +; Function Attrs: noinline norecurse nounwind +define i32 @test3(i32 %n, i32* nocapture %x, i32* nocapture readonly %y) { +entry: + %cmp21 = icmp sgt i32 %n, 0 + br i1 %cmp21, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + +;CHECK: for.body: +;CHECK: %add12 = add i8 %i.022, 2 +;CHECK-NEXT: %conv = sext i8 %add12 to i32 +;CHECK-NEXT: %cmp = icmp slt i32 %conv, %n +;CHECK-NEXT: br i1 %cmp, label %for.body, label %for.end.loopexit + + %conv23 = phi i32 [ %conv, %for.body ], [ 0, %for.body.preheader ] + %i.022 = phi i8 [ %add12, %for.body ], [ 0, %for.body.preheader ] + %idxprom = sext i8 %i.022 to i64 + %arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %x, i64 %idxprom + store i32 %0, i32* %arrayidx3, align 4 + %add = or i32 %conv23, 1 + %idxprom5 = sext i32 %add to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %y, i64 %idxprom5 + %1 = load i32, i32* %arrayidx6, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %x, i64 %idxprom5 + store i32 %1, i32* %arrayidx10, align 4 + %add12 = add i8 %i.022, 2 + %conv = sext i8 %add12 to i32 + %cmp = icmp slt i32 %conv, %n + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32 0 +} + +; Function Attrs: noinline norecurse nounwind +define i32 @test4(i64 %n, i32* nocapture %x, i32* nocapture readonly %y) { +entry: + %cmp18 = icmp eq i64 %n, 0 + br i1 %cmp18, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + +;CHECK: for.body: +;CHECK-NEXT: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.preheader ] +;CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvar +;CHECK-NEXT: [[T1:%[0-9]+]] = load i32, i32* %arrayidx, align 4 +;CHECK-NEXT: %arrayidx3 = getelementptr inbounds i32, i32* %x, i64 %indvar +;CHECK-NEXT: store i32 [[T1]], i32* %arrayidx3, align 4 +;CHECK-NEXT: %indvar.next = add i64 %indvar, 1 +;CHECK-NEXT: %exitcond = icmp eq i64 %indvar, %{{[0-9]+}} +;CHECK-NEXT: br i1 %exitcond, label %for.end.loopexit, label %for.body + + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %x, i64 %indvars.iv + store i32 %0, i32* %arrayidx3, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, i32* %y, i64 %1 + %2 = load i32, i32* %arrayidx5, align 4 + %arrayidx8 = getelementptr inbounds i32, i32* %x, i64 %1 + store i32 %2, i32* %arrayidx8, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %n + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32 0 +} +