diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -5561,6 +5561,46 @@
   Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
                                                                   &TLI, MSSAU);
+
+  // In our cost analysis above, we assume that each addrec consumes exactly
+  // one register, and arrange to have increments inserted just before the
+  // latch to maximize the chance this is true. However, if we reused
+  // existing IVs, we now need to move the increments to match our
+  // expectations. Otherwise, our cost modeling results in us having chosen
+  // a non-optimal result for the actual schedule. (And yes, this
+  // scheduling decision does impact later codegen.)
+  for (PHINode &PN : L->getHeader()->phis()) {
+    // First, filter out anything not an obvious addrec.
+    if (!SE.isSCEVable(PN.getType()) || !isa<SCEVAddRecExpr>(SE.getSCEV(&PN)))
+      continue;
+    Instruction *IncV =
+        dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
+    if (!IncV)
+      continue;
+
+    if (IncV->getOpcode() != Instruction::Add &&
+        IncV->getOpcode() != Instruction::Sub)
+      continue;
+
+    if (IncV->getOperand(0) != &PN &&
+        !isa<Constant>(IncV->getOperand(1)))
+      // If not a constant step, might increase register pressure
+      // (We assume constants have been canonicalized to RHS)
+      continue;
+
+    if (IncV->getParent() == IVIncInsertPos->getParent())
+      // Only bother moving across blocks. Isel can handle the block-local case.
+      continue;
+
+    // Can we legally schedule the increment at the desired point?
+    if (!llvm::all_of(IncV->uses(),
+                      [&](Use &U) { return DT.dominates(IVIncInsertPos, U); }))
+      continue;
+    IncV->moveBefore(IVIncInsertPos);
+    Changed = true;
+  }
+
+
 }
 
 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
diff --git a/llvm/test/Transforms/LoopStrengthReduce/post-increment-insertion.ll b/llvm/test/Transforms/LoopStrengthReduce/post-increment-insertion.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/post-increment-insertion.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/post-increment-insertion.ll
@@ -13,13 +13,13 @@
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
 ; CHECK-NEXT:    [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
 ; CHECK-NEXT:    br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
 ; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i32, i32* [[SCEVGEP]], i64 [[IV]]
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[SCEVGEP1]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
 ; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 -1
@@ -145,13 +145,13 @@
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = sub i64 [[IV]], 1
 ; CHECK-NEXT:    [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
 ; CHECK-NEXT:    br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
 ; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i32, i32* [[SCEVGEP]], i64 [[IV]]
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[SCEVGEP1]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = sub i64 [[IV]], 1
 ; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 -1
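Illustrative note (not part of the patch): the FileCheck diffs above encode the schedule change the new code produces. Below is a minimal hand-written sketch of the same loop shape; the function @count_down_sketch and its labels are hypothetical, modeled on the test. With the patch applied, `opt -loop-reduce` is expected to leave the increment in the latch rather than beside the phi:

define i64 @count_down_sketch(i64 %len) {
entry:
  br label %loop

loop:                                             ; preds = %backedge, %entry
  %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
  %cond.1 = icmp eq i64 %iv, 0
  br i1 %cond.1, label %exit, label %backedge

backedge:                                         ; preds = %loop
  ; The increment now sits here in the latch, just before the backedge,
  ; instead of in %loop beside the phi, so only one value of the addrec
  ; (%iv) is live across the conditional branch above.
  %iv.next = add nsw i64 %iv, -1
  br label %loop

exit:                                             ; preds = %loop
  ret i64 0
}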