Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -5561,6 +5561,46 @@ Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, MSSAU); + + // In our cost analysis above, we assume that each addrec consumes exactly + // one register, and arrange to have increments inserted just before the + // latch to maximimize the chance this is true. However, if we reused + // existing IVs, we now need to move the increments to match our + // expectations. Otherwise, our cost modeling results in us having a + // chosen a non-optimal result for the actual schedule. (And yes, this + // scheduling decision does impact later codegen.) + for (PHINode &PN : L->getHeader()->phis()) { + // First, filter out anything not an obvious addrec + if (!SE.isSCEVable(PN.getType()) || !isa(SE.getSCEV(&PN))) + continue; + Instruction *IncV = + dyn_cast(PN.getIncomingValueForBlock(L->getLoopLatch())); + if (!IncV || (IncV->getOpcode() != Instruction::Add && + IncV->getOpcode() != Instruction::Sub)) + continue; + + unsigned PhiOp = IncV->getOperand(0) == &PN ? 0 : 1; + if (IncV->getOperand(PhiOp) != &PN) + continue; + + if (!isa(IncV->getOperand(1-PhiOp))) + // If not a constant step, might increase register pressure + continue; + + if (IncV->getParent() == IVIncInsertPos->getParent()) + // Only bother moving across blocks. Isel can handle block local case. + continue; + + // Can we legally schedule inc at the desired point? + if (!DT.dominates(IncV, IVIncInsertPos) || + !llvm::all_of(IncV->uses(), + [&](Use &U) {return DT.dominates(IVIncInsertPos, U);})) + continue; + IncV->moveBefore(IVIncInsertPos); + Changed = true; + } + + } LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, Index: llvm/test/Transforms/LoopStrengthReduce/post-increment-insertion.ll =================================================================== --- llvm/test/Transforms/LoopStrengthReduce/post-increment-insertion.ll +++ llvm/test/Transforms/LoopStrengthReduce/post-increment-insertion.ll @@ -5,7 +5,6 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" target triple = "x86_64-unknown-linux-gnu" -; FIXME: iv.next is supposed to be inserted in the backedge. define i32 @test_01(i32* %p, i64 %len, i32 %x) { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: @@ -13,13 +12,13 @@ ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0 ; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i32, i32* [[SCEVGEP]], i64 [[IV]] ; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[SCEVGEP1]] unordered, align 4 ; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]] +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret i32 -1 @@ -137,3 +136,128 @@ failure: ; preds = %backedge unreachable } + +define i32 @test_04(i32* %p, i64 %len, i32 %x) { +; CHECK-LABEL: @test_04( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 -1 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i32, i32* [[SCEVGEP]], i64 [[IV]] +; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[SCEVGEP1]] unordered, align 4 +; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]] +; CHECK-NEXT: [[IV_NEXT]] = sub i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret i32 -1 +; CHECK: failure: +; CHECK-NEXT: unreachable +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ] + %iv.next = sub i64 %iv, 1 + %cond_1 = icmp eq i64 %iv, 0 + br i1 %cond_1, label %exit, label %backedge + +backedge: ; preds = %loop + %addr = getelementptr inbounds i32, i32* %p, i64 %iv.next + %loaded = load atomic i32, i32* %addr unordered, align 4 + %cond_2 = icmp eq i32 %loaded, %x + br i1 %cond_2, label %failure, label %loop + +exit: ; preds = %loop + ret i32 -1 + +failure: ; preds = %backedge + unreachable +} + +define i32 @test_05(i32* %p, i64 %len, i32 %x) { +; CHECK-LABEL: @test_05( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = mul i64 [[IV]], 2 +; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[ADDR]] unordered, align 4 +; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]] +; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret i32 -1 +; CHECK: failure: +; CHECK-NEXT: unreachable +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ] + %iv.next = mul i64 %iv, 2 + %cond_1 = icmp eq i64 %iv, 0 + br i1 %cond_1, label %exit, label %backedge + +backedge: ; preds = %loop + %addr = getelementptr inbounds i32, i32* %p, i64 %iv.next + %loaded = load atomic i32, i32* %addr unordered, align 4 + %cond_2 = icmp eq i32 %loaded, %x + br i1 %cond_2, label %failure, label %loop + +exit: ; preds = %loop + ret i32 -1 + +failure: ; preds = %backedge + unreachable +} + +define i32 @test_06(i32* %p, i64 %len, i32 %x, i64 %step) { +; CHECK-LABEL: @test_06( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[STEP:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[STEP]] +; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[STEP]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i32, i32* [[SCEVGEP]], i64 [[IV]] +; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[SCEVGEP1]] unordered, align 4 +; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]] +; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret i32 -1 +; CHECK: failure: +; CHECK-NEXT: unreachable +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ] + %iv.next = add nsw i64 %iv, %step + %cond_1 = icmp eq i64 %iv, 0 + br i1 %cond_1, label %exit, label %backedge + +backedge: ; preds = %loop + %addr = getelementptr inbounds i32, i32* %p, i64 %iv.next + %loaded = load atomic i32, i32* %addr unordered, align 4 + %cond_2 = icmp eq i32 %loaded, %x + br i1 %cond_2, label %failure, label %loop + +exit: ; preds = %loop + ret i32 -1 + +failure: ; preds = %backedge + unreachable +}