Index: llvm/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1284,7 +1284,29 @@
                                                  Value *Arg0, Value *Arg1,
                                                  CmpInst *Cmp,
                                                  Intrinsic::ID IID) {
-  if (BO->getParent() != Cmp->getParent()) {
+  auto isIVIncrement = [this, &Cmp](BinaryOperator *BO) {
+    auto *PN = dyn_cast<PHINode>(BO->getOperand(0));
+    if (!PN)
+      return false;
+    const Loop *L = LI->getLoopFor(BO->getParent());
+    if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
+      return false;
+    if (PN->getIncomingValueForBlock(L->getLoopLatch()) != BO)
+      return false;
+    if (auto *Step = dyn_cast<Instruction>(BO->getOperand(1)))
+      if (L->contains(Step->getParent()))
+        return false;
+    // IV increment may have other users than the IV. We do not want to make
+    // dominance queries to analyze the legality of moving it towards the cmp,
+    // so just check that there are no other users.
+    if (!BO->hasOneUse())
+      return false;
+    // Do not risk moving the increment into a child loop.
+    if (LI->getLoopFor(Cmp->getParent()) != L)
+      return false;
+    return true;
+  };
+  if (BO->getParent() != Cmp->getParent() && !isIVIncrement(BO)) {
     // We used to use a dominator tree here to allow multi-block optimization.
     // But that was problematic because:
     // 1. It could cause a perf regression by hoisting the math op into the
@@ -1295,9 +1317,16 @@
     //    This is because we recompute the DT on every change in the main CGP
     //    run-loop. The recomputing is probably unnecessary in many cases, so if
     //    that was fixed, using a DT here would be ok.
+    //
+    // There is one important particular case we still want to handle: if BO is
+    // the IV increment. Important properties that make it profitable:
+    // - We can speculate the IV increment anywhere in the loop (as long as the
+    //   indvar Phi is its only user);
+    // - Upon computing Cmp, we effectively compute something equivalent to the
+    //   IV increment (despite it looking different in the IR). So moving it up
+    //   to the cmp point does not really increase register pressure.
     return false;
   }
-
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   if (BO->getOpcode() == Instruction::Add &&
       IID == Intrinsic::usub_with_overflow) {
Index: llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
===================================================================
--- llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
+++ llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
@@ -89,15 +89,16 @@
 define i32 @test_02(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_02:
 ; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB2_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    testq %rsi, %rsi
-; CHECK-NEXT:    je LBB2_4
+; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    jb LBB2_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB2_1 Depth=1
 ; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    leaq -1(%rsi), %rsi
+; CHECK-NEXT:    movq %rax, %rsi
 ; CHECK-NEXT:    jne LBB2_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
@@ -132,15 +133,16 @@
 define i32 @test_03(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_03:
 ; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB3_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    testq %rsi, %rsi
-; CHECK-NEXT:    je LBB3_4
+; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    jb LBB3_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB3_1 Depth=1
 ; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    leaq -1(%rsi), %rsi
+; CHECK-NEXT:    movq %rax, %rsi
 ; CHECK-NEXT:    jne LBB3_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
Index: llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
===================================================================
--- llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -16,11 +16,11 @@
 ; GENERIC-NEXT:    movl (%rdx), %eax
 ; GENERIC-NEXT:    movl 4(%rdx), %ebx
 ; GENERIC-NEXT:    decl %ecx
-; GENERIC-NEXT:    leaq 20(%rdx), %r14
+; GENERIC-NEXT:    leaq 20(%rdx), %r11
 ; GENERIC-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; GENERIC-NEXT:    movq _Te1@{{.*}}(%rip), %r8
 ; GENERIC-NEXT:    movq _Te3@{{.*}}(%rip), %r10
-; GENERIC-NEXT:    movq %rcx, %r11
+; GENERIC-NEXT:    movq %rcx, %r14
 ; GENERIC-NEXT:    .p2align 4, 0x90
 ; GENERIC-NEXT:  LBB0_1: ## %bb
 ; GENERIC-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -32,30 +32,29 @@
 ; GENERIC-NEXT:    movzbl %bpl, %ebp
 ; GENERIC-NEXT:    movl (%r8,%rbp,4), %ebp
 ; GENERIC-NEXT:    xorl (%r9,%rax,4), %ebp
-; GENERIC-NEXT:    xorl -12(%r14), %ebp
+; GENERIC-NEXT:    xorl -12(%r11), %ebp
 ; GENERIC-NEXT:    shrl $24, %ebx
 ; GENERIC-NEXT:    movl (%r10,%rdi,4), %edi
 ; GENERIC-NEXT:    xorl (%r9,%rbx,4), %edi
-; GENERIC-NEXT:    xorl -8(%r14), %edi
+; GENERIC-NEXT:    xorl -8(%r11), %edi
 ; GENERIC-NEXT:    movl %ebp, %eax
 ; GENERIC-NEXT:    shrl $24, %eax
 ; GENERIC-NEXT:    movl (%r9,%rax,4), %eax
-; GENERIC-NEXT:    testq %r11, %r11
-; GENERIC-NEXT:    je LBB0_3
+; GENERIC-NEXT:    subq $1, %r14
+; GENERIC-NEXT:    jb LBB0_3
 ; GENERIC-NEXT:  ## %bb.2: ## %bb1
 ; GENERIC-NEXT:    ## in Loop: Header=BB0_1 Depth=1
 ; GENERIC-NEXT:    movl %edi, %ebx
 ; GENERIC-NEXT:    shrl $16, %ebx
 ; GENERIC-NEXT:    movzbl %bl, %ebx
 ; GENERIC-NEXT:    xorl (%r8,%rbx,4), %eax
-; GENERIC-NEXT:    xorl -4(%r14), %eax
+; GENERIC-NEXT:    xorl -4(%r11), %eax
 ; GENERIC-NEXT:    shrl $24, %edi
 ; GENERIC-NEXT:    movzbl %bpl, %ebx
 ; GENERIC-NEXT:    movl (%r10,%rbx,4), %ebx
 ; GENERIC-NEXT:    xorl (%r9,%rdi,4), %ebx
-; GENERIC-NEXT:    xorl (%r14), %ebx
-; GENERIC-NEXT:    decq %r11
-; GENERIC-NEXT:    addq $16, %r14
+; GENERIC-NEXT:    xorl (%r11), %ebx
+; GENERIC-NEXT:    addq $16, %r11
 ; GENERIC-NEXT:    jmp LBB0_1
 ; GENERIC-NEXT:  LBB0_3: ## %bb2
 ; GENERIC-NEXT:    shlq $4, %rcx
@@ -99,12 +98,12 @@
 ; ATOM-NEXT:    ## kill: def $ecx killed $ecx def $rcx
 ; ATOM-NEXT:    movl (%rdx), %r15d
 ; ATOM-NEXT:    movl 4(%rdx), %eax
-; ATOM-NEXT:    leaq 20(%rdx), %r14
+; ATOM-NEXT:    leaq 20(%rdx), %r11
 ; ATOM-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; ATOM-NEXT:    movq _Te1@{{.*}}(%rip), %r8
 ; ATOM-NEXT:    movq _Te3@{{.*}}(%rip), %r10
 ; ATOM-NEXT:    decl %ecx
-; ATOM-NEXT:    movq %rcx, %r11
+; ATOM-NEXT:    movq %rcx, %r14
 ; ATOM-NEXT:    .p2align 4, 0x90
 ; ATOM-NEXT:  LBB0_1: ## %bb
 ; ATOM-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -118,28 +117,27 @@
 ; ATOM-NEXT:    movzbl %r15b, %edi
 ; ATOM-NEXT:    xorl (%r9,%rbp,4), %ebx
 ; ATOM-NEXT:    movl (%r10,%rdi,4), %edi
-; ATOM-NEXT:    xorl -12(%r14), %ebx
+; ATOM-NEXT:    xorl -12(%r11), %ebx
 ; ATOM-NEXT:    xorl (%r9,%rax,4), %edi
 ; ATOM-NEXT:    movl %ebx, %eax
-; ATOM-NEXT:    xorl -8(%r14), %edi
+; ATOM-NEXT:    xorl -8(%r11), %edi
 ; ATOM-NEXT:    shrl $24, %eax
 ; ATOM-NEXT:    movl (%r9,%rax,4), %r15d
-; ATOM-NEXT:    testq %r11, %r11
+; ATOM-NEXT:    subq $1, %r14
 ; ATOM-NEXT:    movl %edi, %eax
-; ATOM-NEXT:    je LBB0_3
+; ATOM-NEXT:    jb LBB0_3
 ; ATOM-NEXT:  ## %bb.2: ## %bb1
 ; ATOM-NEXT:    ## in Loop: Header=BB0_1 Depth=1
 ; ATOM-NEXT:    shrl $16, %eax
 ; ATOM-NEXT:    shrl $24, %edi
-; ATOM-NEXT:    decq %r11
-; ATOM-NEXT:    movzbl %al, %ebp
+; ATOM-NEXT:    movzbl %al, %eax
+; ATOM-NEXT:    xorl (%r8,%rax,4), %r15d
 ; ATOM-NEXT:    movzbl %bl, %eax
 ; ATOM-NEXT:    movl (%r10,%rax,4), %eax
-; ATOM-NEXT:    xorl (%r8,%rbp,4), %r15d
+; ATOM-NEXT:    xorl -4(%r11), %r15d
 ; ATOM-NEXT:    xorl (%r9,%rdi,4), %eax
-; ATOM-NEXT:    xorl -4(%r14), %r15d
-; ATOM-NEXT:    xorl (%r14), %eax
-; ATOM-NEXT:    addq $16, %r14
+; ATOM-NEXT:    xorl (%r11), %eax
+; ATOM-NEXT:    addq $16, %r11
 ; ATOM-NEXT:    jmp LBB0_1
 ; ATOM-NEXT:  LBB0_3: ## %bb2
 ; ATOM-NEXT:    shrl $16, %eax
Index: llvm/test/CodeGen/X86/usub_inc_iv.ll
===================================================================
--- llvm/test/CodeGen/X86/usub_inc_iv.ll
+++ llvm/test/CodeGen/X86/usub_inc_iv.ll
@@ -97,13 +97,67 @@
 }
 
-; TODO: We can use trick with usub here.
 
 define i32 @test_02(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: @test_02(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[MATH:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[IV]], i64 1)
+; CHECK-NEXT:    [[MATH]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
+; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
+; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 -1
+; CHECK:       failure:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %scevgep = getelementptr i32, i32* %p, i64 -1
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
+  %cond_1 = icmp eq i64 %iv, 0
+  br i1 %cond_1, label %exit, label %backedge
+
+backedge:                                         ; preds = %loop
+  %scevgep1 = getelementptr i32, i32* %scevgep, i64 %iv
+  %loaded = load atomic i32, i32* %scevgep1 unordered, align 4
+  %cond_2 = icmp eq i32 %loaded, %x
+  %iv.next = add i64 %iv, -1
+  br i1 %cond_2, label %failure, label %loop
+
+exit:                                             ; preds = %loop
+  ret i32 -1
+
+failure:                                          ; preds = %backedge
+  unreachable
+}
+
+declare i1 @use(i64 %x)
+declare i1 @some_cond()
+
+; Make sure we do not move the increment below the point where it is used.
+define i32 @test_03_neg(i32* %p, i64 %len, i32 %x) {
+; CHECK-LABEL: @test_03_neg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
+; CHECK-NEXT:    [[COND_0:%.*]] = call i1 @use(i64 [[IV_NEXT]])
+; CHECK-NEXT:    br i1 [[COND_0]], label [[MIDDLE:%.*]], label [[FAILURE:%.*]]
+; CHECK:       middle:
 ; CHECK-NEXT:    [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
 ; CHECK-NEXT:    br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
@@ -114,8 +168,7 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
-; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 -1
 ; CHECK:       failure:
@@ -127,6 +180,11 @@
 
 loop:                                             ; preds = %backedge, %entry
   %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
+  %iv.next = add i64 %iv, -1
+  %cond_0 = call i1 @use(i64 %iv.next)
+  br i1 %cond_0, label %middle, label %failure
+
+middle:
   %cond_1 = icmp eq i64 %iv, 0
   br i1 %cond_1, label %exit, label %backedge
 
@@ -134,7 +192,6 @@
   %scevgep1 = getelementptr i32, i32* %scevgep, i64 %iv
   %loaded = load atomic i32, i32* %scevgep1 unordered, align 4
   %cond_2 = icmp eq i32 %loaded, %x
-  %iv.next = add i64 %iv, -1
   br i1 %cond_2, label %failure, label %loop
 
 exit:                                             ; preds = %loop
@@ -143,3 +200,61 @@
 failure:                                          ; preds = %backedge
   unreachable
 }
+
+define i32 @test_04_neg(i32* %p, i64 %len, i32 %x) {
+; CHECK-LABEL: @test_04_neg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[INNER:%.*]]
+; CHECK:       inner:
+; CHECK-NEXT:    [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
+; CHECK-NEXT:    br i1 [[COND_1]], label [[INNER_BACKEDGE:%.*]], label [[EXIT:%.*]]
+; CHECK:       inner_backedge:
+; CHECK-NEXT:    [[COND_INNER:%.*]] = call i1 @some_cond()
+; CHECK-NEXT:    br i1 [[COND_INNER]], label [[INNER]], label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[SUNKADDR]]
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
+; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
+; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
+; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 -1
+; CHECK:       failure:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %scevgep = getelementptr i32, i32* %p, i64 -1
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
+  br label %inner
+
+inner:
+  %cond_1 = icmp eq i64 %iv, 0
+  br i1 %cond_1, label %inner_backedge, label %exit
+
+inner_backedge:
+  %cond_inner = call i1 @some_cond()
+  br i1 %cond_inner, label %inner, label %backedge
+
+backedge:
+  %scevgep1 = getelementptr i32, i32* %scevgep, i64 %iv
+  %loaded = load atomic i32, i32* %scevgep1 unordered, align 4
+  %cond_2 = icmp eq i32 %loaded, %x
+  %iv.next = add i64 %iv, -1
+  br i1 %cond_2, label %failure, label %loop
+
+exit:
+  ret i32 -1
+
+failure:
+  unreachable
+}
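
Note (illustrative sketch, not part of the patch; the IR shape and value names are taken from test_02 above): the net effect on a decrementing loop is to fuse the zero test in the header with the decrement in the backedge into a single usub.with.overflow, which is legal here because the indvar Phi is the only user of the decrement and %iv - 1 overflows exactly when %iv == 0.

; Before: exit test in the header, decrement in the backedge.
loop:
  %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
  %cond_1 = icmp eq i64 %iv, 0
  br i1 %cond_1, label %exit, label %backedge
backedge:
  ...
  %iv.next = add i64 %iv, -1
  br i1 %cond_2, label %failure, label %loop

; After: one intrinsic call yields both the decremented IV and the exit
; condition, so no extra value is kept live across the loop.
loop:
  %iv = phi i64 [ %math, %backedge ], [ %len, %entry ]
  %t = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %iv, i64 1)
  %math = extractvalue { i64, i1 } %t, 0
  %ov = extractvalue { i64, i1 } %t, 1
  br i1 %ov, label %exit, label %backedge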