Index: llvm/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5140,6 +5140,53 @@
   // Add the scale value.
   if (AddrMode.Scale) {
     Value *V = AddrMode.ScaledReg;
+    // Try to account for the following special case:
+    // 1. V is an induction variable;
+    // 2. We use it with a non-zero offset;
+    // 3. The IV's increment is available at the memory instruction.
+    //
+    // In this case, we may reuse the IV increment instead of the IV Phi to
+    // achieve the following advantages:
+    // 1. If the IV step matches the offset, the offset is no longer needed;
+    // 2. Even if they don't match, we shrink the overlap of the live ranges
+    //    of the IV and its increment, which potentially leads to better
+    //    register allocation.
+    if (AddrMode.BaseOffs) {
+      auto GetIVStep = [this](const Value *V)
+          -> Optional<std::pair<Instruction *, APInt>> {
+        auto *PN = dyn_cast<PHINode>(V);
+        if (!PN)
+          return None;
+        const Loop *L = LI->getLoopFor(PN->getParent());
+        if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
+          return None;
+        auto *IVInc = dyn_cast<Instruction>(
+            PN->getIncomingValueForBlock(L->getLoopLatch()));
+        if (!IVInc || !L->contains(IVInc->getParent()))
+          return None;
+        ConstantInt *Step;
+        if (match(IVInc,
+                  m_ExtractValue<0>(
+                      m_Intrinsic<Intrinsic::usub_with_overflow>(
+                          m_Specific(PN), m_ConstantInt(Step)))))
+          return std::make_pair(IVInc, -Step->getValue());
+        // TODO: One more case to consider here is an IV incremented by a
+        // plain add instruction. However, some other transform seems to undo
+        // this transform in that case, which makes compilation loop forever.
+        // Need to investigate why that happens and add support for add
+        // instructions here.
+        return None;
+      };
+      if (auto IVStep = GetIVStep(V)) {
+        Instruction *IVInc = IVStep->first;
+        APInt Step = IVStep->second;
+        if (getDT(*IVInc->getParent()->getParent())
+                .dominates(IVInc, MemoryInst)) {
+          V = IVInc;
+          AddrMode.BaseOffs -= Step.getLimitedValue() * AddrMode.Scale;
+        }
+      }
+    }
     if (V->getType() == IntPtrTy) {
       // done.
     } else {
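For context, the following hand-written IR sketch shows the rewrite on a decrementing loop. It mirrors the usub_inc_iv.ll test updated below, but the function and value names are hypothetical and the sketch is not part of the patch:

  declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)

  define i32 @sketch(i32* %p, i64 %len, i32 %x) {
  entry:
    br label %loop

  loop:
    %iv = phi i64 [ %len, %entry ], [ %iv.next, %backedge ]
    %res = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %iv, i64 1)
    %iv.next = extractvalue { i64, i1 } %res, 0
    %ov = extractvalue { i64, i1 } %res, 1
    br i1 %ov, label %exit, label %backedge

  backedge:
    ; The address is p + 4*%iv - 4. Since %iv.next = %iv - 1, this equals
    ; p + 4*%iv.next, and %iv.next (defined in the header) dominates the
    ; load, so the sunk address can scale %iv.next with no displacement.
    %addr = getelementptr i32, i32* %p, i64 %iv.next
    %loaded = load atomic i32, i32* %addr unordered, align 4
    %cond = icmp eq i32 %loaded, %x
    br i1 %cond, label %failure, label %loop

  exit:
    ret i32 -1

  failure:
    unreachable
  }

Before this change, the sunk address scaled %iv and carried a -4 displacement; reusing %iv.next folds the displacement away and ends the live range of %iv at the decrement, which is what removes the movq copies in the assembly test below.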
Index: llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
===================================================================
--- llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
+++ llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
@@ -5,16 +5,14 @@
 define i32 @test_01(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_01:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB0_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, (%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
@@ -47,16 +45,14 @@
 define i32 @test_01a(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_01a:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB1_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -28(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, -24(%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB1_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
Index: llvm/test/CodeGen/X86/usub_inc_iv.ll
===================================================================
--- llvm/test/CodeGen/X86/usub_inc_iv.ll
+++ llvm/test/CodeGen/X86/usub_inc_iv.ll
@@ -12,11 +12,10 @@
 ; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
-; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
@@ -60,10 +59,10 @@
 ; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
-; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -28
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -24
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
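The displacement changes in the checks above follow directly from the update rule in the C++ hunk; the arithmetic is worked out here by hand and is not itself part of the patch. GetIVStep returns Step = -1 for a usub-by-1 increment, and the sinking logic applies BaseOffs -= Step * Scale with Scale = 4:

  first hunk of each test file:   BaseOffs' = -4  - (-1) * 4 =   0  (offset folds away)
  second hunk of each test file:  BaseOffs' = -28 - (-1) * 4 = -24

The movq copies in the assembly vanish for the same reason: %rsi now holds the decremented value used both by the compare's address and by the next iteration, so no scratch register is needed to keep the old IV alive.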