Index: llvm/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -5140,6 +5140,53 @@
   // Add the scale value.
   if (AddrMode.Scale) {
     Value *V = AddrMode.ScaledReg;
+    // Try to account for the following special case:
+    // 1. V is an induction variable;
+    // 2. We use it with a non-zero offset;
+    // 3. The IV's increment is available at the memory instruction.
+    //
+    // In this case, we may reuse the IV increment instead of the IV Phi to
+    // achieve the following advantages:
+    // 1. If the IV step matches the offset, the offset is no longer needed;
+    // 2. Even if they don't match, we shrink the overlap of the live ranges
+    //    of the IV and its increment, which potentially leads to better
+    //    register allocation.
+    if (AddrMode.BaseOffs) {
+      auto GetIVStep = [this](const Value *V)
+          -> Optional<std::pair<Instruction *, APInt>> {
+        auto *PN = dyn_cast<PHINode>(V);
+        if (!PN)
+          return None;
+        const Loop *L = LI->getLoopFor(PN->getParent());
+        if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
+          return None;
+        auto *IVInc = dyn_cast<Instruction>(
+            PN->getIncomingValueForBlock(L->getLoopLatch()));
+        if (!IVInc || !L->contains(IVInc->getParent()))
+          return None;
+        ConstantInt *Step;
+        if (match(IVInc,
+                  m_ExtractValue<0>(
+                      m_Intrinsic<Intrinsic::usub_with_overflow>(
+                          m_Specific(PN), m_ConstantInt(Step)))))
+          return std::make_pair(IVInc, -Step->getValue());
+        // TODO: One more case to consider here is an IV incremented by a
+        // plain add instruction. However, some other transform seems to undo
+        // this transform in that case, which makes compilation loop forever.
+        // Need to investigate why that happens and add support for add
+        // instructions here.
+        return None;
+      };
+      if (auto IVStep = GetIVStep(V)) {
+        Instruction *IVInc = IVStep->first;
+        APInt Step = IVStep->second;
+        if (getDT(*IVInc->getParent()->getParent())
+                .dominates(IVInc, MemoryInst)) {
+          V = IVInc;
+          AddrMode.BaseOffs -= Step.getLimitedValue() * AddrMode.Scale;
+        }
+      }
+    }
     if (V->getType() == IntPtrTy) {
       // done.
     } else {
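For context, the following hand-written IR sketch shows the rewrite on a decrementing loop. It mirrors the usub_inc_iv.ll test updated below, but the function and value names are hypothetical and the sketch is not part of the patch:

  declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)

  define i32 @sketch(i32* %p, i64 %len, i32 %x) {
  entry:
    br label %loop

  loop:
    %iv = phi i64 [ %len, %entry ], [ %iv.next, %backedge ]
    %res = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %iv, i64 1)
    %iv.next = extractvalue { i64, i1 } %res, 0
    %ov = extractvalue { i64, i1 } %res, 1
    br i1 %ov, label %exit, label %backedge

  backedge:
    ; The address is p + 4*%iv - 4. Since %iv.next = %iv - 1, this equals
    ; p + 4*%iv.next, and %iv.next (defined in the header) dominates the
    ; load, so the sunk address can scale %iv.next with no displacement.
    %addr = getelementptr i32, i32* %p, i64 %iv.next
    %loaded = load atomic i32, i32* %addr unordered, align 4
    %cond = icmp eq i32 %loaded, %x
    br i1 %cond, label %failure, label %loop

  exit:
    ret i32 -1

  failure:
    unreachable
  }

Before this change, the sunk address scaled %iv and carried a -4 displacement; reusing %iv.next folds the displacement away and ends the live range of %iv at the decrement, which is what removes the movq copies in the assembly test below.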
Index: llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
===================================================================
--- llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
+++ llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
@@ -5,16 +5,14 @@
 define i32 @test_01(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_01:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB0_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, (%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
@@ -47,16 +45,14 @@
 define i32 @test_01a(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_01a:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB1_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -28(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, -24(%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB1_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
Index: llvm/test/CodeGen/X86/usub_inc_iv.ll
===================================================================
--- llvm/test/CodeGen/X86/usub_inc_iv.ll
+++ llvm/test/CodeGen/X86/usub_inc_iv.ll
@@ -12,11 +12,10 @@
 ; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
-; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
@@ -60,10 +59,10 @@
 ; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
-; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -28
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -24
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
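The displacement changes in the checks above follow directly from the update rule in the C++ hunk; the arithmetic is worked out here by hand and is not itself part of the patch. GetIVStep returns Step = -1 for a usub-by-1 increment, and the sinking logic applies BaseOffs -= Step * Scale with Scale = 4:

  first hunk of each test file:   BaseOffs' = -4  - (-1) * 4 =   0  (offset folds away)
  second hunk of each test file:  BaseOffs' = -28 - (-1) * 4 = -24

The movq copies in the assembly vanish for the same reason: %rsi now holds the decremented value used both by the compare's address and by the next iteration, so no scratch register is needed to keep the old IV alive.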