Index: llvm/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3038,6 +3038,8 @@
   const TargetLowering &TLI;
   const TargetRegisterInfo &TRI;
   const DataLayout &DL;
+  const LoopInfo &LI;
+  const DominatorTree &DT;
 
   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
   /// the memory instruction that we're computing this address for.
@@ -3073,16 +3075,17 @@
 
   AddressingModeMatcher(
       SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
-      const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
+      const TargetRegisterInfo &TRI, const LoopInfo &LI,
+      const DominatorTree &DT, Type *AT, unsigned AS, Instruction *MI,
       ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
       InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
       std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
       bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
-        DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
-        MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
-        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
-        OptSize(OptSize), PSI(PSI), BFI(BFI) {
+        DL(MI->getModule()->getDataLayout()), LI(LI), DT(DT), AccessTy(AT),
+        AddrSpace(AS), MemoryInst(MI), AddrMode(AM),
+        InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT),
+        LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {
     IgnoreProfitability = false;
   }
 
@@ -3097,18 +3100,17 @@
   static ExtAddrMode
   Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
         SmallVectorImpl<Instruction *> &AddrModeInsts,
-        const TargetLowering &TLI, const TargetRegisterInfo &TRI,
-        const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
-        TypePromotionTransaction &TPT,
+        const TargetLowering &TLI, const LoopInfo &LI, const DominatorTree &DT,
+        const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts,
+        InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
         std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
         bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
     ExtAddrMode Result;
 
-    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
-                                         MemoryInst, Result, InsertedInsts,
-                                         PromotedInsts, TPT, LargeOffsetGEP,
-                                         OptSize, PSI, BFI)
-                       .matchAddr(V, 0);
+    bool Success = AddressingModeMatcher(
+        AddrModeInsts, TLI, TRI, LI, DT, AccessTy, AS, MemoryInst, Result,
+        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
+        BFI).matchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
     return Result;
   }
@@ -3820,9 +3822,69 @@
       AddrMode = TestAddrMode;
       return true;
     }
+    // Restore status quo.
+    TestAddrMode = AddrMode;
   }
 
-  // Otherwise, not (x+c)*scale, just return what we have.
+  auto GetIVStep = [this](const Value * V)
+      ->Optional<std::pair<Instruction *, APInt> > {
+    auto *PN = dyn_cast<PHINode>(V);
+    if (!PN)
+      return None;
+    const Loop *L = LI.getLoopFor(PN->getParent());
+    if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
+      return None;
+    auto *IVInc =
+        dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+    if (!IVInc || !L->contains(IVInc->getParent()))
+      return None;
+    ConstantInt *Step;
+    if (match(IVInc,
+              m_ExtractValue<0>(m_Intrinsic<Intrinsic::usub_with_overflow>(
+                  m_Specific(PN), m_ConstantInt(Step)))))
+      return std::make_pair(IVInc, -Step->getValue());
+    // TODO: One more case that needs to be considered here is AddInstr.
+    // However, it seems that some other transform may undo this
+    // transform in this case, and it will lead to infinite compilation.
+    // Need to investigate why it happens and add the support for
+    // AddInstr here.
+    return None;
+  };
+
+  // Try to account for the following special case:
+  // 1. ScaleReg is an inductive variable;
+  // 2. We use it with non-zero offset;
+  // 3. IV's increment is available at the point of memory instruction.
+  //
+  // In this case, we may reuse the IV increment instead of the IV Phi to
+  // achieve the following advantages:
+  // 1. If IV step matches the offset, we will have no need in the offset;
+  // 2. Even if they don't match, we will reduce the overlap of living IV
+  //    and IV increment, that will potentially lead to better register
+  //    assignment.
+  if (AddrMode.BaseOffs) {
+    if (auto IVStep = GetIVStep(ScaleReg)) {
+      Instruction *IVInc = IVStep->first;
+      APInt Step = IVStep->second;
+      if (DT.dominates(IVInc, MemoryInst)) {
+        TestAddrMode.InBounds = false;
+        TestAddrMode.ScaledReg = IVInc;
+        TestAddrMode.BaseOffs -= Step.getLimitedValue() * AddrMode.Scale;
+        // If this addressing mode is legal, commit it and remember that we
+        // folded
+        // this instruction.
+        if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
+          AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
+          AddrMode = TestAddrMode;
+          return true;
+        }
+        // Restore status quo.
+        TestAddrMode = AddrMode;
+      }
+    }
+  }
+
+  // Otherwise, just return what we have.
   return true;
 }
 
@@ -4912,9 +4974,10 @@
                                                                       0);
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
-    AddressingModeMatcher Matcher(
-        MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
-        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
+    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, DT,
+                                  AddressAccessTy, AS, MemoryInst, Result,
+                                  InsertedInsts, PromotedInsts, TPT,
+                                  LargeOffsetGEP, OptSize, PSI, BFI);
     Matcher.IgnoreProfitability = true;
     bool Success = Matcher.matchAddr(Address, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -5017,9 +5080,10 @@
     AddrModeInsts.clear();
     std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
                                                                       0);
+    Function *F = MemoryInst->getParent()->getParent();
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
-        V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
-        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
+        V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDT(*F),
+        *TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
         BFI.get());
 
     GetElementPtrInst *GEP = LargeOffsetGEP.first;
Index: llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
===================================================================
--- llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
+++ llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
@@ -5,16 +5,14 @@
 define i32 @test_01(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_01:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB0_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, (%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
@@ -47,16 +45,14 @@
 define i32 @test_01a(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_01a:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB1_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -28(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, -24(%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB1_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
@@ -89,16 +85,14 @@
 define i32 @test_02(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_02:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB2_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB2_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB2_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, (%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB2_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
@@ -133,16 +127,14 @@
 define i32 @test_03(i32* %p, i64 %len, i32 %x) {
 ; CHECK-LABEL: test_03:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB3_1: ## %loop
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $1, %rax
+; CHECK-NEXT:    subq $1, %rsi
 ; CHECK-NEXT:    jb LBB3_4
 ; CHECK-NEXT:  ## %bb.2: ## %backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT:    cmpl %edx, -4(%rdi,%rsi,4)
-; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    cmpl %edx, (%rdi,%rsi,4)
 ; CHECK-NEXT:    jne LBB3_1
 ; CHECK-NEXT:  ## %bb.3: ## %failure
 ; CHECK-NEXT:    ud2
Index: llvm/test/CodeGen/X86/usub_inc_iv.ll
===================================================================
--- llvm/test/CodeGen/X86/usub_inc_iv.ll
+++ llvm/test/CodeGen/X86/usub_inc_iv.ll
@@ -12,11 +12,10 @@
 ; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
-; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
@@ -60,10 +59,10 @@
 ; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
-; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -28
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -24
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
@@ -108,11 +107,10 @@
 ; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
 ; CHECK:       backedge:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
-; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
 ; CHECK-NEXT:    [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
 ; CHECK-NEXT:    [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]