diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3735,6 +3735,14 @@
       const MemOp &Op, unsigned DstAS, unsigned SrcAS,
       const AttributeList &FuncAttributes) const;
 
+  /// Determine the optimal base offset so that the remaining offsets fit into
+  /// the addressing mode after CodeGenPrepare::splitLargeGEPOffsets. Returns
+  /// (BaseOffset, End), meaning BaseOffset should be used as the base offset
+  /// for the GEPs in the range [0, End).
+  virtual std::pair<int64_t, size_t> selectBaseOffsetForPrefix(
+      ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+          LargeOffsetGEPs) const;
+
   /// Check to see if the specified operand of the specified instruction is a
   /// constant integer. If so, check to see if there are any bits set in the
   /// constant that are not demanded. If so, shrink the constant and return
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -6095,91 +6095,91 @@
     // Skip if all the GEPs have the same offsets.
     if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
       continue;
-    GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
-    int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
-    Value *NewBaseGEP = nullptr;
-
-    auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
-    while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
-      GetElementPtrInst *GEP = LargeOffsetGEP->first;
-      int64_t Offset = LargeOffsetGEP->second;
-      if (Offset != BaseOffset) {
-        TargetLowering::AddrMode AddrMode;
-        AddrMode.HasBaseReg = true;
-        AddrMode.BaseOffs = Offset - BaseOffset;
-        // The result type of the GEP might not be the type of the memory
-        // access.
-        if (!TLI->isLegalAddressingMode(*DL, AddrMode,
-                                        GEP->getResultElementType(),
-                                        GEP->getAddressSpace())) {
-          // We need to create a new base if the offset to the current base is
-          // too large to fit into the addressing mode. So, a very large struct
-          // may be split into several parts.
-          BaseGEP = GEP;
-          BaseOffset = Offset;
-          NewBaseGEP = nullptr;
+    while (!LargeOffsetGEPs.empty()) {
+      auto [BaseOffset, End] = TLI->selectBaseOffsetForPrefix(LargeOffsetGEPs);
+      Value *NewBaseGEP = nullptr;
+
+      auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
+      for (size_t Idx = 0; Idx < End; ++Idx) {
+        GetElementPtrInst *GEP = LargeOffsetGEP->first;
+        int64_t Offset = LargeOffsetGEP->second;
+        if (Offset != BaseOffset) {
+          TargetLowering::AddrMode AddrMode;
+          AddrMode.HasBaseReg = true;
+          AddrMode.BaseOffs = Offset - BaseOffset;
+          // The result type of the GEP might not be the type of the memory
+          // access.
+          if (!TLI->isLegalAddressingMode(*DL, AddrMode,
+                                          GEP->getResultElementType(),
+                                          GEP->getAddressSpace())) {
+            // We need to create a new base if the offset to the current base is
+            // too large to fit into the addressing mode. So, a very large
+            // struct may be split into several parts.
+            BaseOffset = Offset;
+            NewBaseGEP = nullptr;
+          }
         }
-      }
-      // Generate a new GEP to replace the current one.
-      LLVMContext &Ctx = GEP->getContext();
-      Type *PtrIdxTy = DL->getIndexType(GEP->getType());
-      Type *I8PtrTy =
-          Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
-      Type *I8Ty = Type::getInt8Ty(Ctx);
-
-      if (!NewBaseGEP) {
-        // Create a new base if we don't have one yet. Find the insertion
-        // pointer for the new base first.
-        BasicBlock::iterator NewBaseInsertPt;
-        BasicBlock *NewBaseInsertBB;
-        if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
-          // If the base of the struct is an instruction, the new base will be
-          // inserted close to it.
-          NewBaseInsertBB = BaseI->getParent();
-          if (isa<PHINode>(BaseI))
+        // Generate a new GEP to replace the current one.
+        LLVMContext &Ctx = GEP->getContext();
+        Type *PtrIdxTy = DL->getIndexType(GEP->getType());
+        Type *I8PtrTy =
+            Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
+        Type *I8Ty = Type::getInt8Ty(Ctx);
+
+        if (!NewBaseGEP) {
+          // Create a new base if we don't have one yet. Find the insertion
+          // pointer for the new base first.
+          BasicBlock::iterator NewBaseInsertPt;
+          BasicBlock *NewBaseInsertBB;
+          if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
+            // If the base of the struct is an instruction, the new base will be
+            // inserted close to it.
+            NewBaseInsertBB = BaseI->getParent();
+            if (isa<PHINode>(BaseI))
+              NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+            else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
+              NewBaseInsertBB = SplitEdge(
+                  NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
+              NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+            } else
+              NewBaseInsertPt = std::next(BaseI->getIterator());
+          } else {
+            // If the current base is an argument or global value, the new base
+            // will be inserted to the entry block.
+            NewBaseInsertBB = &GEP->getFunction()->getEntryBlock();
             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
-            NewBaseInsertBB =
-                SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
-            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          } else
-            NewBaseInsertPt = std::next(BaseI->getIterator());
-        } else {
-          // If the current base is an argument or global value, the new base
-          // will be inserted to the entry block.
-          NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
-          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+          }
+          IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
+          // Create a new base.
+          Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
+          NewBaseGEP = OldBase;
+          if (NewBaseGEP->getType() != I8PtrTy)
+            NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
+          NewBaseGEP =
+              NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
+          NewGEPBases.insert(NewBaseGEP);
         }
-        IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
-        // Create a new base.
-        Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
-        NewBaseGEP = OldBase;
-        if (NewBaseGEP->getType() != I8PtrTy)
-          NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
-        NewBaseGEP =
-            NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
-        NewGEPBases.insert(NewBaseGEP);
-      }
-      IRBuilder<> Builder(GEP);
-      Value *NewGEP = NewBaseGEP;
-      if (Offset == BaseOffset) {
-        if (GEP->getType() != I8PtrTy)
-          NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
-      } else {
-        // Calculate the new offset for the new GEP.
-        Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset);
-        NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
+        IRBuilder<> Builder(GEP);
+        Value *NewGEP = NewBaseGEP;
+        if (Offset == BaseOffset) {
+          if (GEP->getType() != I8PtrTy)
+            NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+        } else {
+          // Calculate the new offset for the new GEP.
+          Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset);
+          NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
 
-        if (GEP->getType() != I8PtrTy)
-          NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+          if (GEP->getType() != I8PtrTy)
+            NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+        }
+        replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);
+        LargeOffsetGEPID.erase(GEP);
+        LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
+        GEP->eraseFromParent();
+        Changed = true;
       }
-      replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);
-      LargeOffsetGEPID.erase(GEP);
-      LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
-      GEP->eraseFromParent();
-      Changed = true;
     }
   }
   return Changed;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -284,6 +284,14 @@
   return true;
 }
 
+// By default, use the smallest offset as the base offset for all of the GEPs.
+std::pair<int64_t, size_t> TargetLowering::selectBaseOffsetForPrefix(
+    ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+        LargeOffsetGEPs) const {
+  return std::make_pair(LargeOffsetGEPs.begin()->second,
+                        LargeOffsetGEPs.size());
+}
+
 /// Soften the operands of a comparison. This code is shared among BR_CC,
 /// SELECT_CC, and SETCC handlers.
 void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -765,6 +765,10 @@
                                    MachineBasicBlock::instr_iterator &MBBI,
                                    const TargetInstrInfo *TII) const override;
 
+  std::pair<int64_t, size_t> selectBaseOffsetForPrefix(
+      ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+          LargeOffsetGEPs) const override;
+
   /// RISCVCCAssignFn - This target-specific function extends the default
   /// CCValAssign with additional information used to lower RISC-V calling
   /// conventions.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16959,6 +16959,103 @@
   return getTargetMMOFlags(NodeX) == getTargetMMOFlags(NodeY);
 }
 
+static Type *inferAccessTypeOfGEP(GetElementPtrInst *GEP) {
+  for (auto *User : GEP->users()) {
+    if (auto *UserInst = dyn_cast<Instruction>(User))
+      if (Type *AccessType = UserInst->getAccessType())
+        return AccessType;
+  }
+  // fallback
+  return GEP->getResultElementType();
+}
+
+// Select the optimal base offset to make the offsets of the GEPs fit into
+// simm12.
+std::pair<int64_t, size_t> RISCVTargetLowering::selectBaseOffsetForPrefix(
+    ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+        LargeOffsetGEPs) const {
+  int64_t Alignment = 1;
+  int64_t Range = 4096; // simm12
+  // Infer the type of the memory ops from the users of the first GEP (falling
+  // back to its result element type) so that we can use compressed load/store
+  // instructions.
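+  // The compressed forms c.lw/c.sw (and c.ld/c.sd on RV64) only encode a
+  // zero-extended 5-bit offset scaled by the access size, so keeping all of
+  // the offsets inside a small, suitably aligned window keeps them
+  // compressible.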
+  if (Subtarget.hasStdExtCOrZca()) {
+    Type *EleTy = inferAccessTypeOfGEP(LargeOffsetGEPs.begin()->first);
+    if (Subtarget.hasStdExtCOrZca() && EleTy->isIntegerTy(32)) {
+      Alignment = 4;
+      Range = 128; // simm5 scaled by 4
+    }
+    if (Subtarget.hasStdExtCOrZca() && EleTy->isIntegerTy(64)) {
+      Alignment = 8;
+      Range = 256; // simm5 scaled by 8
+    }
+    if (Subtarget.hasStdExtCOrZca() && EleTy->isPointerTy()) {
+      Alignment = Subtarget.getXLen() / 8;
+      Range = 32 * Alignment; // simm5 scaled by XLEN in bytes
+    }
+    if ((Subtarget.hasStdExtC() || Subtarget.hasStdExtZcf()) &&
+        EleTy->isFloatTy()) {
+      Alignment = 4;
+      Range = 128; // simm5 scaled by 4
+    }
+    if ((Subtarget.hasStdExtC() || Subtarget.hasStdExtZcd()) &&
+        EleTy->isDoubleTy()) {
+      Alignment = 8;
+      Range = 256; // simm5 scaled by 8
+    }
+  }
+  int64_t RangeMin = -Range / 2, RangeMax = Range / 2 - 1;
+
+  auto Iter = LargeOffsetGEPs.begin();
+  int64_t Begin = Iter->second;
+  int64_t End = Iter->second;
+  // BaseOffsetMax + 1 - BaseOffsetMin >= Alignment ==>
+  // (Begin - RangeMin) + 1 - (End - RangeMax) >= Alignment ==>
+  // End <= Begin + Range - Alignment
+  while (Iter != LargeOffsetGEPs.end() &&
+         Iter->second <= Begin + Range - Alignment) {
+    End = Iter->second;
+    ++Iter;
+  }
+  // Select the optimal base offset which satisfies
+  // `BaseOffset + RangeMin <= Begin <= End <= BaseOffset + RangeMax`.
+  int64_t BaseOffsetMin = End - RangeMax, BaseOffsetMax = Begin - RangeMin;
+  // Make sure that at least one offset which satisfies the alignment
+  // requirement can be selected.
+  assert(BaseOffsetMax + 1 - BaseOffsetMin >= Alignment);
+  size_t Count = static_cast<size_t>(Iter - LargeOffsetGEPs.begin());
+  while ((BaseOffsetMin - Begin) % Alignment != 0)
+    ++BaseOffsetMin;
+  while ((BaseOffsetMax - Begin) % Alignment != 0)
+    --BaseOffsetMax;
+  // BaseOffset = BaseOffsetMin + k * Alignment
+  // Choose the optimal offset in the following order:
+  // 1. In the range [-2048, 2048): a single addi can obtain the address.
+  // 2. Divisible by 4096: lui + add can obtain the address.
+  // 3. Otherwise, choose BaseOffsetMin or BaseOffsetMax, whichever is closer
+  //    to zero.
+  int64_t BaseOffset;
+  if (BaseOffsetMin <= 0 && BaseOffsetMax >= 0) {
+    BaseOffset = BaseOffsetMax % Alignment;
+  } else {
+    BaseOffset = std::abs(BaseOffsetMin) < std::abs(BaseOffsetMax)
+                     ? BaseOffsetMin
+                     : BaseOffsetMax;
+  }
+  if (BaseOffset < -2048 || BaseOffset > 2047) {
+    // Prefer a base offset that is a multiple of 4096, since lui (+ add) can
+    // materialize it without a trailing addi. A multiple of 4096 satisfies
+    // the alignment congruence with Begin only when Begin itself is
+    // Alignment-aligned.
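+    // For example, byte offsets 4194305..4194308 can all use the base
+    // 4194304 = 1024 * 4096, which a single lui materializes.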
+ if (Begin % Alignment == 0) { + int64_t Offset = BaseOffsetMax / 4096 * 4096; + while (Offset > BaseOffsetMin) { + if (BaseOffsetMin <= Offset && Offset <= BaseOffsetMax) { + BaseOffset = Offset; + break; + } + Offset -= 4096; + } + } + } + + return std::make_pair(BaseOffset, Count); +} + namespace llvm::RISCVVIntrinsicsTable { #define GET_RISCVVIntrinsicsTable_IMPL diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll @@ -506,11 +506,11 @@ ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma ; CHECK-NEXT: .LBB9_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: addi a4, a0, 32 -; CHECK-NEXT: addi a5, a1, -128 -; CHECK-NEXT: vlse32.v v8, (a5), a3 +; CHECK-NEXT: addi a4, a1, -128 +; CHECK-NEXT: vlse32.v v8, (a4), a3 ; CHECK-NEXT: vlse32.v v9, (a1), a3 ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: addi a4, a0, 32 ; CHECK-NEXT: vle32.v v11, (a4) ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vadd.vv v9, v11, v9 @@ -655,11 +655,11 @@ ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; V-NEXT: .LBB11_1: # %bb2 ; V-NEXT: # =>This Inner Loop Header: Depth=1 -; V-NEXT: addi a4, a1, 80 ; V-NEXT: vlse64.v v8, (a1), a3 +; V-NEXT: addi a4, a1, 80 ; V-NEXT: vlse64.v v9, (a4), a3 -; V-NEXT: addi a4, a0, 16 ; V-NEXT: vse64.v v8, (a0) +; V-NEXT: addi a4, a0, 16 ; V-NEXT: vse64.v v9, (a4) ; V-NEXT: addi a2, a2, -4 ; V-NEXT: addi a0, a0, 32 @@ -732,8 +732,8 @@ ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; V-NEXT: .LBB12_1: # %bb2 ; V-NEXT: # =>This Inner Loop Header: Depth=1 -; V-NEXT: addi a4, a1, 16 ; V-NEXT: vle64.v v8, (a1) +; V-NEXT: addi a4, a1, 16 ; V-NEXT: vle64.v v9, (a4) ; V-NEXT: addi a4, a0, 80 ; V-NEXT: vsse64.v v8, (a0), a3 diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32,RV32I ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64,RV64I +; RUN: llc -mtriple=riscv32 -mattr=+c -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32,RV32C +; RUN: llc -mtriple=riscv64 -mattr=+c -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64,RV64C ; Check that memory accesses to array elements with large offsets have those ; offsets split into a base offset, plus a smaller offset that is folded into @@ -13,31 +17,59 @@ ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a2, 20 -; RV32I-NEXT: addi a2, a2, -1920 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: li a2, 2 -; RV32I-NEXT: sw a2, 0(a0) +; RV32I-NEXT: sw a2, -1920(a0) ; RV32I-NEXT: li a3, 1 -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a3, 0(a1) -; RV32I-NEXT: sw a2, 4(a1) +; RV32I-NEXT: sw a3, -1916(a0) +; RV32I-NEXT: sw a3, -1920(a1) +; RV32I-NEXT: sw a2, -1916(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test1: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: lui a2, 20 -; RV64I-NEXT: addiw a2, a2, -1920 ; 
RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: li a2, 2 -; RV64I-NEXT: sw a2, 0(a0) +; RV64I-NEXT: sw a2, -1920(a0) ; RV64I-NEXT: li a3, 1 -; RV64I-NEXT: sw a3, 4(a0) -; RV64I-NEXT: sw a3, 0(a1) -; RV64I-NEXT: sw a2, 4(a1) +; RV64I-NEXT: sw a3, -1916(a0) +; RV64I-NEXT: sw a3, -1920(a1) +; RV64I-NEXT: sw a2, -1916(a1) ; RV64I-NEXT: ret +; +; RV32C-LABEL: test1: +; RV32C: # %bb.0: # %entry +; RV32C-NEXT: lw a0, 0(a0) +; RV32C-NEXT: lui a2, 20 +; RV32C-NEXT: addi a2, a2, -1976 +; RV32C-NEXT: add a1, a1, a2 +; RV32C-NEXT: add a0, a0, a2 +; RV32C-NEXT: li a2, 2 +; RV32C-NEXT: sw a2, 56(a0) +; RV32C-NEXT: li a3, 1 +; RV32C-NEXT: sw a3, 60(a0) +; RV32C-NEXT: sw a3, 56(a1) +; RV32C-NEXT: sw a2, 60(a1) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test1: +; RV64C: # %bb.0: # %entry +; RV64C-NEXT: ld a0, 0(a0) +; RV64C-NEXT: lui a2, 20 +; RV64C-NEXT: addiw a2, a2, -1976 +; RV64C-NEXT: add a1, a1, a2 +; RV64C-NEXT: add a0, a0, a2 +; RV64C-NEXT: li a2, 2 +; RV64C-NEXT: sw a2, 56(a0) +; RV64C-NEXT: li a3, 1 +; RV64C-NEXT: sw a3, 60(a0) +; RV64C-NEXT: sw a3, 56(a1) +; RV64C-NEXT: sw a2, 60(a1) +; RV64C-NEXT: ret entry: %s = load ptr, ptr %sp %gep0 = getelementptr [65536 x i32], ptr %s, i64 0, i32 20000 @@ -55,31 +87,29 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV32I-LABEL: test2: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a3, 0(a0) +; RV32I-NEXT: li a0, 0 ; RV32I-NEXT: lui a4, 20 -; RV32I-NEXT: addi a4, a4, -1920 ; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: add a0, a0, a4 +; RV32I-NEXT: add a3, a3, a4 ; RV32I-NEXT: blez a2, .LBB1_2 ; RV32I-NEXT: .LBB1_1: # %while_body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: addi a4, a3, 1 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a4, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: addi a4, a0, 1 +; RV32I-NEXT: sw a4, -1920(a3) +; RV32I-NEXT: sw a0, -1916(a3) +; RV32I-NEXT: sw a4, -1920(a1) +; RV32I-NEXT: sw a0, -1916(a1) +; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: blt a4, a2, .LBB1_1 ; RV32I-NEXT: .LBB1_2: # %while_end ; RV32I-NEXT: ret ; ; RV64I-LABEL: test2: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: lui a4, 20 -; RV64I-NEXT: addiw a4, a4, -1920 ; RV64I-NEXT: add a1, a1, a4 ; RV64I-NEXT: add a0, a0, a4 ; RV64I-NEXT: sext.w a2, a2 @@ -87,14 +117,57 @@ ; RV64I-NEXT: .LBB1_1: # %while_body ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64I-NEXT: addiw a4, a3, 1 -; RV64I-NEXT: sw a4, 0(a0) -; RV64I-NEXT: sw a3, 4(a0) -; RV64I-NEXT: sw a4, 0(a1) -; RV64I-NEXT: sw a3, 4(a1) +; RV64I-NEXT: sw a4, -1920(a0) +; RV64I-NEXT: sw a3, -1916(a0) +; RV64I-NEXT: sw a4, -1920(a1) +; RV64I-NEXT: sw a3, -1916(a1) ; RV64I-NEXT: mv a3, a4 ; RV64I-NEXT: blt a4, a2, .LBB1_1 ; RV64I-NEXT: .LBB1_2: # %while_end ; RV64I-NEXT: ret +; +; RV32C-LABEL: test2: +; RV32C: # %bb.0: # %entry +; RV32C-NEXT: li a3, 0 +; RV32C-NEXT: lw a0, 0(a0) +; RV32C-NEXT: lui a4, 20 +; RV32C-NEXT: addi a4, a4, -1976 +; RV32C-NEXT: add a1, a1, a4 +; RV32C-NEXT: add a0, a0, a4 +; RV32C-NEXT: blez a2, .LBB1_2 +; RV32C-NEXT: .LBB1_1: # %while_body +; RV32C-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32C-NEXT: addi a4, a3, 1 +; RV32C-NEXT: sw a4, 56(a0) +; RV32C-NEXT: sw a3, 60(a0) +; RV32C-NEXT: sw a4, 56(a1) +; RV32C-NEXT: sw a3, 60(a1) +; RV32C-NEXT: mv a3, a4 +; RV32C-NEXT: blt a4, a2, .LBB1_1 +; RV32C-NEXT: .LBB1_2: # %while_end +; RV32C-NEXT: ret +; +; RV64C-LABEL: test2: +; RV64C: # 
%bb.0: # %entry +; RV64C-NEXT: li a3, 0 +; RV64C-NEXT: ld a0, 0(a0) +; RV64C-NEXT: lui a4, 20 +; RV64C-NEXT: addiw a4, a4, -1976 +; RV64C-NEXT: add a1, a1, a4 +; RV64C-NEXT: add a0, a0, a4 +; RV64C-NEXT: sext.w a2, a2 +; RV64C-NEXT: blez a2, .LBB1_2 +; RV64C-NEXT: .LBB1_1: # %while_body +; RV64C-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64C-NEXT: addiw a4, a3, 1 +; RV64C-NEXT: sw a4, 56(a0) +; RV64C-NEXT: sw a3, 60(a0) +; RV64C-NEXT: sw a4, 56(a1) +; RV64C-NEXT: sw a3, 60(a1) +; RV64C-NEXT: mv a3, a4 +; RV64C-NEXT: blt a4, a2, .LBB1_1 +; RV64C-NEXT: .LBB1_2: # %while_end +; RV64C-NEXT: ret entry: %s = load ptr, ptr %sp br label %while_cond @@ -122,27 +195,27 @@ ; instructions. Make sure we use an offset and common base for each of the ; stores. define void @test3(ptr %t) { -; RV32I-LABEL: test3: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a1, 20 -; RV32I-NEXT: addi a1, a1, -1920 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: li a1, 2 -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: li a1, 3 -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: ret +; RV32-LABEL: test3: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, 20 +; RV32-NEXT: addi a1, a1, -1920 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 2 +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: li a1, 3 +; RV32-NEXT: sw a1, 8(a0) +; RV32-NEXT: ret ; -; RV64I-LABEL: test3: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: lui a1, 20 -; RV64I-NEXT: addiw a1, a1, -1920 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: li a1, 2 -; RV64I-NEXT: sw a1, 4(a0) -; RV64I-NEXT: li a1, 3 -; RV64I-NEXT: sw a1, 8(a0) -; RV64I-NEXT: ret +; RV64-LABEL: test3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, 20 +; RV64-NEXT: addiw a1, a1, -1920 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 2 +; RV64-NEXT: sw a1, 4(a0) +; RV64-NEXT: li a1, 3 +; RV64-NEXT: sw a1, 8(a0) +; RV64-NEXT: ret entry: %splitgep = getelementptr i8, ptr %t, i64 80000 %0 = getelementptr i8, ptr %splitgep, i64 4 @@ -154,34 +227,151 @@ ; Test from PR62734. define void @test4(ptr %dest) { -; RV32I-LABEL: test4: +; CHECK-LABEL: test4: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, 4 +; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: sb a1, 2044(a0) +; CHECK-NEXT: sb a1, 2045(a0) +; CHECK-NEXT: sb a1, 2046(a0) +; CHECK-NEXT: sb a1, 2047(a0) +; CHECK-NEXT: ret + %p1 = getelementptr i8, ptr %dest, i32 2048 + store i8 1, ptr %p1 + %p2 = getelementptr i8, ptr %dest, i32 2049 + store i8 1, ptr %p2 + %p3 = getelementptr i8, ptr %dest, i32 2050 + store i8 1, ptr %p3 + %p4 = getelementptr i8, ptr %dest, i32 2051 + store i8 1, ptr %p4 + ret void +} + +; Select aligned offset when we can benefit from compressed load/store instructions. 
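+; With C/Zca, c.sw only encodes offsets that are multiples of 4 in [0, 124],
+; so the base below is chosen to place the four stores at offsets 48..60.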
+define void @test5(ptr %dest) { +; RV32I-LABEL: test5: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, a0, 1965 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: sw a1, 2035(a0) +; RV32I-NEXT: sw a1, 2039(a0) +; RV32I-NEXT: sw a1, 2043(a0) +; RV32I-NEXT: sw a1, 2047(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: test5: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, a0, 1965 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: sw a1, 2035(a0) +; RV64I-NEXT: sw a1, 2039(a0) +; RV64I-NEXT: sw a1, 2043(a0) +; RV64I-NEXT: sw a1, 2047(a0) +; RV64I-NEXT: ret +; +; RV32C-LABEL: test5: +; RV32C: # %bb.0: +; RV32C-NEXT: addi a0, a0, 2047 +; RV32C-NEXT: addi a0, a0, 1905 +; RV32C-NEXT: li a1, 1 +; RV32C-NEXT: sw a1, 48(a0) +; RV32C-NEXT: sw a1, 52(a0) +; RV32C-NEXT: sw a1, 56(a0) +; RV32C-NEXT: sw a1, 60(a0) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test5: +; RV64C: # %bb.0: +; RV64C-NEXT: addi a0, a0, 2047 +; RV64C-NEXT: addi a0, a0, 1905 +; RV64C-NEXT: li a1, 1 +; RV64C-NEXT: sw a1, 48(a0) +; RV64C-NEXT: sw a1, 52(a0) +; RV64C-NEXT: sw a1, 56(a0) +; RV64C-NEXT: sw a1, 60(a0) +; RV64C-NEXT: ret + %p1 = getelementptr i32, ptr %dest, i32 1000 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 1001 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 1002 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 1003 + store i32 1, ptr %p4 + ret void +} + +; FIXME: We can reuse a1 to emit compressed load/store instructions. +define void @test6(ptr %dest) { +; RV32I-LABEL: test6: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a0, a0, 2047 -; RV32I-NEXT: addi a1, a0, 1 +; RV32I-NEXT: addi a1, a0, 5 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: sb a2, 1(a0) -; RV32I-NEXT: sb a2, 1(a1) -; RV32I-NEXT: sb a2, 2(a1) -; RV32I-NEXT: sb a2, 3(a1) +; RV32I-NEXT: sw a2, 2040(a0) +; RV32I-NEXT: sw a2, 2044(a0) +; RV32I-NEXT: sw a2, 2043(a1) +; RV32I-NEXT: sw a2, 2047(a1) ; RV32I-NEXT: ret ; -; RV64I-LABEL: test4: +; RV64I-LABEL: test6: ; RV64I: # %bb.0: -; RV64I-NEXT: addi a0, a0, 2047 -; RV64I-NEXT: addi a1, a0, 1 +; RV64I-NEXT: addi a1, a0, 5 ; RV64I-NEXT: li a2, 1 -; RV64I-NEXT: sb a2, 1(a0) -; RV64I-NEXT: sb a2, 1(a1) -; RV64I-NEXT: sb a2, 2(a1) -; RV64I-NEXT: sb a2, 3(a1) +; RV64I-NEXT: sw a2, 2040(a0) +; RV64I-NEXT: sw a2, 2044(a0) +; RV64I-NEXT: sw a2, 2043(a1) +; RV64I-NEXT: sw a2, 2047(a1) ; RV64I-NEXT: ret - %p1 = getelementptr i8, ptr %dest, i32 2048 +; +; RV32C-LABEL: test6: +; RV32C: # %bb.0: +; RV32C-NEXT: addi a1, a0, 1992 +; RV32C-NEXT: li a2, 1 +; RV32C-NEXT: sw a2, 2040(a0) +; RV32C-NEXT: sw a2, 2044(a0) +; RV32C-NEXT: sw a2, 56(a1) +; RV32C-NEXT: sw a2, 60(a1) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test6: +; RV64C: # %bb.0: +; RV64C-NEXT: addi a1, a0, 1992 +; RV64C-NEXT: li a2, 1 +; RV64C-NEXT: sw a2, 2040(a0) +; RV64C-NEXT: sw a2, 2044(a0) +; RV64C-NEXT: sw a2, 56(a1) +; RV64C-NEXT: sw a2, 60(a1) +; RV64C-NEXT: ret + %p1 = getelementptr i32, ptr %dest, i32 510 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 511 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 512 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 513 + store i32 1, ptr %p4 + ret void +} + +; Select offset divisible by 4096 to use lui + add to obtain the address. 
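+; The byte offsets 4194305..4194308 share the base 4194304 = 1024 * 4096,
+; which is materialized by lui a1, 1024 with no trailing addi.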
+define void @test7(ptr %dest) { +; CHECK-LABEL: test7: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1024 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: sb a1, 1(a0) +; CHECK-NEXT: sb a1, 2(a0) +; CHECK-NEXT: sb a1, 3(a0) +; CHECK-NEXT: sb a1, 4(a0) +; CHECK-NEXT: ret + %p1 = getelementptr i8, ptr %dest, i32 4194305 store i8 1, ptr %p1 - %p2 = getelementptr i8, ptr %dest, i32 2049 + %p2 = getelementptr i8, ptr %dest, i32 4194306 store i8 1, ptr %p2 - %p3 = getelementptr i8, ptr %dest, i32 2050 + %p3 = getelementptr i8, ptr %dest, i32 4194307 store i8 1, ptr %p3 - %p4 = getelementptr i8, ptr %dest, i32 2051 + %p4 = getelementptr i8, ptr %dest, i32 4194308 store i8 1, ptr %p4 ret void }
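For illustration, a hand-written sketch (value names and the i64 index type are illustrative; riscv64 with +c assumed) of the IR that splitLargeGEPOffsets is expected to produce for test5 above: the hook sees the byte offsets 4000..4012, clamps the base offset to [4012 - 63, 4000 + 64] = [3949, 4064], rounds that to the 4-byte congruence class of 4000 to get [3952, 4064], and picks 3952 (the addi a0, a0, 2047 / addi a0, a0, 1905 pair in the checks), leaving every store at a c.sw-encodable offset:

  define void @test5(ptr %dest) {
    %splitgep = getelementptr i8, ptr %dest, i64 3952
    %p1 = getelementptr i8, ptr %splitgep, i64 48
    store i32 1, ptr %p1
    %p2 = getelementptr i8, ptr %splitgep, i64 52
    store i32 1, ptr %p2
    %p3 = getelementptr i8, ptr %splitgep, i64 56
    store i32 1, ptr %p3
    %p4 = getelementptr i8, ptr %splitgep, i64 60
    store i32 1, ptr %p4
    ret void
  }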