Index: llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -128,6 +128,7 @@
 #include "RISCVGenDAGISel.inc"
 
 private:
+  bool doPeepholeLoadStoreADDI(SDNode *Node);
   bool doPeepholeSExtW(SDNode *Node);
   bool doPeepholeMaskedRVV(SDNode *Node);
 };
Index: llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -146,6 +146,7 @@
       continue;
 
     MadeChange |= doPeepholeSExtW(N);
+    MadeChange |= doPeepholeLoadStoreADDI(N);
     MadeChange |= doPeepholeMaskedRVV(N);
   }
 
@@ -155,6 +156,48 @@
     CurDAG->RemoveDeadNodes();
 }
 
+// Returns true if N is a MachineSDNode that has a register base and a
+// constant-zero memory offset. The indices of the base pointer and offset
+// operands are returned in BaseOpIdx and OffsetOpIdx.
+static bool hasConstantZeroMemOffset(SDNode *N, unsigned &BaseOpIdx,
+                                     unsigned &OffsetOpIdx) {
+  if (!N->isMachineOpcode())
+    return false;
+
+  switch (N->getMachineOpcode()) {
+  case RISCV::LB:
+  case RISCV::LH:
+  case RISCV::LW:
+  case RISCV::LBU:
+  case RISCV::LHU:
+  case RISCV::LWU:
+  case RISCV::LD:
+  case RISCV::FLH:
+  case RISCV::FLW:
+  case RISCV::FLD:
+    BaseOpIdx = 0;
+    OffsetOpIdx = 1;
+    break;
+  case RISCV::SB:
+  case RISCV::SH:
+  case RISCV::SW:
+  case RISCV::SD:
+  case RISCV::FSH:
+  case RISCV::FSW:
+  case RISCV::FSD:
+    BaseOpIdx = 1;
+    OffsetOpIdx = 2;
+    break;
+  default:
+    return false;
+  }
+
+  if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
+    return false;
+
+  return (N->getConstantOperandVal(OffsetOpIdx) == 0);
+}
+
 static SDNode *selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL,
                             const MVT VT, RISCVMatInt::InstSeq &Seq) {
   SDNode *Result = nullptr;
@@ -1821,8 +1864,8 @@
   return true;
 }
 
-// Is this ADD instruction only used as the base pointer of scalar loads and
-// stores?
+// Is this ADD/ADD_LO instruction only used as the base pointer of scalar
+// loads and stores?
 static bool isWorthFoldingAdd(SDValue Add) {
   for (auto Use : Add->uses()) {
     if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
@@ -1853,8 +1896,12 @@
   SDLoc DL(Addr);
   MVT VT = Addr.getSimpleValueType();
 
-  if (Addr.getOpcode() == RISCVISD::ADD_LO) {
-    Base = Addr.getOperand(0);
+  // Select the Base and Offset from the ADD_LO, except in the case that the
+  // ADD_LO is used by a non-memory instruction (e.g. as a base to an add) and
+  // the compressed extension is present. In that case, leaving it separate
+  // may increase the chance of compressing the load/store.
+  if (Addr.getOpcode() == RISCVISD::ADD_LO && (!Subtarget->hasStdExtC() ||
+      isWorthFoldingAdd(Addr))) {
+    Base = Addr.getOperand(0);
     Offset = Addr.getOperand(1);
     return true;
   }
@@ -2336,6 +2383,46 @@
   return false;
 }
 
+// SelectAddrRegImm won't merge an ADD_LO into a memory operation if it has
+// uses that aren't scalar loads and stores. This will turn out to be a bad
+// decision if all those other uses end up being merged into memory
+// operations. This peephole folds the resulting ADDI back in when that is
+// the case.
+bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
+  unsigned OffsetOpIdx, BaseOpIdx;
+  if (!hasConstantZeroMemOffset(N, BaseOpIdx, OffsetOpIdx))
+    return false;
+
+  SDValue Base = N->getOperand(BaseOpIdx);
+  if (!Base.isMachineOpcode())
+    return false;
+  if (Base.getMachineOpcode() != RISCV::ADDI)
+    return false;
+  if (!isa<ConstantSDNode>(Base.getOperand(1)))
+    return false;
+  for (auto Use : Base->uses()) {
+    unsigned Dummy1, Dummy2;
+    if (!hasConstantZeroMemOffset(Use, Dummy1, Dummy2))
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+  LLVM_DEBUG(Base->dump(CurDAG));
+  LLVM_DEBUG(dbgs() << "\nN: ");
+  LLVM_DEBUG(N->dump(CurDAG));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BaseOpIdx == 0) { // Load
+    N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), Base.getOperand(1),
+                                   N->getOperand(2));
+  } else { // Store
+    N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+                                   Base.getOperand(1), N->getOperand(3));
+  }
+
+  return true;
+}
+
 // Try to remove sext.w if the input is a W instruction or can be made into
 // a W instruction cheaply.
 bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
Index: llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
===================================================================
--- llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefix=RV32 %s
+; RUN:   | FileCheck -check-prefixes=RV32,RV32I %s
+; RUN: llc -mtriple=riscv32 -mattr=+c -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV32,RV32C %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefix=RV64 %s
+; RUN:   | FileCheck -check-prefixes=RV64,RV64I %s
+; RUN: llc -mtriple=riscv64 -mattr=+c -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV64,RV64C %s
 
 ; We can often fold an ADDI into the offset of load/store instructions:
 ;   (load (addi base, off1), off2) -> (load base, off1+off2)
@@ -37,13 +41,21 @@
 }
 
 define dso_local i64 @load_g_1() nounwind {
-; RV32-LABEL: load_g_1:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a1, %hi(g_1)
-; RV32-NEXT:    lw a0, %lo(g_1)(a1)
-; RV32-NEXT:    addi a1, a1, %lo(g_1)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_g_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, %hi(g_1)
+; RV32I-NEXT:    lw a0, %lo(g_1)(a1)
+; RV32I-NEXT:    addi a1, a1, %lo(g_1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_g_1:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_1)
+; RV32C-NEXT:    addi a1, a0, %lo(g_1)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_g_1:
 ; RV64:       # %bb.0: # %entry
@@ -56,13 +68,21 @@
 }
 
 define dso_local i64 @load_g_2() nounwind {
-; RV32-LABEL: load_g_2:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a1, %hi(g_2)
-; RV32-NEXT:    lw a0, %lo(g_2)(a1)
-; RV32-NEXT:    addi a1, a1, %lo(g_2)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_g_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, %hi(g_2)
+; RV32I-NEXT:    lw a0, %lo(g_2)(a1)
+; RV32I-NEXT:    addi a1, a1, %lo(g_2)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_g_2:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_2)
+; RV32C-NEXT:    addi a1, a0, %lo(g_2)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_g_2:
 ; RV64:       # %bb.0: # %entry
@@ -75,13 +95,21 @@
 }
 
 define dso_local i64 @load_g_4() nounwind {
-; RV32-LABEL: load_g_4:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a1, %hi(g_4)
-; RV32-NEXT:    lw a0, %lo(g_4)(a1)
-; RV32-NEXT:    addi a1, a1, %lo(g_4)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_g_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, %hi(g_4)
+; RV32I-NEXT:    lw a0, %lo(g_4)(a1)
+; RV32I-NEXT:    addi a1, a1, %lo(g_4)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_g_4:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_4)
+; RV32C-NEXT:    addi a1, a0, %lo(g_4)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_g_4:
 ; RV64:       # %bb.0: # %entry
@@ -130,13 +158,21 @@
 }
 
 define dso_local void @store_g_4() nounwind {
-; RV32-LABEL: store_g_4:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a0, %hi(g_4)
-; RV32-NEXT:    sw zero, %lo(g_4)(a0)
-; RV32-NEXT:    addi a0, a0, %lo(g_4)
-; RV32-NEXT:    sw zero, 4(a0)
-; RV32-NEXT:    ret
+; RV32I-LABEL: store_g_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %hi(g_4)
+; RV32I-NEXT:    sw zero, %lo(g_4)(a0)
+; RV32I-NEXT:    addi a0, a0, %lo(g_4)
+; RV32I-NEXT:    sw zero, 4(a0)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: store_g_4:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_4)
+; RV32C-NEXT:    addi a0, a0, %lo(g_4)
+; RV32C-NEXT:    sw zero, 4(a0)
+; RV32C-NEXT:    sw zero, 0(a0)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: store_g_4:
 ; RV64:       # %bb.0: # %entry
@@ -244,23 +280,41 @@
 @ga32 = dso_local global [4 x i32] zeroinitializer, align 4
 
 define dso_local i32 @load_ga32_multi() nounwind {
-; RV32-LABEL: load_ga32_multi:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a0, %hi(ga32)
-; RV32-NEXT:    lw a1, %lo(ga32)(a0)
-; RV32-NEXT:    addi a0, a0, %lo(ga32)
-; RV32-NEXT:    lw a0, 4(a0)
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_ga32_multi:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %hi(ga32)
+; RV32I-NEXT:    lw a1, %lo(ga32)(a0)
+; RV32I-NEXT:    addi a0, a0, %lo(ga32)
+; RV32I-NEXT:    lw a0, 4(a0)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: load_ga32_multi:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    lui a0, %hi(ga32)
-; RV64-NEXT:    lw a1, %lo(ga32)(a0)
-; RV64-NEXT:    addi a0, a0, %lo(ga32)
-; RV64-NEXT:    lw a0, 4(a0)
-; RV64-NEXT:    addw a0, a1, a0
-; RV64-NEXT:    ret
+; RV32C-LABEL: load_ga32_multi:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(ga32)
+; RV32C-NEXT:    addi a0, a0, %lo(ga32)
+; RV32C-NEXT:    lw a1, 0(a0)
+; RV32C-NEXT:    lw a0, 4(a0)
+; RV32C-NEXT:    add a0, a0, a1
+; RV32C-NEXT:    ret
+;
+; RV64I-LABEL: load_ga32_multi:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lui a0, %hi(ga32)
+; RV64I-NEXT:    lw a1, %lo(ga32)(a0)
+; RV64I-NEXT:    addi a0, a0, %lo(ga32)
+; RV64I-NEXT:    lw a0, 4(a0)
+; RV64I-NEXT:    addw a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64C-LABEL: load_ga32_multi:
+; RV64C:       # %bb.0: # %entry
+; RV64C-NEXT:    lui a0, %hi(ga32)
+; RV64C-NEXT:    addi a0, a0, %lo(ga32)
+; RV64C-NEXT:    lw a1, 0(a0)
+; RV64C-NEXT:    lw a0, 4(a0)
+; RV64C-NEXT:    addw a0, a0, a1
+; RV64C-NEXT:    ret
 entry:
   %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @ga32, i32 0, i32 0)
   %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @ga32, i32 0, i32 1)
@@ -274,14 +328,23 @@
 @tl_4 = dso_local thread_local global i64 0, align 8
 @tl_8 = dso_local thread_local global i64 0, align 8
 
 define dso_local i64 @load_tl_4() nounwind {
-; RV32-LABEL: load_tl_4:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a0, %tprel_hi(tl_4)
-; RV32-NEXT:    add a1, a0, tp, %tprel_add(tl_4)
-; RV32-NEXT:    lw a0, %tprel_lo(tl_4)(a1)
-; RV32-NEXT:    addi a1, a1, %tprel_lo(tl_4)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_tl_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %tprel_hi(tl_4)
+; RV32I-NEXT:    add a1, a0, tp, %tprel_add(tl_4)
+; RV32I-NEXT:    lw a0, %tprel_lo(tl_4)(a1)
+; RV32I-NEXT:    addi a1, a1, %tprel_lo(tl_4)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_tl_4:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %tprel_hi(tl_4)
+; RV32C-NEXT:    add a0, a0, tp, %tprel_add(tl_4)
+; RV32C-NEXT:    addi a1, a0, %tprel_lo(tl_4)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_tl_4:
 ; RV64:       # %bb.0: # %entry
Index: llvm/test/CodeGen/RISCV/global-merge-minsize.ll
===================================================================
--- llvm/test/CodeGen/RISCV/global-merge-minsize.ll
+++ llvm/test/CodeGen/RISCV/global-merge-minsize.ll
@@ -25,10 +25,6 @@
   ret void
 }
 
-; TODO: It would be better for code size to alter the first store below by
-; first fully materialising .L_MergedGlobals in a1 and then storing to it with
-; a 0 offset.
-
 define void @f2(i32 %a) nounwind minsize optsize {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
Index: llvm/test/CodeGen/RISCV/global-merge-offset.ll
===================================================================
--- llvm/test/CodeGen/RISCV/global-merge-offset.ll
+++ llvm/test/CodeGen/RISCV/global-merge-offset.ll
@@ -15,9 +15,6 @@
 @ga2 = dso_local global [ArrSize x i32] zeroinitializer, align 4
 @gi = dso_local global i32 0, align 4
 
-; TODO: It would be better for codesize if the final store below was
-; `sw a0, 0(a2)`.
-
 define void @f1(i32 %a) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
Index: llvm/test/CodeGen/RISCV/global-merge.ll
===================================================================
--- llvm/test/CodeGen/RISCV/global-merge.ll
+++ llvm/test/CodeGen/RISCV/global-merge.ll
@@ -10,10 +10,6 @@
 @eg1 = dso_local global i32 0, align 4
 @eg2 = dso_local global i32 0, align 4
 
-; TODO: It would be better for code size to alter the first store below by
-; first fully materialising .L_MergedGlobals in a1 and then storing to it with
-; a 0 offset.
-
 define void @f1(i32 %a) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
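
A possible follow-up test, not included in this patch: a store-side counterpart to load_ga32_multi would exercise the new ADD_LO handling and the ADDI peephole when every user of the address is a store. The sketch below is hypothetical; the @sa32 global and @store_sa32_multi name are invented for illustration, and real CHECK lines would need to be generated with utils/update_llc_test_checks.py rather than written by hand.

; Hypothetical extra case in the style of fold-addi-loadstore.ll. With
; -mattr=+c the two stores are expected to share one materialised address
; (lui+addi) and use 0/4 offsets, mirroring the store_g_4 checks above.
@sa32 = dso_local global [4 x i32] zeroinitializer, align 4

define dso_local void @store_sa32_multi(i32 %v) nounwind {
entry:
  store i32 %v, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @sa32, i32 0, i32 0)
  store i32 %v, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @sa32, i32 0, i32 1)
  ret void
}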