diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -153,6 +153,40 @@
   CurDAG->RemoveDeadNodes();
 }
 
+// Returns true if N is a MachineSDNode that has a reg and simm12 memory
+// operand. The indices of the base pointer and offset are returned in
+// BaseOpIdx and OffsetOpIdx.
+static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx,
+                         unsigned &OffsetOpIdx) {
+  switch (N->getMachineOpcode()) {
+  case RISCV::LB:
+  case RISCV::LH:
+  case RISCV::LW:
+  case RISCV::LBU:
+  case RISCV::LHU:
+  case RISCV::LWU:
+  case RISCV::LD:
+  case RISCV::FLH:
+  case RISCV::FLW:
+  case RISCV::FLD:
+    BaseOpIdx = 0;
+    OffsetOpIdx = 1;
+    return true;
+  case RISCV::SB:
+  case RISCV::SH:
+  case RISCV::SW:
+  case RISCV::SD:
+  case RISCV::FSH:
+  case RISCV::FSW:
+  case RISCV::FSD:
+    BaseOpIdx = 1;
+    OffsetOpIdx = 2;
+    return true;
+  }
+
+  return false;
+}
+
 static SDNode *selectImmWithConstantPool(SelectionDAG *CurDAG, const SDLoc &DL,
                                          const MVT VT, int64_t Imm,
                                          const RISCVSubtarget &Subtarget) {
@@ -664,6 +698,70 @@
     ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
     return;
   }
+  case ISD::ADD: {
+    // Try to select ADD + immediate used as memory addresses to
+    // (ADDI (ADD X, Imm-Lo12), Lo12) if it will allow the ADDI to be removed by
+    // doPeepholeLoadStoreADDI.
+
+    // The RHS should be an immediate.
+    auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+    if (!N1C)
+      break;
+
+    int64_t Offset = N1C->getSExtValue();
+    int64_t Lo12 = SignExtend64<12>(Offset);
+
+    // Don't do this if the lower 12 bits are 0 or we could use ADDI directly.
+    if (Lo12 == 0 || isInt<12>(Offset))
+      break;
+
+    // Don't do this if we can use a pair of ADDIs.
+    if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2))
+      break;
+
+    bool AllPointerUses = true;
+    for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) {
+      SDNode *User = *UI;
+
+      // Is this user a reg+simm12 memory instruction that uses this ADD as
+      // its base pointer?
+      unsigned BaseOpIdx, OffsetOpIdx;
+      if (!User->isMachineOpcode() ||
+          !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) ||
+          UI.getOperandNo() != BaseOpIdx) {
+        AllPointerUses = false;
+        break;
+      }
+
+      // If the memory instruction already has an offset, make sure the combined
+      // offset is foldable.
+      int64_t MemOffs =
+          cast<ConstantSDNode>(User->getOperand(OffsetOpIdx))->getSExtValue();
+      MemOffs += Lo12;
+      if (!isInt<12>(MemOffs)) {
+        AllPointerUses = false;
+        break;
+      }
+    }
+
+    if (!AllPointerUses)
+      break;
+
+    Offset -= Lo12;
+    // Restore sign bits for RV32.
+    if (!Subtarget->is64Bit())
+      Offset = SignExtend64<32>(Offset);
+
+    // Emit (ADDI (ADD X, Hi), Lo).
+    SDNode *Imm = selectImm(CurDAG, DL, VT, Offset, *Subtarget);
+    SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT,
+                                         Node->getOperand(0), SDValue(Imm, 0));
+    SDNode *ADDI =
+        CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0),
+                               CurDAG->getTargetConstant(Lo12, DL, VT));
+    ReplaceNode(Node, ADDI);
+    return;
+  }
   case ISD::SRL: {
     // Optimize (srl (and X, C2), C) ->
     //          (srli (slli X, (XLen-C3), (XLen-C3) + C)
@@ -2097,37 +2195,9 @@
 // -> (store val, (add base, src), off1+off2)
 // This is possible when off1+off2 fits a 12-bit immediate.
 bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
-  int OffsetOpIdx;
-  int BaseOpIdx;
-
-  // Only attempt this optimisation for I-type loads and S-type stores.
-  switch (N->getMachineOpcode()) {
-  default:
+  unsigned OffsetOpIdx, BaseOpIdx;
+  if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx))
     return false;
-  case RISCV::LB:
-  case RISCV::LH:
-  case RISCV::LW:
-  case RISCV::LBU:
-  case RISCV::LHU:
-  case RISCV::LWU:
-  case RISCV::LD:
-  case RISCV::FLH:
-  case RISCV::FLW:
-  case RISCV::FLD:
-    BaseOpIdx = 0;
-    OffsetOpIdx = 1;
-    break;
-  case RISCV::SB:
-  case RISCV::SH:
-  case RISCV::SW:
-  case RISCV::SD:
-  case RISCV::FSH:
-  case RISCV::FSW:
-  case RISCV::FSD:
-    BaseOpIdx = 1;
-    OffsetOpIdx = 2;
-    break;
-  }
 
   if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
     return false;
@@ -2137,54 +2207,8 @@
   if (!Base.isMachineOpcode())
     return false;
 
-  // There is a ADD between ADDI and load/store. We can only fold ADDI that
-  // do not have a FrameIndex operand.
-  SDValue Add;
-  unsigned AddBaseIdx;
-  if (Base.getMachineOpcode() == RISCV::ADD && Base.hasOneUse()) {
-    Add = Base;
-    SDValue Op0 = Base.getOperand(0);
-    SDValue Op1 = Base.getOperand(1);
-    if (Op0.isMachineOpcode() && Op0.getMachineOpcode() == RISCV::ADDI &&
-        !isa<FrameIndexSDNode>(Op0.getOperand(0)) &&
-        isa<ConstantSDNode>(Op0.getOperand(1))) {
-      AddBaseIdx = 1;
-      Base = Op0;
-    } else if (Op1.isMachineOpcode() && Op1.getMachineOpcode() == RISCV::ADDI &&
-               !isa<FrameIndexSDNode>(Op1.getOperand(0)) &&
-               isa<ConstantSDNode>(Op1.getOperand(1))) {
-      AddBaseIdx = 0;
-      Base = Op1;
-    } else if (Op1.isMachineOpcode() &&
-               Op1.getMachineOpcode() == RISCV::ADDIW &&
-               isa<ConstantSDNode>(Op1.getOperand(1)) &&
-               Op1.getOperand(0).isMachineOpcode() &&
-               Op1.getOperand(0).getMachineOpcode() == RISCV::LUI) {
-      // We found an LUI+ADDIW constant materialization. We might be able to
-      // fold the ADDIW offset if it could be treated as ADDI.
-      // Emulate the constant materialization to see if the result would be
-      // a simm32 if ADDI was used instead of ADDIW.
-
-      // First the LUI.
-      uint64_t Imm = Op1.getOperand(0).getConstantOperandVal(0);
-      Imm <<= 12;
-      Imm = SignExtend64<32>(Imm);
-
-      // Then the ADDI.
-      uint64_t LoImm = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
-      Imm += LoImm;
-
-      // If the result isn't a simm32, we can't do the optimization.
-      if (!isInt<32>(Imm))
-        return false;
-
-      AddBaseIdx = 0;
-      Base = Op1;
-    } else
-      return false;
-  } else if (Base.getMachineOpcode() == RISCV::ADDI) {
-    // If the base is an ADDI, we can merge it in to the load/store.
-  } else
+  // If the base is an ADDI, we can merge it into the load/store.
+  if (Base.getMachineOpcode() != RISCV::ADDI)
     return false;
 
   SDValue ImmOperand = Base.getOperand(1);
@@ -2231,26 +2255,13 @@
   LLVM_DEBUG(N->dump(CurDAG));
   LLVM_DEBUG(dbgs() << "\n");
 
-  if (Add)
-    Add = SDValue(CurDAG->UpdateNodeOperands(Add.getNode(),
-                                             Add.getOperand(AddBaseIdx),
-                                             Base.getOperand(0)),
-                  0);
-
   // Modify the offset operand of the load/store.
   if (BaseOpIdx == 0) { // Load
-    if (Add)
-      N = CurDAG->UpdateNodeOperands(N, Add, ImmOperand, N->getOperand(2));
-    else
-      N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
-                                     N->getOperand(2));
+    N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+                                   N->getOperand(2));
   } else { // Store
-    if (Add)
-      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Add, ImmOperand,
-                                     N->getOperand(3));
-    else
-      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
-                                     ImmOperand, N->getOperand(3));
+    N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+                                   ImmOperand, N->getOperand(3));
   }
 
   return true;
diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll
--- a/llvm/test/CodeGen/RISCV/mem.ll
+++ b/llvm/test/CodeGen/RISCV/mem.ll
@@ -227,10 +227,9 @@
 ; RV32I-LABEL: lw_sw_far_local:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 4
-; RV32I-NEXT:    addi a2, a2, -4
 ; RV32I-NEXT:    add a2, a0, a2
-; RV32I-NEXT:    lw a0, 0(a2)
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    lw a0, -4(a2)
+; RV32I-NEXT:    sw a1, -4(a2)
 ; RV32I-NEXT:    ret
   %1 = getelementptr inbounds i32, i32* %a, i64 4095
   %2 = load volatile i32, i32* %1
@@ -266,10 +265,9 @@
 ; RV32I-LABEL: lw_sw_really_far_local:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    addi a2, a2, -2048
 ; RV32I-NEXT:    add a2, a0, a2
-; RV32I-NEXT:    lw a0, 0(a2)
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    lw a0, -2048(a2)
+; RV32I-NEXT:    sw a1, -2048(a2)
 ; RV32I-NEXT:    ret
   %1 = getelementptr inbounds i32, i32* %a, i32 536870400
   %2 = load volatile i32, i32* %1
diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll
--- a/llvm/test/CodeGen/RISCV/mem64.ll
+++ b/llvm/test/CodeGen/RISCV/mem64.ll
@@ -257,10 +257,9 @@
 ; RV64I-LABEL: lw_sw_far_local:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
-; RV64I-NEXT:    addiw a2, a2, -8
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    ld a0, 0(a2)
-; RV64I-NEXT:    sd a1, 0(a2)
+; RV64I-NEXT:    ld a0, -8(a2)
+; RV64I-NEXT:    sd a1, -8(a2)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 4095
   %2 = load volatile i64, i64* %1
@@ -273,10 +272,10 @@
 define i64 @lw_really_far_local(i64* %a) {
 ; RV64I-LABEL: lw_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, 524288
-; RV64I-NEXT:    addiw a1, a1, -2048
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    slli a1, a1, 31
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ld a0, -2048(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   %2 = load volatile i64, i64* %1
@@ -288,10 +287,10 @@
 define void @st_really_far_local(i64* %a, i64 %b) {
 ; RV64I-LABEL: st_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    slli a2, a2, 31
 ; RV64I-NEXT:    add a0, a0, a2
-; RV64I-NEXT:    sd a1, 0(a0)
+; RV64I-NEXT:    sd a1, -2048(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   store i64 %b, i64* %1
@@ -303,11 +302,11 @@
 define i64 @lw_sw_really_far_local(i64* %a, i64 %b) {
 ; RV64I-LABEL: lw_sw_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    slli a2, a2, 31
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    ld a0, 0(a2)
-; RV64I-NEXT:    sd a1, 0(a2)
+; RV64I-NEXT:    ld a0, -2048(a2)
+; RV64I-NEXT:    sd a1, -2048(a2)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   %2 = load volatile i64, i64* %1
diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -13,30 +13,28 @@
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    lui a2, 20
-; RV32I-NEXT:    addi a2, a2, -1920
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    add a0, a0, a2
 ; RV32I-NEXT:    li a2, 2
-; RV32I-NEXT:    sw a2, 0(a0)
+; RV32I-NEXT:    sw a2, -1920(a0)
 ; RV32I-NEXT:    li a3, 1
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a3, 0(a1)
-; RV32I-NEXT:    sw a2, 4(a1)
+; RV32I-NEXT:    sw a3, -1916(a0)
+; RV32I-NEXT:    sw a3, -1920(a1)
+; RV32I-NEXT:    sw a2, -1916(a1)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test1:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    lui a2, 20
-; RV64I-NEXT:    addiw a2, a2, -1920
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    li a2, 2
-; RV64I-NEXT:    sw a2, 0(a0)
+; RV64I-NEXT:    sw a2, -1920(a0)
 ; RV64I-NEXT:    li a3, 1
-; RV64I-NEXT:    sw a3, 4(a0)
-; RV64I-NEXT:    sw a3, 0(a1)
-; RV64I-NEXT:    sw a2, 4(a1)
+; RV64I-NEXT:    sw a3, -1916(a0)
+; RV64I-NEXT:    sw a3, -1920(a1)
+; RV64I-NEXT:    sw a2, -1916(a1)
 ; RV64I-NEXT:    ret
 entry:
   %s = load [65536 x i32]*, [65536 x i32]** %sp
@@ -119,28 +117,27 @@
 }
 
 ; GEPs have been manually split so the base GEP does not get used by any memory
-; instructions. Make sure we use a small offset in each of the stores.
+; instructions. Make sure we use an offset and common base for each of the
+; stores.
 define void @test3([65536 x i32]* %t) {
 ; RV32I-LABEL: test3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lui a1, 20
-; RV32I-NEXT:    addi a1, a1, -1920
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a1, -1916(a0)
 ; RV32I-NEXT:    li a1, 3
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a1, -1912(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test3:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lui a1, 20
-; RV64I-NEXT:    addiw a1, a1, -1920
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    sw a1, 4(a0)
+; RV64I-NEXT:    sw a1, -1916(a0)
 ; RV64I-NEXT:    li a1, 3
-; RV64I-NEXT:    sw a1, 8(a0)
+; RV64I-NEXT:    sw a1, -1912(a0)
 ; RV64I-NEXT:    ret
 entry:
   %0 = bitcast [65536 x i32]* %t to i8*
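
To make the early exits in the new ISD::ADD case concrete, here is a minimal standalone sketch (plain C++, no LLVM dependencies; lo12() and isInt12() are hypothetical stand-ins for SignExtend64<12> and isInt<12> from llvm/Support/MathExtras.h) that classifies an offset the same way. The offsets 16380 and 2147481600 come from the lw_sw_far_local and lw_sw_really_far_local (RV32) tests above; the other values are purely illustrative.

#include <cstdint>
#include <cstdio>

// Sign-extend the low 12 bits of X, like SignExtend64<12>(X).
static int64_t lo12(int64_t X) {
  return static_cast<int64_t>(static_cast<uint64_t>(X) << 52) >> 52;
}

// True if X fits a signed 12-bit immediate, like isInt<12>(X).
static bool isInt12(int64_t X) { return X >= -2048 && X <= 2047; }

// Mirrors the early exits of the ISD::ADD case above.
static const char *strategy(int64_t Offset) {
  if (isInt12(Offset))
    return "ADDI alone";                    // offset fits directly
  if (lo12(Offset) == 0)
    return "LUI+ADD, nothing to fold";      // low 12 bits already zero
  if (isInt12(Offset / 2) && isInt12(Offset - Offset / 2))
    return "pair of ADDIs";                 // two simm12 halves are cheaper
  return "fold low 12 bits into the load/store";
}

int main() {
  for (int64_t Off : {2047, 4000, 16384, 16380, 2147481600}) {
    std::printf("%11lld (lo12 %5lld): %s\n", (long long)Off,
                (long long)lo12(Off), strategy(Off));
  }
  return 0;
}

For 16380 the low 12 bits sign-extend to -4, so the high part becomes 16384 and the tests now expect "lui a2, 4" followed by "lw a0, -4(a2)" instead of a three-instruction address computation.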
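
The same arithmetic reproduces the split-offsets.ll expectations, where a single ADD now serves several stores per base pointer. A worked sketch of the test1 numbers, again plain C++ with the same hypothetical helpers:

#include <cassert>
#include <cstdint>

static int64_t lo12(int64_t X) { // stand-in for SignExtend64<12>
  return static_cast<int64_t>(static_cast<uint64_t>(X) << 52) >> 52;
}
static bool isInt12(int64_t X) { return X >= -2048 && X <= 2047; }

int main() {
  int64_t Offset = 20000 * 4; // shared GEP offset of the stores: 80000 bytes
  int64_t Lo = lo12(Offset);  // -1920
  int64_t Hi = Offset - Lo;   // 81920 == 20 << 12  ->  "lui a2, 20"
  assert(Lo == -1920 && Hi == (20 << 12));

  // The use loop requires every combined offset to stay a simm12; here the
  // stores at +0 and +4 bytes become -1920(a0) and -1916(a0).
  for (int64_t MemOffs : {0, 4})
    assert(isInt12(MemOffs + Lo));
  return 0;
}

This is why the selection loop insists that every use is the base pointer of a reg+simm12 memory instruction: the high part is shared by all users, so the transform only pays off if each user can absorb its share of the low part while keeping the combined offset a legal simm12.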