diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7556,6 +7556,36 @@
   if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64)
     return SDValue();
 
+  // Find the minimal offset into the wide source value.
+  uint64_t OffsetMin = UINT64_MAX;
+  for (auto *Store : Stores) {
+    // All the stores store different parts of the CombinedValue. A truncate is
+    // required to get the partial value.
+    SDValue Trunc = Store->getValue();
+    if (Trunc.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+    // Other than the first/last part, a shift operation is required to get the
+    // offset.
+    uint64_t Offset = 0;
+    SDValue WideVal = Trunc.getOperand(0);
+    if ((WideVal.getOpcode() == ISD::SRL || WideVal.getOpcode() == ISD::SRA) &&
+        isa<ConstantSDNode>(WideVal.getOperand(1))) {
+      // The shift amount must be a constant multiple of the narrow type.
+      // It is translated to the offset address in the wide source value "y".
+      //
+      // x = srl y, ShiftAmtC
+      // i8 z = trunc x
+      // store z, ...
+      uint64_t ShiftAmtC = WideVal.getConstantOperandVal(1);
+      if (ShiftAmtC % NarrowNumBits != 0)
+        return SDValue();
+
+      Offset = ShiftAmtC / NarrowNumBits;
+    }
+    if (Offset < OffsetMin)
+      OffsetMin = Offset;
+  }
+
   // Check if all bytes of the source value that we are looking at are stored
   // to the same base address. Collect offsets from Base address into OffsetMap.
   SDValue SourceValue;
@@ -7585,7 +7615,7 @@
       if (ShiftAmtC % NarrowNumBits != 0)
         return SDValue();
 
-      Offset = ShiftAmtC / NarrowNumBits;
+      Offset = ShiftAmtC / NarrowNumBits - OffsetMin;
       WideVal = WideVal.getOperand(0);
     }
 
@@ -7620,7 +7650,7 @@
     }
     // Map the offset in the store and the offset in the combined value, and
    // early return if it has been set before.
-    if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
+    if (Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
      return SDValue();
    OffsetMap[Offset] = ByteOffsetFromBase;
  }
@@ -7633,8 +7663,19 @@
   bool Fast = false;
   bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT,
                                         *FirstStore->getMemOperand(), &Fast);
-  if (!Allowed || !Fast)
-    return SDValue();
+  bool Use2St16 = false;
+  if (!Allowed || !Fast) {
+    // Is it OK to use two i16 stores for this 4-byte store?
+    if (NumStores == 4 && WideVT == MVT::i32) {
+      Allowed = TLI.allowsMemoryAccess(Context, Layout, MVT::i16,
+                                       *FirstStore->getMemOperand(), &Fast);
+      if (!Allowed || !Fast)
+        return SDValue();
+      Use2St16 = true;
+    } else {
+      return SDValue();
+    }
+  }
 
   // Check if the pieces of the value are going to the expected places in memory
   // to merge the stores.
@@ -7665,15 +7706,20 @@
   }
 
   SDLoc DL(N);
+  if (OffsetMin > 0) {
+    // The stores only cover the upper part of the source value, so shift it
+    // down before truncating.
+    EVT Typ = SourceValue.getValueType();
+    SDValue ShiftAmt = DAG.getConstant(OffsetMin * NarrowNumBits, DL, Typ);
+    SourceValue = DAG.getNode(ISD::SRL, DL, Typ, SourceValue, ShiftAmt);
+  }
   if (WideVT != SourceValue.getValueType()) {
     assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits &&
            "Unexpected store value to merge");
     SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
   }
-
-  // Before legalize we can introduce illegal bswaps/rotates which will be later
-  // converted to an explicit bswap sequence. This way we end up with a single
-  // store and byte shuffling instead of several stores and byte shuffling.
+  // Before legalize we can introduce illegal bswaps/rotates which will be
+  // later converted to an explicit bswap sequence.
+  // This way we end up with a single store and byte shuffling instead of
+  // several stores and byte shuffling.
   if (NeedBswap) {
     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
   } else if (NeedRotate) {
@@ -7682,6 +7728,26 @@
     SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
   }
 
+  if (Use2St16) {
+    // Split the i32 source into two i16 halves: the low half is stored at
+    // the base address and the high half at base + 2 bytes.
+    SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+    SDValue SourceValueL =
+        DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValue);
+    SDValue SourceValueH =
+        DAG.getNode(ISD::SRL, DL, MVT::i32, SourceValue, ShiftAmt);
+    SourceValueH = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValueH);
+
+    SDValue NewSt16L =
+        DAG.getStore(Chain, DL, SourceValueL, FirstStore->getBasePtr(),
+                     FirstStore->getPointerInfo(), FirstStore->getAlign());
+    // The high half lives 2 bytes (not 16) above the base address.
+    SDValue AddrH =
+        DAG.getNode(ISD::ADD, DL, MVT::i32, FirstStore->getBasePtr(),
+                    DAG.getConstant(2, DL, MVT::i32));
+    SDValue NewSt16H =
+        DAG.getStore(NewSt16L, DL, SourceValueH, AddrH,
+                     FirstStore->getPointerInfo().getWithOffset(2),
+                     commonAlignment(FirstStore->getAlign(), 2));
+    return NewSt16H;
+  }
+
   SDValue NewStore =
       DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
                    FirstStore->getPointerInfo(), FirstStore->getAlign());
diff --git a/llvm/test/CodeGen/RISCV/store-combine.ll b/llvm/test/CodeGen/RISCV/store-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/store-combine.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32
+
+%struct.ab = type { i8, i8, i8, i8 }
+
+define void @store_0(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store_0:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    sh a2, 2(a0)
+; RV32-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2, !tbaa !4
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1, !tbaa !8
+  %shr3 = lshr i32 %v, 16
+  %conv5 = trunc i32 %shr3 to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv5, i8* %c, align 2, !tbaa !9
+  %shr6 = lshr i32 %v, 24
+  %conv8 = trunc i32 %shr6 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv8, i8* %d, align 1, !tbaa !10
+  ret void
+}
+
+define void @store_1(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store_1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2, !tbaa !4
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1, !tbaa !8
+  ret void
+}
+
+define void @store3(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a1, a1, 16
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+  %shr = lshr i32 %v, 16
+  %conv = trunc i32 %shr to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2, !tbaa !9
+  %shr1 = lshr i32 %v, 24
+  %conv3 = trunc i32 %shr1 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv3, i8* %d, align 1, !tbaa !10
+  ret void
+}
+
+define void @store4(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2, !tbaa !9
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv2, i8* %d, align 1, !tbaa !10
+  ret void
+}
+
+!4 = !{!5, !6, i64 0}
+!5 = !{!"ab", !6, i64 0, !6, i64 1, !6, i64 2, !6, i64 3}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!5, !6, i64 1}
+!9 = !{!5, !6, i64 2}
+!10 = !{!5, !6, i64 3}
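
For reviewers, here is a C-level sketch of the patterns the new tests exercise. It is reconstructed from the test IR and its TBAA metadata, not taken from the patch itself: the struct layout and function names mirror the test file, and everything else is an assumption.

// Hypothetical C source corresponding to @store_0 and @store3 above.
struct ab {
  char a, b, c, d; // 4 bytes, 2-byte aligned, matching %struct.ab
};

// All four bytes of v are stored. On rv32 a single i32 store at align 2
// is not a fast legal access, so the combine previously bailed out; with
// Use2St16 it now emits two sh instructions at offsets 0 and 2.
void store_0(struct ab *p, unsigned v) {
  p->a = (char)v;
  p->b = (char)(v >> 8);
  p->c = (char)(v >> 16);
  p->d = (char)(v >> 24);
}

// Only the upper two bytes are stored, so OffsetMin == 2: the combiner
// shifts v right by 16 and merges the pair into one sh at offset 2,
// instead of rejecting value offsets that do not start at 0.
void store3(struct ab *p, unsigned v) {
  p->c = (char)(v >> 16);
  p->d = (char)(v >> 24);
}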