diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7755,7 +7755,7 @@
   // Check if all bytes of the source value that we are looking at are stored
   // to the same base address. Collect offsets from Base address into OffsetMap.
   SDValue SourceValue;
-  SmallVector<int64_t, 8> OffsetMap(NumStores, INT64_MAX);
+  std::map<int64_t, int64_t> OffsetMap;
   int64_t FirstOffset = INT64_MAX;
   StoreSDNode *FirstStore = nullptr;
   Optional<BaseIndexOffset> Base;
@@ -7814,9 +7814,8 @@
       FirstStore = Store;
       FirstOffset = ByteOffsetFromBase;
     }
-    // Map the offset in the store and the offset in the combined value, and
-    // early return if it has been set before.
-    if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
+    // Map the offset in the store to the offset in the combined value, and
+    // return early if this byte offset has been seen before.
+    if (OffsetMap.count(Offset) != 0)
       return SDValue();
     OffsetMap[Offset] = ByteOffsetFromBase;
   }
@@ -7824,25 +7823,49 @@
   assert(FirstOffset != INT64_MAX && "First byte offset must be set");
   assert(FirstStore && "First store must be set");
 
+  // Check that the byte positions taken from the source value are consecutive.
+  unsigned PreShift = INT32_MAX;
+  for (auto Iter : OffsetMap) {
+    if (PreShift != INT32_MAX) {
+      if (Iter.first - PreShift != 1)
+        return SDValue();
+    }
+    PreShift = Iter.first;
+  }
+
   // Check that a store of the wide type is both allowed and fast on the target
   const DataLayout &Layout = DAG.getDataLayout();
   bool Fast = false;
   bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT,
                                         *FirstStore->getMemOperand(), &Fast);
-  if (!Allowed || !Fast)
-    return SDValue();
+  bool Use2St16 = false;
+  if (!Allowed || !Fast) {
+    // Is it OK to use two i16 stores instead for this 4-byte store?
+    if (NumStores == 4 && WideVT == MVT::i32) {
+      Allowed = TLI.allowsMemoryAccess(Context, Layout, MVT::i16,
                                        *FirstStore->getMemOperand(), &Fast);
+      if (!Allowed || !Fast)
+        return SDValue();
+      Use2St16 = true;
+    } else {
+      return SDValue();
+    }
+  }
 
   // Check if the pieces of the value are going to the expected places in memory
   // to merge the stores.
-  auto checkOffsets = [&](bool MatchLittleEndian) {
+  auto CheckOffsets = [&](bool MatchLittleEndian) {
+    unsigned I = 0;
     if (MatchLittleEndian) {
-      for (unsigned i = 0; i != NumStores; ++i)
-        if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
+      for (auto Iter = OffsetMap.begin(); Iter != OffsetMap.end(); Iter++) {
+        if (Iter->second != I++ * (NarrowNumBits / 8) + FirstOffset)
           return false;
-    } else { // MatchBigEndian by reversing loop counter.
-      for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
-        if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
+      }
+    } else { // MatchBigEndian by iterating in reverse.
+      for (auto Iter = OffsetMap.rbegin(); Iter != OffsetMap.rend(); Iter++) {
+        if (Iter->second != I++ * (NarrowNumBits / 8) + FirstOffset)
           return false;
+      }
     }
     return true;
   };
@@ -7850,26 +7873,32 @@
   // Check if the offsets line up for the native data layout of this target.
   bool NeedBswap = false;
   bool NeedRotate = false;
-  if (!checkOffsets(Layout.isLittleEndian())) {
+  if (!CheckOffsets(Layout.isLittleEndian())) {
     // Special-case: check if byte offsets line up for the opposite endian.
-    if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian()))
+    if (NarrowNumBits == 8 && CheckOffsets(Layout.isBigEndian()))
       NeedBswap = true;
-    else if (NumStores == 2 && checkOffsets(Layout.isBigEndian()))
+    else if (NumStores == 2 && CheckOffsets(Layout.isBigEndian()))
       NeedRotate = true;
     else
       return SDValue();
   }
 
   SDLoc DL(N);
+  unsigned FirstShift = OffsetMap.begin()->first;
+  if (FirstShift > 0) {
+    EVT Typ = SourceValue.getValueType();
+    SDValue ShiftAmt = DAG.getConstant(FirstShift * NarrowNumBits, DL, Typ);
+    SourceValue = DAG.getNode(ISD::SRL, DL, Typ, SourceValue, ShiftAmt);
+  }
   if (WideVT != SourceValue.getValueType()) {
     assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits &&
            "Unexpected store value to merge");
     SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
   }
-
-  // Before legalize we can introduce illegal bswaps/rotates which will be later
-  // converted to an explicit bswap sequence. This way we end up with a single
-  // store and byte shuffling instead of several stores and byte shuffling.
+  // Before legalize we can introduce illegal bswaps/rotates which will be
+  // later converted to an explicit bswap sequence. This way we end up with a
+  // single store and byte shuffling instead of several stores and byte
+  // shuffling.
   if (NeedBswap) {
     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
   } else if (NeedRotate) {
@@ -7878,6 +7907,27 @@
     SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
   }
 
+  if (Use2St16) {
+    SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+    SDValue SourceValueL =
+        DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValue);
+    SDValue SourceValueH =
+        DAG.getNode(ISD::SRL, DL, MVT::i32, SourceValue, ShiftAmt);
+    SourceValueH = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValueH);
+
+    SDValue NewSt16L =
+        DAG.getStore(Chain, DL, SourceValueL, FirstStore->getBasePtr(),
+                     FirstStore->getPointerInfo(), FirstStore->getAlign());
+    SDValue Baseptr = FirstStore->getBasePtr();
+    EVT BaseptrType = Baseptr.getValueType();
+    SDValue AddrH = DAG.getNode(ISD::ADD, DL, BaseptrType, Baseptr,
+                                DAG.getConstant(2, DL, BaseptrType));
+    SDValue NewSt16H = DAG.getStore(
+        NewSt16L, DL, SourceValueH, AddrH,
+        FirstStore->getPointerInfo().getWithOffset(2), FirstStore->getAlign());
+    return NewSt16H;
+  }
+
   SDValue NewStore =
       DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
                    FirstStore->getPointerInfo(), FirstStore->getAlign());
diff --git a/llvm/test/CodeGen/RISCV/store-combine.ll b/llvm/test/CodeGen/RISCV/store-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/store-combine.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64
+
+; The IR below is derived from the following C code:
+;struct ab {
+;  char a;
+;  char b;
+;  char c;
+;  char d;
+;} __attribute__((aligned(2)));
+;
+;void store0(struct ab *_ab, unsigned v) {
+;  _ab->a = v & 0xff;
+;  _ab->b = (v >> 8) & 0xff;
+;  _ab->c = (v >> 16) & 0xff;
+;  _ab->d = (v >> 24) & 0xff;
+;}
+;void store1(struct ab *_ab, unsigned v) {
+;  _ab->a = v & 0xff;
+;  _ab->b = (v >> 8) & 0xff;
+;}
+;void store2(struct ab *_ab, unsigned v) {
+;  _ab->c = (v >> 16) & 0xff;
+;  _ab->d = (v >> 24) & 0xff;
+;}
+;void store3(struct ab *_ab, unsigned v) {
+;  _ab->c = v & 0xff;
+;  _ab->d = (v >> 8) & 0xff;
+;}
+; The stores in store4 must not be combined: the stored bytes come from
+; (v >> 8) and (v >> 24), which are not adjacent bytes of v, so they cannot
+; be rewritten as a single (short)(v >> 8) store.
+;void store4(struct ab *_ab, unsigned v) {
+;  _ab->c = (v >> 8) & 0xff;
+;  _ab->d = (v >> 24) & 0xff;
+;}
+
+%struct.ab = type { i8, i8, i8, i8 }
+
+define void @store_0(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store_0:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    sh a2, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store_0:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sh a1, 0(a0)
+; RV64-NEXT:    srli a1, a1, 16
+; RV64-NEXT:    sh a1, 2(a0)
+; RV64-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1
+  %shr3 = lshr i32 %v, 16
+  %conv5 = trunc i32 %shr3 to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv5, i8* %c, align 2
+  %shr6 = lshr i32 %v, 24
+  %conv8 = trunc i32 %shr6 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv8, i8* %d, align 1
+  ret void
+}
+
+define void @store_1(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store_1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store_1:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sh a1, 0(a0)
+; RV64-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1
+  ret void
+}
+
+define void @store2(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a1, a1, 16
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    srli a1, a1, 16
+; RV64-NEXT:    sh a1, 2(a0)
+; RV64-NEXT:    ret
+  %shr = lshr i32 %v, 16
+  %conv = trunc i32 %shr to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2
+  %shr1 = lshr i32 %v, 24
+  %conv3 = trunc i32 %shr1 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv3, i8* %d, align 1
+  ret void
+}
+
+define void @store3(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sh a1, 2(a0)
+; RV64-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv2, i8* %d, align 1
+  ret void
+}
+
+; The stores in store4 must not be combined.
+define dso_local void @store4(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store4:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    srli a2, a1, 8
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 24
+; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store4:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    srli a2, a1, 8
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 24
+; RV64-NEXT:    sb a1, 3(a0)
+; RV64-NEXT:    ret
+entry:
+  %shr = lshr i32 %v, 8
+  %conv = trunc i32 %shr to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2
+  %shr1 = lshr i32 %v, 24
+  %conv3 = trunc i32 %shr1 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv3, i8* %d, align 1
+  ret void
+}
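
Note for readers: the following is a minimal standalone C++ sketch, not part of the patch, of the two pieces of logic the DAGCombiner change adds. The helper names offsetsAreConsecutive and splitWideValue are illustrative only. It mirrors the consecutive-offset check performed on the std::map keys and the fallback that splits a combined 32-bit value into the two i16 halves stored at byte offsets 0 and 2 from the base address.

// sketch.cpp -- illustration only, not part of the patch.
#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

// Mirrors the new check in mergeTruncStores: the byte indices collected as
// map keys must form a contiguous run, otherwise the stores are not merged.
static bool offsetsAreConsecutive(const std::map<int64_t, int64_t> &OffsetMap) {
  bool HavePrev = false;
  int64_t Prev = 0;
  for (const auto &Entry : OffsetMap) {
    if (HavePrev && Entry.first - Prev != 1)
      return false;
    Prev = Entry.first;
    HavePrev = true;
  }
  return true;
}

// Mirrors the Use2St16 fallback: split a 4-byte combined value into a low
// half (stored at the base address) and a high half (stored at base + 2).
static std::pair<uint16_t, uint16_t> splitWideValue(uint32_t Wide) {
  return {static_cast<uint16_t>(Wide & 0xffff),
          static_cast<uint16_t>(Wide >> 16)};
}

int main() {
  // store0 in the test: bytes 0..3 of v are stored, so the run is contiguous.
  assert(offsetsAreConsecutive({{0, 0}, {1, 1}, {2, 2}, {3, 3}}));
  // store4 in the test: only bytes 1 and 3 of v are stored, so no combine.
  assert(!offsetsAreConsecutive({{1, 2}, {3, 3}}));
  // 0x0d0c0b0a splits into 0x0b0a (low half) and 0x0d0c (high half).
  assert(splitWideValue(0x0d0c0b0aU) ==
         std::make_pair(uint16_t(0x0b0a), uint16_t(0x0d0c)));
  return 0;
}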