diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6869,8 +6869,9 @@
   SmallVector<StoreSDNode *, 8> Stores;
   for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
     // TODO: Allow unordered atomics when wider type is legal (see D66309)
-    if (Store->getMemoryVT() != MVT::i8 || !Store->isSimple() ||
-        Store->isIndexed())
+    EVT MemVT = Store->getMemoryVT();
+    if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
+        !Store->isSimple() || Store->isIndexed())
       return SDValue();
     Stores.push_back(Store);
     Chain = Store->getChain();
@@ -6959,12 +6960,6 @@
   assert(FirstOffset != INT64_MAX && "First byte offset must be set");
   assert(FirstStore && "First store must be set");
 
-  // Check if the bytes of the combined value we are looking at match with
-  // either big or little endian value store.
-  Optional<bool> IsBigEndian = isBigEndian(OffsetMap, FirstOffset);
-  if (!IsBigEndian.hasValue())
-    return SDValue();
-
   // Check that a store of the wide type is both allowed and fast on the target
   const DataLayout &Layout = DAG.getDataLayout();
   bool Fast = false;
@@ -6973,6 +6968,31 @@
   if (!Allowed || !Fast)
     return SDValue();
 
+  // Check if the pieces of the value are going to the expected places in memory
+  // to merge the stores.
+  auto checkOffsets = [&](bool MatchLittleEndian) {
+    if (MatchLittleEndian) {
+      for (unsigned i = 0; i != NumStores; ++i)
+        if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
+          return false;
+    } else { // MatchBigEndian by reversing loop counter.
+      for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
+        if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
+          return false;
+    }
+    return true;
+  };
+
+  // Check if the offsets line up for the native data layout of this target.
+  bool NeedBswap = false;
+  if (!checkOffsets(Layout.isLittleEndian())) {
+    // Special-case: check if byte offsets line up for the opposite endian.
+    // TODO: We could use rotates for 16/32-bit merge pairs.
+    if (NarrowNumBits != 8 || !checkOffsets(Layout.isBigEndian()))
+      return SDValue();
+    NeedBswap = true;
+  }
+
   SDLoc DL(N);
   if (WideVT != SourceValue.getValueType()) {
     assert(SourceValue.getValueType().getSizeInBits() > WideNumBits &&
@@ -6983,7 +7003,6 @@
   // Before legalize we can introduce illegal bswaps which will be later
   // converted to an explicit bswap sequence. This way we end up with a single
   // store and byte shuffling instead of several stores and byte shuffling.
-  bool NeedBswap = Layout.isBigEndian() != *IsBigEndian;
   if (NeedBswap)
     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
 
diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
--- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
@@ -200,12 +200,17 @@
 }
 
 define void @le_i32_to_i16(i32 %x, i16* %p0) {
-; CHECK-LABEL: le_i32_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i32_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    str w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i32_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr w8, w0, #16
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -216,12 +221,17 @@
 }
 
 define void @le_i32_to_i16_order(i32 %x, i16* %p0) {
-; CHECK-LABEL: le_i32_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i32_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    str w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i32_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr w8, w0, #16
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -232,12 +242,17 @@
 }
 
 define void @be_i32_to_i16(i32 %x, i16* %p0) {
-; CHECK-LABEL: be_i32_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w0, [x1, #2]
-; CHECK-NEXT:    strh w8, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i32_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr w8, w0, #16
+; LE-NEXT:    strh w0, [x1, #2]
+; LE-NEXT:    strh w8, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i32_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    str w0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -248,12 +263,17 @@
 }
 
 define void @be_i32_to_i16_order(i32 %x, i16* %p0) {
-; CHECK-LABEL: be_i32_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w8, [x1]
-; CHECK-NEXT:    strh w0, [x1, #2]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i32_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr w8, w0, #16
+; LE-NEXT:    strh w8, [x1]
+; LE-NEXT:    strh w0, [x1, #2]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i32_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    str w0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -440,16 +460,21 @@
 }
 
 define void @le_i64_to_i16(i64 %x, i16* %p0) {
-; CHECK-LABEL: le_i64_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    strh w9, [x1, #4]
-; CHECK-NEXT:    strh w10, [x1, #6]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #16
+; BE-NEXT:    lsr x9, x0, #32
+; BE-NEXT:    lsr x10, x0, #48
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    strh w9, [x1, #4]
+; BE-NEXT:    strh w10, [x1, #6]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -468,16 +493,21 @@
 }
 
 define void @le_i64_to_i16_order(i64 %x, i16* %p0) {
-; CHECK-LABEL: le_i64_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    strh w10, [x1, #6]
-; CHECK-NEXT:    strh w9, [x1, #4]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #16
+; BE-NEXT:    lsr x9, x0, #32
+; BE-NEXT:    lsr x10, x0, #48
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    strh w10, [x1, #6]
+; BE-NEXT:    strh w9, [x1, #4]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -496,16 +526,21 @@
 }
 
 define void @be_i64_to_i16(i64 %x, i16* %p0) {
-; CHECK-LABEL: be_i64_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1, #6]
-; CHECK-NEXT:    strh w8, [x1, #4]
-; CHECK-NEXT:    strh w9, [x1, #2]
-; CHECK-NEXT:    strh w10, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #16
+; LE-NEXT:    lsr x9, x0, #32
+; LE-NEXT:    lsr x10, x0, #48
+; LE-NEXT:    strh w0, [x1, #6]
+; LE-NEXT:    strh w8, [x1, #4]
+; LE-NEXT:    strh w9, [x1, #2]
+; LE-NEXT:    strh w10, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -524,16 +559,21 @@
 }
 
 define void @be_i64_to_i16_order(i64 %x, i16* %p0) {
-; CHECK-LABEL: be_i64_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1, #6]
-; CHECK-NEXT:    strh w10, [x1]
-; CHECK-NEXT:    strh w9, [x1, #2]
-; CHECK-NEXT:    strh w8, [x1, #4]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #16
+; LE-NEXT:    lsr x9, x0, #32
+; LE-NEXT:    lsr x10, x0, #48
+; LE-NEXT:    strh w0, [x1, #6]
+; LE-NEXT:    strh w10, [x1]
+; LE-NEXT:    strh w9, [x1, #2]
+; LE-NEXT:    strh w8, [x1, #4]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -552,11 +592,16 @@
 }
 
 define void @le_i64_to_i32(i64 %x, i32* %p0) {
-; CHECK-LABEL: le_i64_to_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w0, w8, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i32:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i32:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #32
+; BE-NEXT:    stp w0, w8, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -567,11 +612,16 @@
 }
 
 define void @le_i64_to_i32_order(i64 %x, i32* %p0) {
-; CHECK-LABEL: le_i64_to_i32_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w0, w8, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i32_order:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i32_order:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #32
+; BE-NEXT:    stp w0, w8, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -582,11 +632,16 @@
 }
 
 define void @be_i64_to_i32(i64 %x, i32* %p0) {
-; CHECK-LABEL: be_i64_to_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w8, w0, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i32:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #32
+; LE-NEXT:    stp w8, w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i32:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -597,11 +652,16 @@
 }
 
 define void @be_i64_to_i32_order(i64 %x, i32* %p0) {
-; CHECK-LABEL: be_i64_to_i32_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w8, w0, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i32_order:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #32
+; LE-NEXT:    stp w8, w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i32_order:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -611,6 +671,8 @@
   ret void
 }
 
+; Negative test - not consecutive addresses
+
 define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) {
 ; CHECK-LABEL: i64_to_i32_wrong_addr:
 ; CHECK:       // %bb.0:
@@ -627,6 +689,8 @@
   ret void
 }
 
+; Negative test - addresses don't line up with shift amounts
+
 define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) {
 ; CHECK-LABEL: i64_to_i16_wrong_order:
 ; CHECK:       // %bb.0:
@@ -655,6 +719,8 @@
   ret void
 }
 
+; Negative test - no store of 't1'
+
 define void @i32_to_i8_incomplete(i32 %x, i8* %p0) {
 ; CHECK-LABEL: i32_to_i8_incomplete:
 ; CHECK:       // %bb.0:
@@ -680,6 +746,8 @@
   ret void
 }
 
+; Negative test - no store of 't3'
+
 define void @i64_to_i8_incomplete(i64 %x, i8* %p0) {
 ; CHECK-LABEL: i64_to_i8_incomplete:
 ; CHECK:       // %bb.0:
@@ -729,6 +797,8 @@
   ret void
 }
 
+; Negative test - not consecutive addresses
+
 define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) {
 ; CHECK-LABEL: i32_to_i16_wrong_addr:
 ; CHECK:       // %bb.0:
@@ -745,6 +815,8 @@
   ret void
 }
 
+; Negative test - addresses don't line up with shift amounts
+
 define void @i32_to_i8_wrong_order(i32 %x, i8* %p0) {
 ; CHECK-LABEL: i32_to_i8_wrong_order:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll
--- a/llvm/test/CodeGen/X86/stores-merging.ll
+++ b/llvm/test/CodeGen/X86/stores-merging.ll
@@ -468,9 +468,7 @@
 define void @trunc_i32_to_i16(i32 %x, i16* %p) {
 ; CHECK-LABEL: trunc_i32_to_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movw %di, (%rsi)
-; CHECK-NEXT:    shrl $16, %edi
-; CHECK-NEXT:    movw %di, 2(%rsi)
+; CHECK-NEXT:    movl %edi, (%rsi)
 ; CHECK-NEXT:    retq
   %t1 = trunc i32 %x to i16
   %sh = lshr i32 %x, 16
@@ -522,15 +520,7 @@
 define void @trunc_i64_to_i16(i64 %x, i16* %p) {
 ; CHECK-LABEL: trunc_i64_to_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    movw %di, (%rsi)
-; CHECK-NEXT:    shrq $16, %rdi
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:    shrq $48, %rcx
-; CHECK-NEXT:    movw %di, 2(%rsi)
-; CHECK-NEXT:    movw %ax, 4(%rsi)
-; CHECK-NEXT:    movw %cx, 6(%rsi)
+; CHECK-NEXT:    movq %rdi, (%rsi)
 ; CHECK-NEXT:    retq
   %t1 = trunc i64 %x to i16
   %sh1 = lshr i64 %x, 16
@@ -552,9 +542,7 @@
 define void @trunc_i64_to_i32(i64 %x, i32* %p) {
 ; CHECK-LABEL: trunc_i64_to_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, (%rsi)
-; CHECK-NEXT:    shrq $32, %rdi
-; CHECK-NEXT:    movl %edi, 4(%rsi)
+; CHECK-NEXT:    movq %rdi, (%rsi)
 ; CHECK-NEXT:    retq
   %t1 = trunc i64 %x to i32
   %sh = lshr i64 %x, 32
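
For readers of the checkOffsets change above: the narrow stores merge only when their byte offsets form one contiguous run that matches the target's native byte order, or (for i8 pieces only) the exact reverse, in which case a BSWAP of the wide value is emitted before the single store. The snippet below is a standalone sketch of that matching rule, not the DAGCombiner code itself; the function and variable names are hypothetical.

// Standalone sketch (hypothetical names): offsets[i] holds the byte offset,
// relative to the lowest store address, of the store that receives piece i of
// the wide value, where piece 0 is the least-significant piece.
#include <cstdint>
#include <iostream>
#include <vector>

static bool offsetsMatch(const std::vector<int64_t> &offsets,
                         int64_t pieceBytes, int64_t firstOffset,
                         bool matchLittleEndian) {
  const size_t numPieces = offsets.size();
  for (size_t i = 0; i != numPieces; ++i) {
    // Little-endian layout: piece i lands at firstOffset + i * pieceBytes.
    // Big-endian layout: the pieces appear in reverse order.
    size_t slot = matchLittleEndian ? i : numPieces - 1 - i;
    if (offsets[slot] != firstOffset + static_cast<int64_t>(i) * pieceBytes)
      return false;
  }
  return true;
}

int main() {
  // Four i16 pieces of an i64 stored at offsets 0,2,4,6: the little-endian
  // layout, so a little-endian target can merge them into one 64-bit store.
  std::vector<int64_t> le = {0, 2, 4, 6};
  std::cout << offsetsMatch(le, 2, 0, /*matchLittleEndian=*/true) << '\n';  // 1

  // The reversed layout matches only the big-endian check; the patch merges
  // that case on the opposite-endian target only for 8-bit pieces, where a
  // bswap of the wide value fixes up the order.
  std::vector<int64_t> be = {6, 4, 2, 0};
  std::cout << offsetsMatch(be, 2, 0, /*matchLittleEndian=*/true) << '\n';  // 0
  std::cout << offsetsMatch(be, 2, 0, /*matchLittleEndian=*/false) << '\n'; // 1
  return 0;
}

This mirrors the updated test expectations: each le_* function collapses to a single wide store under the LE prefix, and each be_* function does so under the BE prefix, while the mixed-order and incomplete cases stay as separate narrow stores.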