diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7011,12 +7011,15 @@
 
   // Check if the offsets line up for the native data layout of this target.
   bool NeedBswap = false;
+  bool NeedRotate = false;
   if (!checkOffsets(Layout.isLittleEndian())) {
     // Special-case: check if byte offsets line up for the opposite endian.
-    // TODO: We could use rotates for 16/32-bit merge pairs.
-    if (NarrowNumBits != 8 || !checkOffsets(Layout.isBigEndian()))
+    if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian()))
+      NeedBswap = true;
+    else if (NumStores == 2 && checkOffsets(Layout.isBigEndian()))
+      NeedRotate = true;
+    else
       return SDValue();
-    NeedBswap = true;
   }
 
   SDLoc DL(N);
@@ -7026,11 +7029,16 @@
     SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
   }
 
-  // Before legalize we can introduce illegal bswaps which will be later
+  // Before legalize we can introduce illegal bswaps/rotates which will later be
   // converted to an explicit bswap sequence. This way we end up with a single
   // store and byte shuffling instead of several stores and byte shuffling.
-  if (NeedBswap)
+  if (NeedBswap) {
     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
+  } else if (NeedRotate) {
+    assert(WideNumBits % 2 == 0 && "Unexpected type for rotate");
+    SDValue RotAmt = DAG.getConstant(WideNumBits / 2, DL, WideVT);
+    SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
+  }
 
   SDValue NewStore =
       DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
--- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
@@ -207,9 +207,8 @@
 ;
 ; BE-LABEL: le_i32_to_i16:
 ; BE:       // %bb.0:
-; BE-NEXT:    lsr w8, w0, #16
-; BE-NEXT:    strh w0, [x1]
-; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    ror w8, w0, #16
+; BE-NEXT:    str w8, [x1]
 ; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
@@ -228,9 +227,8 @@
 ;
 ; BE-LABEL: le_i32_to_i16_order:
 ; BE:       // %bb.0:
-; BE-NEXT:    lsr w8, w0, #16
-; BE-NEXT:    strh w8, [x1, #2]
-; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    ror w8, w0, #16
+; BE-NEXT:    str w8, [x1]
 ; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
@@ -244,9 +242,8 @@
 define void @be_i32_to_i16(i32 %x, i16* %p0) {
 ; LE-LABEL: be_i32_to_i16:
 ; LE:       // %bb.0:
-; LE-NEXT:    lsr w8, w0, #16
-; LE-NEXT:    strh w0, [x1, #2]
-; LE-NEXT:    strh w8, [x1]
+; LE-NEXT:    ror w8, w0, #16
+; LE-NEXT:    str w8, [x1]
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: be_i32_to_i16:
@@ -265,9 +262,8 @@
 define void @be_i32_to_i16_order(i32 %x, i16* %p0) {
 ; LE-LABEL: be_i32_to_i16_order:
 ; LE:       // %bb.0:
-; LE-NEXT:    lsr w8, w0, #16
-; LE-NEXT:    strh w8, [x1]
-; LE-NEXT:    strh w0, [x1, #2]
+; LE-NEXT:    ror w8, w0, #16
+; LE-NEXT:    str w8, [x1]
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: be_i32_to_i16_order:
@@ -528,13 +524,12 @@
 define void @be_i64_to_i16(i64 %x, i16* %p0) {
 ; LE-LABEL: be_i64_to_i16:
 ; LE:       // %bb.0:
-; LE-NEXT:    lsr x8, x0, #16
-; LE-NEXT:    lsr x9, x0, #32
-; LE-NEXT:    lsr x10, x0, #48
-; LE-NEXT:    strh w0, [x1, #6]
-; LE-NEXT:    strh w8, [x1, #4]
-; LE-NEXT:    strh w9, [x1, #2]
-; LE-NEXT:    strh w10, [x1]
+; LE-NEXT:    lsr x8, x0, #32
+; LE-NEXT:    lsr x9, x0, #48
+; LE-NEXT:    ror w10, w0, #16
+; LE-NEXT:    str w10, [x1, #4]
+; LE-NEXT:    strh w8, [x1, #2]
+; LE-NEXT:    strh w9, [x1]
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: be_i64_to_i16:
@@ -599,8 +594,8 @@
 ;
 ; BE-LABEL: le_i64_to_i32:
 ; BE:       // %bb.0:
-; BE-NEXT:    lsr x8, x0, #32
-; BE-NEXT:    stp w0, w8, [x1]
+; BE-NEXT:    ror x8, x0, #32
+; BE-NEXT:    str x8, [x1]
 ; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
@@ -619,8 +614,8 @@
 ;
 ; BE-LABEL: le_i64_to_i32_order:
 ; BE:       // %bb.0:
-; BE-NEXT:    lsr x8, x0, #32
-; BE-NEXT:    stp w0, w8, [x1]
+; BE-NEXT:    ror x8, x0, #32
+; BE-NEXT:    str x8, [x1]
 ; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
@@ -634,8 +629,8 @@
 define void @be_i64_to_i32(i64 %x, i32* %p0) {
 ; LE-LABEL: be_i64_to_i32:
 ; LE:       // %bb.0:
-; LE-NEXT:    lsr x8, x0, #32
-; LE-NEXT:    stp w8, w0, [x1]
+; LE-NEXT:    ror x8, x0, #32
+; LE-NEXT:    str x8, [x1]
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: be_i64_to_i32:
@@ -654,8 +649,8 @@
 define void @be_i64_to_i32_order(i64 %x, i32* %p0) {
 ; LE-LABEL: be_i64_to_i32_order:
 ; LE:       // %bb.0:
-; LE-NEXT:    lsr x8, x0, #32
-; LE-NEXT:    stp w8, w0, [x1]
+; LE-NEXT:    ror x8, x0, #32
+; LE-NEXT:    str x8, [x1]
 ; LE-NEXT:    ret
 ;
 ; BE-LABEL: be_i64_to_i32_order:
diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll
--- a/llvm/test/CodeGen/X86/stores-merging.ll
+++ b/llvm/test/CodeGen/X86/stores-merging.ll
@@ -482,9 +482,8 @@
 define void @be_i32_to_i16(i32 %x, i16* %p0) {
 ; CHECK-LABEL: be_i32_to_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movw %di, 2(%rsi)
-; CHECK-NEXT:    shrl $16, %edi
-; CHECK-NEXT:    movw %di, (%rsi)
+; CHECK-NEXT:    rorl $16, %edi
+; CHECK-NEXT:    movl %edi, (%rsi)
 ; CHECK-NEXT:    retq
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
@@ -498,10 +497,8 @@
 define void @be_i32_to_i16_order(i32 %x, i16* %p0) {
 ; CHECK-LABEL: be_i32_to_i16_order:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shrl $16, %eax
-; CHECK-NEXT:    movw %ax, (%rsi)
-; CHECK-NEXT:    movw %di, 2(%rsi)
+; CHECK-NEXT:    rorl $16, %edi
+; CHECK-NEXT:    movl %edi, (%rsi)
 ; CHECK-NEXT:    retq
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
@@ -589,9 +586,8 @@
 define void @be_i64_to_i32(i64 %x, i32* %p0) {
 ; CHECK-LABEL: be_i64_to_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, 4(%rsi)
-; CHECK-NEXT:    shrq $32, %rdi
-; CHECK-NEXT:    movl %edi, (%rsi)
+; CHECK-NEXT:    rorq $32, %rdi
+; CHECK-NEXT:    movq %rdi, (%rsi)
 ; CHECK-NEXT:    retq
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
@@ -605,10 +601,8 @@
 define void @be_i64_to_i32_order(i64 %x, i32* %p0) {
 ; CHECK-LABEL: be_i64_to_i32_order:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:    movl %eax, (%rsi)
-; CHECK-NEXT:    movl %edi, 4(%rsi)
+; CHECK-NEXT:    rorq $32, %rdi
+; CHECK-NEXT:    movq %rdi, (%rsi)
 ; CHECK-NEXT:    retq
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32