diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7011,12 +7011,15 @@ // Check if the offsets line up for the native data layout of this target. bool NeedBswap = false; + bool NeedRotate = false; if (!checkOffsets(Layout.isLittleEndian())) { // Special-case: check if byte offsets line up for the opposite endian. - // TODO: We could use rotates for 16/32-bit merge pairs. - if (NarrowNumBits != 8 || !checkOffsets(Layout.isBigEndian())) + if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian())) + NeedBswap = true; + else if (NumStores == 2 && checkOffsets(Layout.isBigEndian())) + NeedRotate = true; + else return SDValue(); - NeedBswap = true; } SDLoc DL(N); @@ -7026,11 +7029,16 @@ SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue); } - // Before legalize we can introduce illegal bswaps which will be later + // Before legalize we can introduce illegal bswaps/rotates which will be later // converted to an explicit bswap sequence. This way we end up with a single // store and byte shuffling instead of several stores and byte shuffling. - if (NeedBswap) + if (NeedBswap) { SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue); + } else if (NeedRotate) { + assert(WideNumBits % 2 == 0 && "Unexpected type for rotate"); + SDValue RotAmt = DAG.getConstant(WideNumBits / 2, DL, WideVT); + SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt); + } SDValue NewStore = DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(), diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll --- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll +++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll @@ -207,9 +207,8 @@ ; ; BE-LABEL: le_i32_to_i16: ; BE: // %bb.0: -; BE-NEXT: lsr w8, w0, #16 -; BE-NEXT: strh w0, [x1] -; BE-NEXT: strh w8, [x1, #2] +; BE-NEXT: ror w8, w0, #16 +; BE-NEXT: str w8, [x1] ; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 @@ -228,9 +227,8 @@ ; ; BE-LABEL: le_i32_to_i16_order: ; BE: // %bb.0: -; BE-NEXT: lsr w8, w0, #16 -; BE-NEXT: strh w8, [x1, #2] -; BE-NEXT: strh w0, [x1] +; BE-NEXT: ror w8, w0, #16 +; BE-NEXT: str w8, [x1] ; BE-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 @@ -244,9 +242,8 @@ define void @be_i32_to_i16(i32 %x, i16* %p0) { ; LE-LABEL: be_i32_to_i16: ; LE: // %bb.0: -; LE-NEXT: lsr w8, w0, #16 -; LE-NEXT: strh w0, [x1, #2] -; LE-NEXT: strh w8, [x1] +; LE-NEXT: ror w8, w0, #16 +; LE-NEXT: str w8, [x1] ; LE-NEXT: ret ; ; BE-LABEL: be_i32_to_i16: @@ -265,9 +262,8 @@ define void @be_i32_to_i16_order(i32 %x, i16* %p0) { ; LE-LABEL: be_i32_to_i16_order: ; LE: // %bb.0: -; LE-NEXT: lsr w8, w0, #16 -; LE-NEXT: strh w8, [x1] -; LE-NEXT: strh w0, [x1, #2] +; LE-NEXT: ror w8, w0, #16 +; LE-NEXT: str w8, [x1] ; LE-NEXT: ret ; ; BE-LABEL: be_i32_to_i16_order: @@ -528,13 +524,12 @@ define void @be_i64_to_i16(i64 %x, i16* %p0) { ; LE-LABEL: be_i64_to_i16: ; LE: // %bb.0: -; LE-NEXT: lsr x8, x0, #16 -; LE-NEXT: lsr x9, x0, #32 -; LE-NEXT: lsr x10, x0, #48 -; LE-NEXT: strh w0, [x1, #6] -; LE-NEXT: strh w8, [x1, #4] -; LE-NEXT: strh w9, [x1, #2] -; LE-NEXT: strh w10, [x1] +; LE-NEXT: lsr x8, x0, #32 +; LE-NEXT: lsr x9, x0, #48 +; LE-NEXT: ror w10, w0, #16 +; LE-NEXT: str w10, [x1, #4] +; LE-NEXT: strh w8, [x1, #2] +; LE-NEXT: strh w9, [x1] ; LE-NEXT: ret ; ; BE-LABEL: be_i64_to_i16: @@ -599,8 +594,8 @@ ; ; BE-LABEL: le_i64_to_i32: ; BE: // %bb.0: -; BE-NEXT: lsr x8, x0, #32 -; BE-NEXT: stp w0, w8, [x1] +; BE-NEXT: ror x8, x0, #32 +; BE-NEXT: str x8, [x1] ; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 @@ -619,8 +614,8 @@ ; ; BE-LABEL: le_i64_to_i32_order: ; BE: // %bb.0: -; BE-NEXT: lsr x8, x0, #32 -; BE-NEXT: stp w0, w8, [x1] +; BE-NEXT: ror x8, x0, #32 +; BE-NEXT: str x8, [x1] ; BE-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 @@ -634,8 +629,8 @@ define void @be_i64_to_i32(i64 %x, i32* %p0) { ; LE-LABEL: be_i64_to_i32: ; LE: // %bb.0: -; LE-NEXT: lsr x8, x0, #32 -; LE-NEXT: stp w8, w0, [x1] +; LE-NEXT: ror x8, x0, #32 +; LE-NEXT: str x8, [x1] ; LE-NEXT: ret ; ; BE-LABEL: be_i64_to_i32: @@ -654,8 +649,8 @@ define void @be_i64_to_i32_order(i64 %x, i32* %p0) { ; LE-LABEL: be_i64_to_i32_order: ; LE: // %bb.0: -; LE-NEXT: lsr x8, x0, #32 -; LE-NEXT: stp w8, w0, [x1] +; LE-NEXT: ror x8, x0, #32 +; LE-NEXT: str x8, [x1] ; LE-NEXT: ret ; ; BE-LABEL: be_i64_to_i32_order: diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll --- a/llvm/test/CodeGen/X86/stores-merging.ll +++ b/llvm/test/CodeGen/X86/stores-merging.ll @@ -482,9 +482,8 @@ define void @be_i32_to_i16(i32 %x, i16* %p0) { ; CHECK-LABEL: be_i32_to_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movw %di, 2(%rsi) -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: movw %di, (%rsi) +; CHECK-NEXT: rorl $16, %edi +; CHECK-NEXT: movl %edi, (%rsi) ; CHECK-NEXT: retq %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 @@ -498,10 +497,8 @@ define void @be_i32_to_i16_order(i32 %x, i16* %p0) { ; CHECK-LABEL: be_i32_to_i16_order: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shrl $16, %eax -; CHECK-NEXT: movw %ax, (%rsi) -; CHECK-NEXT: movw %di, 2(%rsi) +; CHECK-NEXT: rorl $16, %edi +; CHECK-NEXT: movl %edi, (%rsi) ; CHECK-NEXT: retq %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 @@ -589,9 +586,8 @@ define void @be_i64_to_i32(i64 %x, i32* %p0) { ; CHECK-LABEL: be_i64_to_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, 4(%rsi) -; CHECK-NEXT: shrq $32, %rdi -; CHECK-NEXT: movl %edi, (%rsi) +; CHECK-NEXT: rorq $32, %rdi +; CHECK-NEXT: movq %rdi, (%rsi) ; CHECK-NEXT: retq %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 @@ -605,10 +601,8 @@ define void @be_i64_to_i32_order(i64 %x, i32* %p0) { ; CHECK-LABEL: be_i64_to_i32_order: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movl %eax, (%rsi) -; CHECK-NEXT: movl %edi, 4(%rsi) +; CHECK-NEXT: rorq $32, %rdi +; CHECK-NEXT: movq %rdi, (%rsi) ; CHECK-NEXT: retq %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32