diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6869,8 +6869,9 @@
   SmallVector<StoreSDNode *, 8> Stores;
   for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
     // TODO: Allow unordered atomics when wider type is legal (see D66309)
-    if (Store->getMemoryVT() != MVT::i8 || !Store->isSimple() ||
-        Store->isIndexed())
+    EVT MemVT = Store->getMemoryVT();
+    if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
+        !Store->isSimple() || Store->isIndexed())
       return SDValue();
     Stores.push_back(Store);
     Chain = Store->getChain();
@@ -6959,12 +6960,6 @@
   assert(FirstOffset != INT64_MAX && "First byte offset must be set");
   assert(FirstStore && "First store must be set");
 
-  // Check if the bytes of the combined value we are looking at match with
-  // either big or little endian value store.
-  Optional<bool> IsBigEndian = isBigEndian(OffsetMap, FirstOffset);
-  if (!IsBigEndian.hasValue())
-    return SDValue();
-
   // Check that a store of the wide type is both allowed and fast on the target
   const DataLayout &Layout = DAG.getDataLayout();
   bool Fast = false;
@@ -6973,6 +6968,31 @@
   if (!Allowed || !Fast)
     return SDValue();
 
+  // Check if the pieces of the value are going to the expected places in memory
+  // to merge the stores.
+  auto checkOffsets = [&](bool MatchLittleEndian) {
+    if (MatchLittleEndian) {
+      for (unsigned i = 0; i != NumStores; ++i)
+        if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
+          return false;
+    } else { // MatchBigEndian by reversing loop counter.
+      for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
+        if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
+          return false;
+    }
+    return true;
+  };
+
+  // Check if the offsets line up for the native data layout of this target.
+  bool NeedBswap = false;
+  if (!checkOffsets(Layout.isLittleEndian())) {
+    // Special-case: check if byte offsets line up for the opposite endian.
+    // TODO: We could use rotates for 16/32-bit merge pairs.
+    if (NarrowNumBits != 8 || !checkOffsets(Layout.isBigEndian()))
+      return SDValue();
+    NeedBswap = true;
+  }
+
   SDLoc DL(N);
   if (WideVT != SourceValue.getValueType()) {
     assert(SourceValue.getValueType().getSizeInBits() > WideNumBits &&
@@ -6983,7 +7003,6 @@
   // Before legalize we can introduce illegal bswaps which will be later
   // converted to an explicit bswap sequence. This way we end up with a single
   // store and byte shuffling instead of several stores and byte shuffling.
-  bool NeedBswap = Layout.isBigEndian() != *IsBigEndian;
   if (NeedBswap)
     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
 
diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
--- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
@@ -200,12 +200,17 @@
 }
 
 define void @le_i32_to_i16(i32 %x, i16* %p0) {
-; CHECK-LABEL: le_i32_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i32_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    str w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i32_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr w8, w0, #16
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -216,12 +221,17 @@
 }
 
 define void @le_i32_to_i16_order(i32 %x, i16* %p0) {
-; CHECK-LABEL: le_i32_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i32_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    str w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i32_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr w8, w0, #16
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -232,12 +242,17 @@
 }
 
 define void @be_i32_to_i16(i32 %x, i16* %p0) {
-; CHECK-LABEL: be_i32_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w0, [x1, #2]
-; CHECK-NEXT:    strh w8, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i32_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr w8, w0, #16
+; LE-NEXT:    strh w0, [x1, #2]
+; LE-NEXT:    strh w8, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i32_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    str w0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -248,12 +263,17 @@
 }
 
 define void @be_i32_to_i16_order(i32 %x, i16* %p0) {
-; CHECK-LABEL: be_i32_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w0, #16
-; CHECK-NEXT:    strh w8, [x1]
-; CHECK-NEXT:    strh w0, [x1, #2]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i32_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr w8, w0, #16
+; LE-NEXT:    strh w8, [x1]
+; LE-NEXT:    strh w0, [x1, #2]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i32_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    str w0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i32 %x, 16
   %t0 = trunc i32 %x to i16
   %t1 = trunc i32 %sh1 to i16
@@ -440,16 +460,21 @@
 }
 
 define void @le_i64_to_i16(i64 %x, i16* %p0) {
-; CHECK-LABEL: le_i64_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    strh w9, [x1, #4]
-; CHECK-NEXT:    strh w10, [x1, #6]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #16
+; BE-NEXT:    lsr x9, x0, #32
+; BE-NEXT:    lsr x10, x0, #48
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    strh w9, [x1, #4]
+; BE-NEXT:    strh w10, [x1, #6]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -468,16 +493,21 @@
 }
 
 define void @le_i64_to_i16_order(i64 %x, i16* %p0) {
-; CHECK-LABEL: le_i64_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1]
-; CHECK-NEXT:    strh w8, [x1, #2]
-; CHECK-NEXT:    strh w10, [x1, #6]
-; CHECK-NEXT:    strh w9, [x1, #4]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #16
+; BE-NEXT:    lsr x9, x0, #32
+; BE-NEXT:    lsr x10, x0, #48
+; BE-NEXT:    strh w0, [x1]
+; BE-NEXT:    strh w8, [x1, #2]
+; BE-NEXT:    strh w10, [x1, #6]
+; BE-NEXT:    strh w9, [x1, #4]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -496,16 +526,21 @@
 }
 
 define void @be_i64_to_i16(i64 %x, i16* %p0) {
-; CHECK-LABEL: be_i64_to_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1, #6]
-; CHECK-NEXT:    strh w8, [x1, #4]
-; CHECK-NEXT:    strh w9, [x1, #2]
-; CHECK-NEXT:    strh w10, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i16:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #16
+; LE-NEXT:    lsr x9, x0, #32
+; LE-NEXT:    lsr x10, x0, #48
+; LE-NEXT:    strh w0, [x1, #6]
+; LE-NEXT:    strh w8, [x1, #4]
+; LE-NEXT:    strh w9, [x1, #2]
+; LE-NEXT:    strh w10, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i16:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -524,16 +559,21 @@
 }
 
 define void @be_i64_to_i16_order(i64 %x, i16* %p0) {
-; CHECK-LABEL: be_i64_to_i16_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    lsr x9, x0, #32
-; CHECK-NEXT:    lsr x10, x0, #48
-; CHECK-NEXT:    strh w0, [x1, #6]
-; CHECK-NEXT:    strh w10, [x1]
-; CHECK-NEXT:    strh w9, [x1, #2]
-; CHECK-NEXT:    strh w8, [x1, #4]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i16_order:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #16
+; LE-NEXT:    lsr x9, x0, #32
+; LE-NEXT:    lsr x10, x0, #48
+; LE-NEXT:    strh w0, [x1, #6]
+; LE-NEXT:    strh w10, [x1]
+; LE-NEXT:    strh w9, [x1, #2]
+; LE-NEXT:    strh w8, [x1, #4]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i16_order:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 16
   %sh2 = lshr i64 %x, 32
   %sh3 = lshr i64 %x, 48
@@ -552,11 +592,16 @@
 }
 
 define void @le_i64_to_i32(i64 %x, i32* %p0) {
-; CHECK-LABEL: le_i64_to_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w0, w8, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i32:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i32:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #32
+; BE-NEXT:    stp w0, w8, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -567,11 +612,16 @@
 }
 
 define void @le_i64_to_i32_order(i64 %x, i32* %p0) {
-; CHECK-LABEL: le_i64_to_i32_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w0, w8, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: le_i64_to_i32_order:
+; LE:       // %bb.0:
+; LE-NEXT:    str x0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: le_i64_to_i32_order:
+; BE:       // %bb.0:
+; BE-NEXT:    lsr x8, x0, #32
+; BE-NEXT:    stp w0, w8, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -582,11 +632,16 @@
 }
 
 define void @be_i64_to_i32(i64 %x, i32* %p0) {
-; CHECK-LABEL: be_i64_to_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w8, w0, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i32:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #32
+; LE-NEXT:    stp w8, w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i32:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -597,11 +652,16 @@
 }
 
 define void @be_i64_to_i32_order(i64 %x, i32* %p0) {
-; CHECK-LABEL: be_i64_to_i32_order:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #32
-; CHECK-NEXT:    stp w8, w0, [x1]
-; CHECK-NEXT:    ret
+; LE-LABEL: be_i64_to_i32_order:
+; LE:       // %bb.0:
+; LE-NEXT:    lsr x8, x0, #32
+; LE-NEXT:    stp w8, w0, [x1]
+; LE-NEXT:    ret
+;
+; BE-LABEL: be_i64_to_i32_order:
+; BE:       // %bb.0:
+; BE-NEXT:    str x0, [x1]
+; BE-NEXT:    ret
   %sh1 = lshr i64 %x, 32
   %t0 = trunc i64 %x to i32
   %t1 = trunc i64 %sh1 to i32
@@ -611,6 +671,8 @@
   ret void
 }
 
+; Negative test - not consecutive addresses
+
 define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) {
 ; CHECK-LABEL: i64_to_i32_wrong_addr:
 ; CHECK:       // %bb.0:
@@ -627,6 +689,8 @@
   ret void
 }
 
+; Negative test - addresses don't line up with shift amounts
+
 define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) {
 ; CHECK-LABEL: i64_to_i16_wrong_order:
 ; CHECK:       // %bb.0:
@@ -655,6 +719,8 @@
   ret void
 }
 
+; Negative test - no store of 't1'
+
 define void @i32_to_i8_incomplete(i32 %x, i8* %p0) {
 ; CHECK-LABEL: i32_to_i8_incomplete:
 ; CHECK:       // %bb.0:
@@ -680,6 +746,8 @@
   ret void
 }
 
+; Negative test - no store of 't3'
+
 define void @i64_to_i8_incomplete(i64 %x, i8* %p0) {
 ; CHECK-LABEL: i64_to_i8_incomplete:
 ; CHECK:       // %bb.0:
@@ -729,6 +797,8 @@
   ret void
 }
 
+; Negative test - not consecutive addresses
+
 define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) {
 ; CHECK-LABEL: i32_to_i16_wrong_addr:
 ; CHECK:       // %bb.0:
@@ -745,6 +815,8 @@
   ret void
 }
 
+; Negative test - addresses don't line up with shift amounts
+
 define void @i32_to_i8_wrong_order(i32 %x, i8* %p0) {
 ; CHECK-LABEL: i32_to_i8_wrong_order:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll
--- a/llvm/test/CodeGen/X86/stores-merging.ll
+++ b/llvm/test/CodeGen/X86/stores-merging.ll
@@ -468,9 +468,7 @@
 define void @trunc_i32_to_i16(i32 %x, i16* %p) {
 ; CHECK-LABEL: trunc_i32_to_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movw %di, (%rsi)
-; CHECK-NEXT:    shrl $16, %edi
-; CHECK-NEXT:    movw %di, 2(%rsi)
+; CHECK-NEXT:    movl %edi, (%rsi)
 ; CHECK-NEXT:    retq
   %t1 = trunc i32 %x to i16
   %sh = lshr i32 %x, 16
@@ -522,15 +520,7 @@
 define void @trunc_i64_to_i16(i64 %x, i16* %p) {
 ; CHECK-LABEL: trunc_i64_to_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    movw %di, (%rsi)
-; CHECK-NEXT:    shrq $16, %rdi
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:    shrq $48, %rcx
-; CHECK-NEXT:    movw %di, 2(%rsi)
-; CHECK-NEXT:    movw %ax, 4(%rsi)
-; CHECK-NEXT:    movw %cx, 6(%rsi)
+; CHECK-NEXT:    movq %rdi, (%rsi)
 ; CHECK-NEXT:    retq
   %t1 = trunc i64 %x to i16
   %sh1 = lshr i64 %x, 16
@@ -552,9 +542,7 @@
 define void @trunc_i64_to_i32(i64 %x, i32* %p) {
 ; CHECK-LABEL: trunc_i64_to_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, (%rsi)
-; CHECK-NEXT:    shrq $32, %rdi
-; CHECK-NEXT:    movl %edi, 4(%rsi)
+; CHECK-NEXT:    movq %rdi, (%rsi)
 ; CHECK-NEXT:    retq
   %t1 = trunc i64 %x to i32
   %sh = lshr i64 %x, 32
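
For readers of the checkOffsets change above: the narrow stores merge only when their byte offsets form one contiguous run that matches the target's native byte order, or (for i8 pieces only) the exact reverse, in which case a BSWAP of the wide value is emitted before the single store. The snippet below is a standalone sketch of that matching rule, not the DAGCombiner code itself; the function and variable names are hypothetical.

// Standalone sketch (hypothetical names): offsets[i] holds the byte offset,
// relative to the lowest store address, of the store that receives piece i of
// the wide value, where piece 0 is the least-significant piece.
#include <cstdint>
#include <iostream>
#include <vector>

static bool offsetsMatch(const std::vector<int64_t> &offsets,
                         int64_t pieceBytes, int64_t firstOffset,
                         bool matchLittleEndian) {
  const size_t numPieces = offsets.size();
  for (size_t i = 0; i != numPieces; ++i) {
    // Little-endian layout: piece i lands at firstOffset + i * pieceBytes.
    // Big-endian layout: the pieces appear in reverse order.
    size_t slot = matchLittleEndian ? i : numPieces - 1 - i;
    if (offsets[slot] != firstOffset + static_cast<int64_t>(i) * pieceBytes)
      return false;
  }
  return true;
}

int main() {
  // Four i16 pieces of an i64 stored at offsets 0,2,4,6: the little-endian
  // layout, so a little-endian target can merge them into one 64-bit store.
  std::vector<int64_t> le = {0, 2, 4, 6};
  std::cout << offsetsMatch(le, 2, 0, /*matchLittleEndian=*/true) << '\n';  // 1

  // The reversed layout matches only the big-endian check; the patch merges
  // that case on the opposite-endian target only for 8-bit pieces, where a
  // bswap of the wide value fixes up the order.
  std::vector<int64_t> be = {6, 4, 2, 0};
  std::cout << offsetsMatch(be, 2, 0, /*matchLittleEndian=*/true) << '\n';  // 0
  std::cout << offsetsMatch(be, 2, 0, /*matchLittleEndian=*/false) << '\n'; // 1
  return 0;
}

This mirrors the updated test expectations: each le_* function collapses to a single wide store under the LE prefix, and each be_* function does so under the BE prefix, while the mixed-order and incomplete cases stay as separate narrow stores.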