diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7755,7 +7755,7 @@
   // Check if all bytes of the source value that we are looking at are stored
   // to the same base address. Collect offsets from Base address into OffsetMap.
   SDValue SourceValue;
-  SmallVector<int64_t, 8> OffsetMap(NumStores, INT64_MAX);
+  std::map<int64_t, int64_t> OffsetMap;
   int64_t FirstOffset = INT64_MAX;
   StoreSDNode *FirstStore = nullptr;
   Optional<BaseIndexOffset> Base;
@@ -7814,9 +7814,8 @@
       FirstStore = Store;
       FirstOffset = ByteOffsetFromBase;
     }
-    // Map the offset in the store and the offset in the combined value, and
-    // early return if it has been set before.
-    if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
+    // Map the offset in the store to the offset in the combined value, and
+    // return early if this byte offset has been seen before.
+    if (OffsetMap.count(Offset) != 0)
       return SDValue();
     OffsetMap[Offset] = ByteOffsetFromBase;
   }
@@ -7824,25 +7823,49 @@
   assert(FirstOffset != INT64_MAX && "First byte offset must be set");
   assert(FirstStore && "First store must be set");
 
+  // Check that the byte positions taken from the source value are consecutive.
+  unsigned PreShift = INT32_MAX;
+  for (auto Iter : OffsetMap) {
+    if (PreShift != INT32_MAX) {
+      if (Iter.first - PreShift != 1)
+        return SDValue();
+    }
+    PreShift = Iter.first;
+  }
+
   // Check that a store of the wide type is both allowed and fast on the target
   const DataLayout &Layout = DAG.getDataLayout();
   bool Fast = false;
   bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT,
                                         *FirstStore->getMemOperand(), &Fast);
-  if (!Allowed || !Fast)
-    return SDValue();
+  bool Use2St16 = false;
+  if (!Allowed || !Fast) {
+    // Is it OK to use two i16 stores instead for this 4-byte store?
+    if (NumStores == 4 && WideVT == MVT::i32) {
+      Allowed = TLI.allowsMemoryAccess(Context, Layout, MVT::i16,
                                        *FirstStore->getMemOperand(), &Fast);
+      if (!Allowed || !Fast)
+        return SDValue();
+      Use2St16 = true;
+    } else {
+      return SDValue();
+    }
+  }
 
   // Check if the pieces of the value are going to the expected places in memory
   // to merge the stores.
-  auto checkOffsets = [&](bool MatchLittleEndian) {
+  auto CheckOffsets = [&](bool MatchLittleEndian) {
+    unsigned I = 0;
     if (MatchLittleEndian) {
-      for (unsigned i = 0; i != NumStores; ++i)
-        if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
+      for (auto Iter = OffsetMap.begin(); Iter != OffsetMap.end(); Iter++) {
+        if (Iter->second != I++ * (NarrowNumBits / 8) + FirstOffset)
           return false;
-    } else { // MatchBigEndian by reversing loop counter.
-      for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
-        if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
+      }
+    } else { // MatchBigEndian by iterating in reverse.
+      for (auto Iter = OffsetMap.rbegin(); Iter != OffsetMap.rend(); Iter++) {
+        if (Iter->second != I++ * (NarrowNumBits / 8) + FirstOffset)
           return false;
+      }
     }
     return true;
   };
@@ -7850,26 +7873,32 @@
   // Check if the offsets line up for the native data layout of this target.
   bool NeedBswap = false;
   bool NeedRotate = false;
-  if (!checkOffsets(Layout.isLittleEndian())) {
+  if (!CheckOffsets(Layout.isLittleEndian())) {
     // Special-case: check if byte offsets line up for the opposite endian.
-    if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian()))
+    if (NarrowNumBits == 8 && CheckOffsets(Layout.isBigEndian()))
       NeedBswap = true;
-    else if (NumStores == 2 && checkOffsets(Layout.isBigEndian()))
+    else if (NumStores == 2 && CheckOffsets(Layout.isBigEndian()))
       NeedRotate = true;
     else
       return SDValue();
   }
 
   SDLoc DL(N);
+  unsigned FirstShift = OffsetMap.begin()->first;
+  if (FirstShift > 0) {
+    EVT Typ = SourceValue.getValueType();
+    SDValue ShiftAmt = DAG.getConstant(FirstShift * NarrowNumBits, DL, Typ);
+    SourceValue = DAG.getNode(ISD::SRL, DL, Typ, SourceValue, ShiftAmt);
+  }
   if (WideVT != SourceValue.getValueType()) {
     assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits &&
            "Unexpected store value to merge");
     SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
   }
-
-  // Before legalize we can introduce illegal bswaps/rotates which will be later
-  // converted to an explicit bswap sequence. This way we end up with a single
-  // store and byte shuffling instead of several stores and byte shuffling.
+  // Before legalize we can introduce illegal bswaps/rotates which will be
+  // later converted to an explicit bswap sequence. This way we end up with a
+  // single store and byte shuffling instead of several stores and byte
+  // shuffling.
   if (NeedBswap) {
     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
   } else if (NeedRotate) {
@@ -7878,6 +7907,27 @@
     SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
   }
 
+  if (Use2St16) {
+    SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+    SDValue SourceValueL =
+        DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValue);
+    SDValue SourceValueH =
+        DAG.getNode(ISD::SRL, DL, MVT::i32, SourceValue, ShiftAmt);
+    SourceValueH = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValueH);
+
+    SDValue NewSt16L =
+        DAG.getStore(Chain, DL, SourceValueL, FirstStore->getBasePtr(),
+                     FirstStore->getPointerInfo(), FirstStore->getAlign());
+    SDValue Baseptr = FirstStore->getBasePtr();
+    EVT BaseptrType = Baseptr.getValueType();
+    SDValue AddrH = DAG.getNode(ISD::ADD, DL, BaseptrType, Baseptr,
+                                DAG.getConstant(2, DL, BaseptrType));
+    SDValue NewSt16H = DAG.getStore(
+        NewSt16L, DL, SourceValueH, AddrH,
+        FirstStore->getPointerInfo().getWithOffset(2), FirstStore->getAlign());
+    return NewSt16H;
+  }
+
   SDValue NewStore =
       DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
                    FirstStore->getPointerInfo(), FirstStore->getAlign());
diff --git a/llvm/test/CodeGen/RISCV/store-combine.ll b/llvm/test/CodeGen/RISCV/store-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/store-combine.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64
+
+; The IR below is derived from the following C code:
+;struct ab {
+;  char a;
+;  char b;
+;  char c;
+;  char d;
+;} __attribute__((aligned(2)));
+;
+;void store0(struct ab *_ab, unsigned v) {
+;  _ab->a = v & 0xff;
+;  _ab->b = (v >> 8) & 0xff;
+;  _ab->c = (v >> 16) & 0xff;
+;  _ab->d = (v >> 24) & 0xff;
+;}
+;void store1(struct ab *_ab, unsigned v) {
+;  _ab->a = v & 0xff;
+;  _ab->b = (v >> 8) & 0xff;
+;}
+;void store2(struct ab *_ab, unsigned v) {
+;  _ab->c = (v >> 16) & 0xff;
+;  _ab->d = (v >> 24) & 0xff;
+;}
+;void store3(struct ab *_ab, unsigned v) {
+;  _ab->c = v & 0xff;
+;  _ab->d = (v >> 8) & 0xff;
+;}
+; The stores in store4 must not be combined: the stored bytes come from
+; (v >> 8) and (v >> 24), which are not adjacent bytes of v, so they cannot
+; be rewritten as a single (short)(v >> 8) store.
+;void store4(struct ab *_ab, unsigned v) {
+;  _ab->c = (v >> 8) & 0xff;
+;  _ab->d = (v >> 24) & 0xff;
+;}
+
+%struct.ab = type { i8, i8, i8, i8 }
+
+define void @store_0(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store_0:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    sh a2, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store_0:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sh a1, 0(a0)
+; RV64-NEXT:    srli a1, a1, 16
+; RV64-NEXT:    sh a1, 2(a0)
+; RV64-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1
+  %shr3 = lshr i32 %v, 16
+  %conv5 = trunc i32 %shr3 to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv5, i8* %c, align 2
+  %shr6 = lshr i32 %v, 24
+  %conv8 = trunc i32 %shr6 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv8, i8* %d, align 1
+  ret void
+}
+
+define void @store_1(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store_1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store_1:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sh a1, 0(a0)
+; RV64-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1
+  ret void
+}
+
+define void @store2(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a1, a1, 16
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    srli a1, a1, 16
+; RV64-NEXT:    sh a1, 2(a0)
+; RV64-NEXT:    ret
+  %shr = lshr i32 %v, 16
+  %conv = trunc i32 %shr to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2
+  %shr1 = lshr i32 %v, 24
+  %conv3 = trunc i32 %shr1 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv3, i8* %d, align 1
+  ret void
+}
+
+define void @store3(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sh a1, 2(a0)
+; RV64-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv2, i8* %d, align 1
+  ret void
+}
+
+; The stores in store4 must not be combined.
+define dso_local void @store4(%struct.ab* %_ab, i32 %v) {
+; RV32-LABEL: store4:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    srli a2, a1, 8
+; RV32-NEXT:    sb a2, 2(a0)
+; RV32-NEXT:    srli a1, a1, 24
+; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: store4:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    srli a2, a1, 8
+; RV64-NEXT:    sb a2, 2(a0)
+; RV64-NEXT:    srli a1, a1, 24
+; RV64-NEXT:    sb a1, 3(a0)
+; RV64-NEXT:    ret
+entry:
+  %shr = lshr i32 %v, 8
+  %conv = trunc i32 %shr to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2
+  %shr1 = lshr i32 %v, 24
+  %conv3 = trunc i32 %shr1 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv3, i8* %d, align 1
+  ret void
+}
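
Note for readers: the following is a minimal standalone C++ sketch, not part of the patch, of the two pieces of logic the DAGCombiner change adds. The helper names offsetsAreConsecutive and splitWideValue are illustrative only. It mirrors the consecutive-offset check performed on the std::map keys and the fallback that splits a combined 32-bit value into the two i16 halves stored at byte offsets 0 and 2 from the base address.

// sketch.cpp -- illustration only, not part of the patch.
#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

// Mirrors the new check in mergeTruncStores: the byte indices collected as
// map keys must form a contiguous run, otherwise the stores are not merged.
static bool offsetsAreConsecutive(const std::map<int64_t, int64_t> &OffsetMap) {
  bool HavePrev = false;
  int64_t Prev = 0;
  for (const auto &Entry : OffsetMap) {
    if (HavePrev && Entry.first - Prev != 1)
      return false;
    Prev = Entry.first;
    HavePrev = true;
  }
  return true;
}

// Mirrors the Use2St16 fallback: split a 4-byte combined value into a low
// half (stored at the base address) and a high half (stored at base + 2).
static std::pair<uint16_t, uint16_t> splitWideValue(uint32_t Wide) {
  return {static_cast<uint16_t>(Wide & 0xffff),
          static_cast<uint16_t>(Wide >> 16)};
}

int main() {
  // store0 in the test: bytes 0..3 of v are stored, so the run is contiguous.
  assert(offsetsAreConsecutive({{0, 0}, {1, 1}, {2, 2}, {3, 3}}));
  // store4 in the test: only bytes 1 and 3 of v are stored, so no combine.
  assert(!offsetsAreConsecutive({{1, 2}, {3, 3}}));
  // 0x0d0c0b0a splits into 0x0b0a (low half) and 0x0d0c (high half).
  assert(splitWideValue(0x0d0c0b0aU) ==
         std::make_pair(uint16_t(0x0b0a), uint16_t(0x0d0c)));
  return 0;
}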