diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7556,6 +7556,36 @@
   if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64)
     return SDValue();
 
+  // Find the minimal offset into the wide source value.
+  uint64_t OffsetMin = UINT64_MAX;
+  for (auto *Store : Stores) {
+    // All the stores store different parts of the CombinedValue. A truncate is
+    // required to get the partial value.
+    SDValue Trunc = Store->getValue();
+    if (Trunc.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+    // Other than the first/last part, a shift operation is required to get the
+    // offset.
+    uint64_t Offset = 0;
+    SDValue WideVal = Trunc.getOperand(0);
+    if ((WideVal.getOpcode() == ISD::SRL || WideVal.getOpcode() == ISD::SRA) &&
+        isa<ConstantSDNode>(WideVal.getOperand(1))) {
+      // The shift amount must be a constant multiple of the narrow type.
+      // It is translated to the offset address in the wide source value "y".
+      //
+      // x = srl y, ShiftAmtC
+      // i8 z = trunc x
+      // store z, ...
+      uint64_t ShiftAmtC = WideVal.getConstantOperandVal(1);
+      if (ShiftAmtC % NarrowNumBits != 0)
+        return SDValue();
+
+      Offset = ShiftAmtC / NarrowNumBits;
+    }
+    if (Offset < OffsetMin)
+      OffsetMin = Offset;
+  }
+
   // Check if all bytes of the source value that we are looking at are stored
   // to the same base address. Collect offsets from Base address into OffsetMap.
   SDValue SourceValue;
@@ -7585,7 +7615,7 @@
       if (ShiftAmtC % NarrowNumBits != 0)
         return SDValue();
 
-      Offset = ShiftAmtC / NarrowNumBits;
+      Offset = ShiftAmtC / NarrowNumBits - OffsetMin;
       WideVal = WideVal.getOperand(0);
     }
 
@@ -7620,7 +7650,7 @@
     }
     // Map the offset in the store and the offset in the combined value, and
    // early return if it has been set before.
-    if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
+    if (Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
      return SDValue();
    OffsetMap[Offset] = ByteOffsetFromBase;
  }
@@ -7633,8 +7663,19 @@
   bool Fast = false;
   bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT,
                                         *FirstStore->getMemOperand(), &Fast);
-  if (!Allowed || !Fast)
-    return SDValue();
+  bool Use2St16 = false;
+  if (!Allowed || !Fast) {
+    // Is it OK to use two i16 stores for this 4-byte store?
+    if (NumStores == 4 && WideVT == MVT::i32) {
+      Allowed = TLI.allowsMemoryAccess(Context, Layout, MVT::i16,
+                                       *FirstStore->getMemOperand(), &Fast);
+      if (!Allowed || !Fast)
+        return SDValue();
+      Use2St16 = true;
+    } else {
+      return SDValue();
+    }
+  }
 
   // Check if the pieces of the value are going to the expected places in memory
   // to merge the stores.
@@ -7665,15 +7706,20 @@
   }
 
   SDLoc DL(N);
+  if (OffsetMin > 0) {
+    // The stores only cover the upper part of the source value, so shift it
+    // down before truncating.
+    EVT Typ = SourceValue.getValueType();
+    SDValue ShiftAmt = DAG.getConstant(OffsetMin * NarrowNumBits, DL, Typ);
+    SourceValue = DAG.getNode(ISD::SRL, DL, Typ, SourceValue, ShiftAmt);
+  }
   if (WideVT != SourceValue.getValueType()) {
     assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits &&
            "Unexpected store value to merge");
     SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
   }
-
-  // Before legalize we can introduce illegal bswaps/rotates which will be later
-  // converted to an explicit bswap sequence. This way we end up with a single
-  // store and byte shuffling instead of several stores and byte shuffling.
+  // Before legalize we can introduce illegal bswaps/rotates which will be
+  // later converted to an explicit bswap sequence.
+  // This way we end up with a single store and byte shuffling instead of
+  // several stores and byte shuffling.
   if (NeedBswap) {
     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
   } else if (NeedRotate) {
@@ -7682,6 +7728,26 @@
     SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
   }
 
+  if (Use2St16) {
+    // Split the i32 source into two i16 halves: the low half is stored at
+    // the base address and the high half at base + 2 bytes.
+    SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+    SDValue SourceValueL =
+        DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValue);
+    SDValue SourceValueH =
+        DAG.getNode(ISD::SRL, DL, MVT::i32, SourceValue, ShiftAmt);
+    SourceValueH = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, SourceValueH);
+
+    SDValue NewSt16L =
+        DAG.getStore(Chain, DL, SourceValueL, FirstStore->getBasePtr(),
+                     FirstStore->getPointerInfo(), FirstStore->getAlign());
+    // The high half lives 2 bytes (not 16) above the base address.
+    SDValue AddrH =
+        DAG.getNode(ISD::ADD, DL, MVT::i32, FirstStore->getBasePtr(),
+                    DAG.getConstant(2, DL, MVT::i32));
+    SDValue NewSt16H =
+        DAG.getStore(NewSt16L, DL, SourceValueH, AddrH,
+                     FirstStore->getPointerInfo().getWithOffset(2),
+                     commonAlignment(FirstStore->getAlign(), 2));
+    return NewSt16H;
+  }
+
   SDValue NewStore =
       DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
                    FirstStore->getPointerInfo(), FirstStore->getAlign());
diff --git a/llvm/test/CodeGen/RISCV/store-combine.ll b/llvm/test/CodeGen/RISCV/store-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/store-combine.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32
+
+%struct.ab = type { i8, i8, i8, i8 }
+
+define void @store_0(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store_0:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a2, a1, 16
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    sh a2, 2(a0)
+; RV32-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2, !tbaa !4
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1, !tbaa !8
+  %shr3 = lshr i32 %v, 16
+  %conv5 = trunc i32 %shr3 to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv5, i8* %c, align 2, !tbaa !9
+  %shr6 = lshr i32 %v, 24
+  %conv8 = trunc i32 %shr6 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv8, i8* %d, align 1, !tbaa !10
+  ret void
+}
+
+define void @store_1(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store_1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 0(a0)
+; RV32-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %a = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 0
+  store i8 %conv, i8* %a, align 2, !tbaa !4
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %b = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 1
+  store i8 %conv2, i8* %b, align 1, !tbaa !8
+  ret void
+}
+
+define void @store3(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a1, a1, 16
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+  %shr = lshr i32 %v, 16
+  %conv = trunc i32 %shr to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2, !tbaa !9
+  %shr1 = lshr i32 %v, 24
+  %conv3 = trunc i32 %shr1 to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv3, i8* %d, align 1, !tbaa !10
+  ret void
+}
+
+define void @store4(%struct.ab* nocapture noundef writeonly %_ab, i32 noundef %v) {
+; RV32-LABEL: store4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sh a1, 2(a0)
+; RV32-NEXT:    ret
+  %conv = trunc i32 %v to i8
+  %c = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 2
+  store i8 %conv, i8* %c, align 2, !tbaa !9
+  %shr = lshr i32 %v, 8
+  %conv2 = trunc i32 %shr to i8
+  %d = getelementptr inbounds %struct.ab, %struct.ab* %_ab, i32 0, i32 3
+  store i8 %conv2, i8* %d, align 1, !tbaa !10
+  ret void
+}
+
+!4 = !{!5, !6, i64 0}
+!5 = !{!"ab", !6, i64 0, !6, i64 1, !6, i64 2, !6, i64 3}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!5, !6, i64 1}
+!9 = !{!5, !6, i64 2}
+!10 = !{!5, !6, i64 3}
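
For reviewers, here is a C-level sketch of the patterns the new tests exercise. It is reconstructed from the test IR and its TBAA metadata, not taken from the patch itself: the struct layout and function names mirror the test file, and everything else is an assumption.

// Hypothetical C source corresponding to @store_0 and @store3 above.
struct ab {
  char a, b, c, d; // 4 bytes, 2-byte aligned, matching %struct.ab
};

// All four bytes of v are stored. On rv32 a single i32 store at align 2
// is not a fast legal access, so the combine previously bailed out; with
// Use2St16 it now emits two sh instructions at offsets 0 and 2.
void store_0(struct ab *p, unsigned v) {
  p->a = (char)v;
  p->b = (char)(v >> 8);
  p->c = (char)(v >> 16);
  p->d = (char)(v >> 24);
}

// Only the upper two bytes are stored, so OffsetMin == 2: the combiner
// shifts v right by 16 and merges the pair into one sh at offset 2,
// instead of rejecting value offsets that do not start at 0.
void store3(struct ab *p, unsigned v) {
  p->c = (char)(v >> 16);
  p->d = (char)(v >> 24);
}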