Index: llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -128,6 +128,7 @@
 #include "RISCVGenDAGISel.inc"
 
 private:
+  bool doPeepholeLoadStoreADDI(SDNode *Node);
   bool doPeepholeSExtW(SDNode *Node);
   bool doPeepholeMaskedRVV(SDNode *Node);
 };
Index: llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -146,6 +146,7 @@
       continue;
 
     MadeChange |= doPeepholeSExtW(N);
+    MadeChange |= doPeepholeLoadStoreADDI(N);
     MadeChange |= doPeepholeMaskedRVV(N);
   }
 
@@ -155,6 +156,48 @@
     CurDAG->RemoveDeadNodes();
 }
 
+// Returns true if N is a MachineSDNode that has a register base and a
+// constant-zero memory offset. The indices of the base pointer and offset
+// operands are returned in BaseOpIdx and OffsetOpIdx.
+static bool hasConstantZeroMemOffset(SDNode *N, unsigned &BaseOpIdx,
+                                     unsigned &OffsetOpIdx) {
+  if (!N->isMachineOpcode())
+    return false;
+
+  switch (N->getMachineOpcode()) {
+  case RISCV::LB:
+  case RISCV::LH:
+  case RISCV::LW:
+  case RISCV::LBU:
+  case RISCV::LHU:
+  case RISCV::LWU:
+  case RISCV::LD:
+  case RISCV::FLH:
+  case RISCV::FLW:
+  case RISCV::FLD:
+    BaseOpIdx = 0;
+    OffsetOpIdx = 1;
+    break;
+  case RISCV::SB:
+  case RISCV::SH:
+  case RISCV::SW:
+  case RISCV::SD:
+  case RISCV::FSH:
+  case RISCV::FSW:
+  case RISCV::FSD:
+    BaseOpIdx = 1;
+    OffsetOpIdx = 2;
+    break;
+  default:
+    return false;
+  }
+
+  if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
+    return false;
+
+  return (N->getConstantOperandVal(OffsetOpIdx) == 0);
+}
+
 static SDNode *selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL,
                             const MVT VT, RISCVMatInt::InstSeq &Seq) {
   SDNode *Result = nullptr;
@@ -1821,8 +1864,8 @@
   return true;
 }
 
-// Is this ADD instruction only used as the base pointer of scalar loads and
-// stores?
+// Is this ADD/ADD_LO instruction only used as the base pointer of scalar
+// loads and stores?
 static bool isWorthFoldingAdd(SDValue Add) {
   for (auto Use : Add->uses()) {
     if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
@@ -1853,8 +1896,12 @@
   SDLoc DL(Addr);
   MVT VT = Addr.getSimpleValueType();
 
-  if (Addr.getOpcode() == RISCVISD::ADD_LO) {
-    Base = Addr.getOperand(0);
+  // Select the Base and Offset from the ADD_LO, except in the case that the
+  // ADD_LO is used by a non-memory instruction (e.g. as a base to an add) and
+  // the compressed extension is present. In that case, leaving it separate
+  // may increase the chance of compressing the load/store.
+  if (Addr.getOpcode() == RISCVISD::ADD_LO && (!Subtarget->hasStdExtC() ||
+      isWorthFoldingAdd(Addr))) {
+    Base = Addr.getOperand(0);
     Offset = Addr.getOperand(1);
     return true;
   }
@@ -2336,6 +2383,46 @@
   return false;
 }
 
+// SelectAddrRegImm won't merge an ADD_LO into a memory operation if it has
+// uses that aren't scalar loads and stores. This will turn out to be a bad
+// decision if all those other uses end up being merged into memory
+// operations. This peephole folds the resulting ADDI back in when that is
+// the case.
+bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
+  unsigned OffsetOpIdx, BaseOpIdx;
+  if (!hasConstantZeroMemOffset(N, BaseOpIdx, OffsetOpIdx))
+    return false;
+
+  SDValue Base = N->getOperand(BaseOpIdx);
+  if (!Base.isMachineOpcode())
+    return false;
+  if (Base.getMachineOpcode() != RISCV::ADDI)
+    return false;
+  if (!isa<ConstantSDNode>(Base.getOperand(1)))
+    return false;
+  for (auto Use : Base->uses()) {
+    unsigned Dummy1, Dummy2;
+    if (!hasConstantZeroMemOffset(Use, Dummy1, Dummy2))
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+  LLVM_DEBUG(Base->dump(CurDAG));
+  LLVM_DEBUG(dbgs() << "\nN: ");
+  LLVM_DEBUG(N->dump(CurDAG));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BaseOpIdx == 0) { // Load
+    N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), Base.getOperand(1),
+                                   N->getOperand(2));
+  } else { // Store
+    N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+                                   Base.getOperand(1), N->getOperand(3));
+  }
+
+  return true;
+}
+
 // Try to remove sext.w if the input is a W instruction or can be made into
 // a W instruction cheaply.
 bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
Index: llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
===================================================================
--- llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefix=RV32 %s
+; RUN:   | FileCheck -check-prefixes=RV32,RV32I %s
+; RUN: llc -mtriple=riscv32 -mattr=+c -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV32,RV32C %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefix=RV64 %s
+; RUN:   | FileCheck -check-prefixes=RV64,RV64I %s
+; RUN: llc -mtriple=riscv64 -mattr=+c -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV64,RV64C %s
 
 ; We can often fold an ADDI into the offset of load/store instructions:
 ;   (load (addi base, off1), off2) -> (load base, off1+off2)
@@ -37,13 +41,21 @@
 }
 
 define dso_local i64 @load_g_1() nounwind {
-; RV32-LABEL: load_g_1:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a1, %hi(g_1)
-; RV32-NEXT:    lw a0, %lo(g_1)(a1)
-; RV32-NEXT:    addi a1, a1, %lo(g_1)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_g_1:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, %hi(g_1)
+; RV32I-NEXT:    lw a0, %lo(g_1)(a1)
+; RV32I-NEXT:    addi a1, a1, %lo(g_1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_g_1:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_1)
+; RV32C-NEXT:    addi a1, a0, %lo(g_1)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_g_1:
 ; RV64:       # %bb.0: # %entry
@@ -56,13 +68,21 @@
 }
 
 define dso_local i64 @load_g_2() nounwind {
-; RV32-LABEL: load_g_2:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a1, %hi(g_2)
-; RV32-NEXT:    lw a0, %lo(g_2)(a1)
-; RV32-NEXT:    addi a1, a1, %lo(g_2)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_g_2:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, %hi(g_2)
+; RV32I-NEXT:    lw a0, %lo(g_2)(a1)
+; RV32I-NEXT:    addi a1, a1, %lo(g_2)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_g_2:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_2)
+; RV32C-NEXT:    addi a1, a0, %lo(g_2)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_g_2:
 ; RV64:       # %bb.0: # %entry
@@ -75,13 +95,21 @@
 }
 
 define dso_local i64 @load_g_4() nounwind {
-; RV32-LABEL: load_g_4:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a1, %hi(g_4)
-; RV32-NEXT:    lw a0, %lo(g_4)(a1)
-; RV32-NEXT:    addi a1, a1, %lo(g_4)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_g_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a1, %hi(g_4)
+; RV32I-NEXT:    lw a0, %lo(g_4)(a1)
+; RV32I-NEXT:    addi a1, a1, %lo(g_4)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_g_4:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_4)
+; RV32C-NEXT:    addi a1, a0, %lo(g_4)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_g_4:
 ; RV64:       # %bb.0: # %entry
@@ -130,13 +158,21 @@
 }
 
 define dso_local void @store_g_4() nounwind {
-; RV32-LABEL: store_g_4:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a0, %hi(g_4)
-; RV32-NEXT:    sw zero, %lo(g_4)(a0)
-; RV32-NEXT:    addi a0, a0, %lo(g_4)
-; RV32-NEXT:    sw zero, 4(a0)
-; RV32-NEXT:    ret
+; RV32I-LABEL: store_g_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %hi(g_4)
+; RV32I-NEXT:    sw zero, %lo(g_4)(a0)
+; RV32I-NEXT:    addi a0, a0, %lo(g_4)
+; RV32I-NEXT:    sw zero, 4(a0)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: store_g_4:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(g_4)
+; RV32C-NEXT:    addi a0, a0, %lo(g_4)
+; RV32C-NEXT:    sw zero, 4(a0)
+; RV32C-NEXT:    sw zero, 0(a0)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: store_g_4:
 ; RV64:       # %bb.0: # %entry
@@ -244,23 +280,41 @@
 @ga32 = dso_local global [4 x i32] zeroinitializer, align 4
 
 define dso_local i32 @load_ga32_multi() nounwind {
-; RV32-LABEL: load_ga32_multi:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a0, %hi(ga32)
-; RV32-NEXT:    lw a1, %lo(ga32)(a0)
-; RV32-NEXT:    addi a0, a0, %lo(ga32)
-; RV32-NEXT:    lw a0, 4(a0)
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_ga32_multi:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %hi(ga32)
+; RV32I-NEXT:    lw a1, %lo(ga32)(a0)
+; RV32I-NEXT:    addi a0, a0, %lo(ga32)
+; RV32I-NEXT:    lw a0, 4(a0)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: load_ga32_multi:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    lui a0, %hi(ga32)
-; RV64-NEXT:    lw a1, %lo(ga32)(a0)
-; RV64-NEXT:    addi a0, a0, %lo(ga32)
-; RV64-NEXT:    lw a0, 4(a0)
-; RV64-NEXT:    addw a0, a1, a0
-; RV64-NEXT:    ret
+; RV32C-LABEL: load_ga32_multi:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %hi(ga32)
+; RV32C-NEXT:    addi a0, a0, %lo(ga32)
+; RV32C-NEXT:    lw a1, 0(a0)
+; RV32C-NEXT:    lw a0, 4(a0)
+; RV32C-NEXT:    add a0, a0, a1
+; RV32C-NEXT:    ret
+;
+; RV64I-LABEL: load_ga32_multi:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    lui a0, %hi(ga32)
+; RV64I-NEXT:    lw a1, %lo(ga32)(a0)
+; RV64I-NEXT:    addi a0, a0, %lo(ga32)
+; RV64I-NEXT:    lw a0, 4(a0)
+; RV64I-NEXT:    addw a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64C-LABEL: load_ga32_multi:
+; RV64C:       # %bb.0: # %entry
+; RV64C-NEXT:    lui a0, %hi(ga32)
+; RV64C-NEXT:    addi a0, a0, %lo(ga32)
+; RV64C-NEXT:    lw a1, 0(a0)
+; RV64C-NEXT:    lw a0, 4(a0)
+; RV64C-NEXT:    addw a0, a0, a1
+; RV64C-NEXT:    ret
 entry:
   %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @ga32, i32 0, i32 0)
   %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @ga32, i32 0, i32 1)
@@ -274,14 +328,23 @@
 @tl_4 = dso_local thread_local global i64 0, align 8
 @tl_8 = dso_local thread_local global i64 0, align 8
 
 define dso_local i64 @load_tl_4() nounwind {
-; RV32-LABEL: load_tl_4:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lui a0, %tprel_hi(tl_4)
-; RV32-NEXT:    add a1, a0, tp, %tprel_add(tl_4)
-; RV32-NEXT:    lw a0, %tprel_lo(tl_4)(a1)
-; RV32-NEXT:    addi a1, a1, %tprel_lo(tl_4)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    ret
+; RV32I-LABEL: load_tl_4:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a0, %tprel_hi(tl_4)
+; RV32I-NEXT:    add a1, a0, tp, %tprel_add(tl_4)
+; RV32I-NEXT:    lw a0, %tprel_lo(tl_4)(a1)
+; RV32I-NEXT:    addi a1, a1, %tprel_lo(tl_4)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32C-LABEL: load_tl_4:
+; RV32C:       # %bb.0: # %entry
+; RV32C-NEXT:    lui a0, %tprel_hi(tl_4)
+; RV32C-NEXT:    add a0, a0, tp, %tprel_add(tl_4)
+; RV32C-NEXT:    addi a1, a0, %tprel_lo(tl_4)
+; RV32C-NEXT:    lw a0, 0(a1)
+; RV32C-NEXT:    lw a1, 4(a1)
+; RV32C-NEXT:    ret
 ;
 ; RV64-LABEL: load_tl_4:
 ; RV64:       # %bb.0: # %entry
Index: llvm/test/CodeGen/RISCV/global-merge-minsize.ll
===================================================================
--- llvm/test/CodeGen/RISCV/global-merge-minsize.ll
+++ llvm/test/CodeGen/RISCV/global-merge-minsize.ll
@@ -25,10 +25,6 @@
   ret void
 }
 
-; TODO: It would be better for code size to alter the first store below by
-; first fully materialising .L_MergedGlobals in a1 and then storing to it with
-; a 0 offset.
-
 define void @f2(i32 %a) nounwind minsize optsize {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
Index: llvm/test/CodeGen/RISCV/global-merge-offset.ll
===================================================================
--- llvm/test/CodeGen/RISCV/global-merge-offset.ll
+++ llvm/test/CodeGen/RISCV/global-merge-offset.ll
@@ -15,9 +15,6 @@
 @ga2 = dso_local global [ArrSize x i32] zeroinitializer, align 4
 @gi = dso_local global i32 0, align 4
 
-; TODO: It would be better for codesize if the final store below was
-; `sw a0, 0(a2)`.
-
 define void @f1(i32 %a) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
Index: llvm/test/CodeGen/RISCV/global-merge.ll
===================================================================
--- llvm/test/CodeGen/RISCV/global-merge.ll
+++ llvm/test/CodeGen/RISCV/global-merge.ll
@@ -10,10 +10,6 @@
 @eg1 = dso_local global i32 0, align 4
 @eg2 = dso_local global i32 0, align 4
 
-; TODO: It would be better for code size to alter the first store below by
-; first fully materialising .L_MergedGlobals in a1 and then storing to it with
-; a 0 offset.
-
 define void @f1(i32 %a) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
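
A possible follow-up test, not included in this patch: a store-side counterpart to load_ga32_multi would exercise the new ADD_LO handling and the ADDI peephole when every user of the address is a store. The sketch below is hypothetical; the @sa32 global and @store_sa32_multi name are invented for illustration, and real CHECK lines would need to be generated with utils/update_llc_test_checks.py rather than written by hand.

; Hypothetical extra case in the style of fold-addi-loadstore.ll. With
; -mattr=+c the two stores are expected to share one materialised address
; (lui+addi) and use 0/4 offsets, mirroring the store_g_4 checks above.
@sa32 = dso_local global [4 x i32] zeroinitializer, align 4

define dso_local void @store_sa32_multi(i32 %v) nounwind {
entry:
  store i32 %v, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @sa32, i32 0, i32 0)
  store i32 %v, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @sa32, i32 0, i32 1)
  ret void
}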