diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -125,6 +125,8 @@ return false; return true; } + bool isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const override; private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -17,6 +17,7 @@ #include "RISCVRegisterInfo.h" #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" +#include "Utils/RISCVMatInt.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" @@ -854,6 +855,50 @@ return SDValue(); } +bool RISCVTargetLowering::isDesirableToCommuteWithShift( + const SDNode *N, CombineLevel Level) const { + // The following folds are only desirable if `(OP _, c1 << c2)` can be + // materialised in fewer instructions than `(OP _, c1)`: + // + // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) + // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) + SDValue N0 = N->getOperand(0); + MVT Ty = N0.getSimpleValueType(); + if (Ty.isScalarInteger() && + (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) { + auto *C1 = dyn_cast(N0->getOperand(1)); + auto *C2 = dyn_cast(N->getOperand(1)); + if (C1 && C2) { + APInt C1Int = C1->getAPIntValue(); + APInt ShiftedC1Int = C1Int << C2->getAPIntValue(); + + // We can materialise `c1 << c2` into an add immediate, so it's "free", + // and the combine should happen, to potentially allow further combines + // later. + if (isLegalAddImmediate(ShiftedC1Int.getSExtValue())) + return true; + + // We can materialise `c1` in an add immediate, so it's "free", and the + // combine should be prevented. + if (isLegalAddImmediate(C1Int.getSExtValue())) + return false; + + // Neither constant will fit into an immediate, so find materialisation + // costs. + int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(), + Subtarget.is64Bit()); + int ShiftedC1Cost = RISCVMatInt::getIntMatCost( + ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit()); + + // Materialising `c1` is cheaper than materialising `c1 << c2`, so the + // combine should be prevented. + if (C1Cost < ShiftedC1Cost) + return false; + } + } + return true; +} + unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { diff --git a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h --- a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_RISCV_MATINT_H #define LLVM_LIB_TARGET_RISCV_MATINT_H +#include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/MachineValueType.h" #include @@ -30,6 +31,14 @@ // order to allow this helper to be used from both the MC layer and during // instruction selection. void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res); + +// Helper to estimate the number of instructions required to materialise the +// given immediate value into a register. This estimate does not account for +// `Val` possibly fitting into an immediate, and so may over-estimate. +// +// This will attempt to produce instructions to materialise `Val` as an +// `Size`-bit immediate. `IsRV64` should match the target architecture. +int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64); } // namespace RISCVMatInt } // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp --- a/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp @@ -16,7 +16,7 @@ namespace llvm { namespace RISCVMatInt { -void generateInstSeq(int64_t Val, bool Is64Bit, InstSeq &Res) { +void generateInstSeq(int64_t Val, bool IsRV64, InstSeq &Res) { if (isInt<32>(Val)) { // Depending on the active bits in the immediate Value v, the following // instruction sequences are emitted: @@ -32,13 +32,13 @@ Res.push_back(Inst(RISCV::LUI, Hi20)); if (Lo12 || Hi20 == 0) { - unsigned AddiOpc = (Is64Bit && Hi20) ? RISCV::ADDIW : RISCV::ADDI; + unsigned AddiOpc = (IsRV64 && Hi20) ? RISCV::ADDIW : RISCV::ADDI; Res.push_back(Inst(AddiOpc, Lo12)); } return; } - assert(Is64Bit && "Can't emit >32-bit imm for non-RV64 target"); + assert(IsRV64 && "Can't emit >32-bit imm for non-RV64 target"); // In the worst case, for a full 64-bit constant, a sequence of 8 instructions // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emmitted. Note @@ -68,11 +68,26 @@ int ShiftAmount = 12 + findFirstSet((uint64_t)Hi52); Hi52 = SignExtend64(Hi52 >> (ShiftAmount - 12), 64 - ShiftAmount); - generateInstSeq(Hi52, Is64Bit, Res); + generateInstSeq(Hi52, IsRV64, Res); Res.push_back(Inst(RISCV::SLLI, ShiftAmount)); if (Lo12) Res.push_back(Inst(RISCV::ADDI, Lo12)); } + +int getIntMatCost(const APInt &Val, unsigned Size, bool IsRV64) { + int PlatRegSize = IsRV64 ? 64 : 32; + + // Split the constant into platform register sized chunks, and calculate cost + // of each chunk. + int Cost = 0; + for (unsigned ShiftVal = 0; ShiftVal < Size; ShiftVal += PlatRegSize) { + APInt Chunk = Val.ashr(ShiftVal).sextOrTrunc(PlatRegSize); + InstSeq MatSeq; + generateInstSeq(Chunk.getSExtValue(), IsRV64, MatSeq); + Cost += MatSeq.size(); + } + return std::max(1, Cost); +} } // namespace RISCVMatInt } // namespace llvm diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64I %s + +; These test that constant adds are not moved after shifts by DAGCombine, +; if the constant is cheaper to materialise before it has been shifted. + +define signext i32 @add_small_const(i32 signext %a) nounwind { +; RV32I-LABEL: add_small_const: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, a0, 1 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: add_small_const: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, a0, 1 +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a0, a0, 56 +; RV64I-NEXT: ret + %1 = add i32 %a, 1 + %2 = shl i32 %1, 24 + %3 = ashr i32 %2, 24 + ret i32 %3 +} + +define signext i32 @add_large_const(i32 signext %a) nounwind { +; RV32I-LABEL: add_large_const: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: lui a1, 65520 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: add_large_const: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, -1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret + %1 = add i32 %a, 4095 + %2 = shl i32 %1, 16 + %3 = ashr i32 %2, 16 + ret i32 %3 +} + +define signext i32 @add_huge_const(i32 signext %a) nounwind { +; RV32I-LABEL: add_huge_const: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: lui a1, 524272 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: add_huge_const: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: addiw a1, a1, -1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret + %1 = add i32 %a, 32767 + %2 = shl i32 %1, 16 + %3 = ashr i32 %2, 16 + ret i32 %3 +}