diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -153,6 +153,40 @@
   CurDAG->RemoveDeadNodes();
 }
 
+// Returns true if N is a MachineSDNode that has a reg and simm12 memory
+// operand. The indices of the base pointer and offset are returned in
+// BaseOpIdx and OffsetOpIdx.
+static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx,
+                         unsigned &OffsetOpIdx) {
+  switch (N->getMachineOpcode()) {
+  case RISCV::LB:
+  case RISCV::LH:
+  case RISCV::LW:
+  case RISCV::LBU:
+  case RISCV::LHU:
+  case RISCV::LWU:
+  case RISCV::LD:
+  case RISCV::FLH:
+  case RISCV::FLW:
+  case RISCV::FLD:
+    BaseOpIdx = 0;
+    OffsetOpIdx = 1;
+    return true;
+  case RISCV::SB:
+  case RISCV::SH:
+  case RISCV::SW:
+  case RISCV::SD:
+  case RISCV::FSH:
+  case RISCV::FSW:
+  case RISCV::FSD:
+    BaseOpIdx = 1;
+    OffsetOpIdx = 2;
+    return true;
+  }
+
+  return false;
+}
+
 static SDNode *selectImmWithConstantPool(SelectionDAG *CurDAG, const SDLoc &DL,
                                          const MVT VT, int64_t Imm,
                                          const RISCVSubtarget &Subtarget) {
@@ -664,6 +698,70 @@
     ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
     return;
   }
+  case ISD::ADD: {
+    // Try to select ADD + immediate used as memory addresses to
+    // (ADDI (ADD X, Imm-Lo12), Lo12) if it will allow the ADDI to be removed by
+    // doPeepholeLoadStoreADDI.
+
+    // The RHS should be an immediate.
+    auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+    if (!N1C)
+      break;
+
+    int64_t Offset = N1C->getSExtValue();
+    int64_t Lo12 = SignExtend64<12>(Offset);
+
+    // Don't do this if the lower 12 bits are 0 or we could use ADDI directly.
+    if (Lo12 == 0 || isInt<12>(Offset))
+      break;
+
+    // Don't do this if we can use a pair of ADDIs.
+    if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2))
+      break;
+
+    bool AllPointerUses = true;
+    for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) {
+      SDNode *User = *UI;
+
+      // Is this user a reg+simm12 memory instruction that uses this ADD as
+      // its base pointer?
+      unsigned BaseOpIdx, OffsetOpIdx;
+      if (!User->isMachineOpcode() ||
+          !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) ||
+          UI.getOperandNo() != BaseOpIdx) {
+        AllPointerUses = false;
+        break;
+      }
+
+      // If the memory instruction already has an offset, make sure the combined
+      // offset is foldable.
+      int64_t MemOffs =
+          cast<ConstantSDNode>(User->getOperand(OffsetOpIdx))->getSExtValue();
+      MemOffs += Lo12;
+      if (!isInt<12>(MemOffs)) {
+        AllPointerUses = false;
+        break;
+      }
+    }
+
+    if (!AllPointerUses)
+      break;
+
+    Offset -= Lo12;
+    // Restore sign bits for RV32.
+    if (!Subtarget->is64Bit())
+      Offset = SignExtend64<32>(Offset);
+
+    // Emit (ADDI (ADD X, Hi), Lo).
+    SDNode *Imm = selectImm(CurDAG, DL, VT, Offset, *Subtarget);
+    SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT,
+                                         Node->getOperand(0), SDValue(Imm, 0));
+    SDNode *ADDI =
+        CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0),
+                               CurDAG->getTargetConstant(Lo12, DL, VT));
+    ReplaceNode(Node, ADDI);
+    return;
+  }
   case ISD::SRL: {
     // Optimize (srl (and X, C2), C) ->
     //          (srli (slli X, (XLen-C3), (XLen-C3) + C)
@@ -2097,37 +2195,9 @@
 // -> (store val, (add base, src), off1+off2)
 // This is possible when off1+off2 fits a 12-bit immediate.
 bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
-  int OffsetOpIdx;
-  int BaseOpIdx;
-
-  // Only attempt this optimisation for I-type loads and S-type stores.
-  switch (N->getMachineOpcode()) {
-  default:
+  unsigned OffsetOpIdx, BaseOpIdx;
+  if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx))
     return false;
-  case RISCV::LB:
-  case RISCV::LH:
-  case RISCV::LW:
-  case RISCV::LBU:
-  case RISCV::LHU:
-  case RISCV::LWU:
-  case RISCV::LD:
-  case RISCV::FLH:
-  case RISCV::FLW:
-  case RISCV::FLD:
-    BaseOpIdx = 0;
-    OffsetOpIdx = 1;
-    break;
-  case RISCV::SB:
-  case RISCV::SH:
-  case RISCV::SW:
-  case RISCV::SD:
-  case RISCV::FSH:
-  case RISCV::FSW:
-  case RISCV::FSD:
-    BaseOpIdx = 1;
-    OffsetOpIdx = 2;
-    break;
-  }
 
   if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
     return false;
@@ -2137,54 +2207,8 @@
   if (!Base.isMachineOpcode())
     return false;
 
-  // There is a ADD between ADDI and load/store. We can only fold ADDI that
-  // do not have a FrameIndex operand.
-  SDValue Add;
-  unsigned AddBaseIdx;
-  if (Base.getMachineOpcode() == RISCV::ADD && Base.hasOneUse()) {
-    Add = Base;
-    SDValue Op0 = Base.getOperand(0);
-    SDValue Op1 = Base.getOperand(1);
-    if (Op0.isMachineOpcode() && Op0.getMachineOpcode() == RISCV::ADDI &&
-        !isa<FrameIndexSDNode>(Op0.getOperand(0)) &&
-        isa<ConstantSDNode>(Op0.getOperand(1))) {
-      AddBaseIdx = 1;
-      Base = Op0;
-    } else if (Op1.isMachineOpcode() && Op1.getMachineOpcode() == RISCV::ADDI &&
-               !isa<FrameIndexSDNode>(Op1.getOperand(0)) &&
-               isa<ConstantSDNode>(Op1.getOperand(1))) {
-      AddBaseIdx = 0;
-      Base = Op1;
-    } else if (Op1.isMachineOpcode() &&
-               Op1.getMachineOpcode() == RISCV::ADDIW &&
-               isa<ConstantSDNode>(Op1.getOperand(1)) &&
-               Op1.getOperand(0).isMachineOpcode() &&
-               Op1.getOperand(0).getMachineOpcode() == RISCV::LUI) {
-      // We found an LUI+ADDIW constant materialization. We might be able to
-      // fold the ADDIW offset if it could be treated as ADDI.
-      // Emulate the constant materialization to see if the result would be
-      // a simm32 if ADDI was used instead of ADDIW.
-
-      // First the LUI.
-      uint64_t Imm = Op1.getOperand(0).getConstantOperandVal(0);
-      Imm <<= 12;
-      Imm = SignExtend64<32>(Imm);
-
-      // Then the ADDI.
-      uint64_t LoImm = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
-      Imm += LoImm;
-
-      // If the result isn't a simm32, we can't do the optimization.
-      if (!isInt<32>(Imm))
-        return false;
-
-      AddBaseIdx = 0;
-      Base = Op1;
-    } else
-      return false;
-  } else if (Base.getMachineOpcode() == RISCV::ADDI) {
-    // If the base is an ADDI, we can merge it in to the load/store.
-  } else
+  // If the base is an ADDI, we can merge it into the load/store.
+  if (Base.getMachineOpcode() != RISCV::ADDI)
     return false;
 
   SDValue ImmOperand = Base.getOperand(1);
@@ -2231,26 +2255,13 @@
   LLVM_DEBUG(N->dump(CurDAG));
   LLVM_DEBUG(dbgs() << "\n");
 
-  if (Add)
-    Add = SDValue(CurDAG->UpdateNodeOperands(Add.getNode(),
-                                             Add.getOperand(AddBaseIdx),
-                                             Base.getOperand(0)),
-                  0);
-
   // Modify the offset operand of the load/store.
   if (BaseOpIdx == 0) { // Load
-    if (Add)
-      N = CurDAG->UpdateNodeOperands(N, Add, ImmOperand, N->getOperand(2));
-    else
-      N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
-                                     N->getOperand(2));
+    N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+                                   N->getOperand(2));
   } else { // Store
-    if (Add)
-      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Add, ImmOperand,
-                                     N->getOperand(3));
-    else
-      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
-                                     ImmOperand, N->getOperand(3));
+    N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+                                   ImmOperand, N->getOperand(3));
   }
 
   return true;
diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll
--- a/llvm/test/CodeGen/RISCV/mem.ll
+++ b/llvm/test/CodeGen/RISCV/mem.ll
@@ -227,10 +227,9 @@
 ; RV32I-LABEL: lw_sw_far_local:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 4
-; RV32I-NEXT:    addi a2, a2, -4
 ; RV32I-NEXT:    add a2, a0, a2
-; RV32I-NEXT:    lw a0, 0(a2)
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    lw a0, -4(a2)
+; RV32I-NEXT:    sw a1, -4(a2)
 ; RV32I-NEXT:    ret
   %1 = getelementptr inbounds i32, i32* %a, i64 4095
   %2 = load volatile i32, i32* %1
@@ -266,10 +265,9 @@
 ; RV32I-LABEL: lw_sw_really_far_local:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    addi a2, a2, -2048
 ; RV32I-NEXT:    add a2, a0, a2
-; RV32I-NEXT:    lw a0, 0(a2)
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    lw a0, -2048(a2)
+; RV32I-NEXT:    sw a1, -2048(a2)
 ; RV32I-NEXT:    ret
   %1 = getelementptr inbounds i32, i32* %a, i32 536870400
   %2 = load volatile i32, i32* %1
diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll
--- a/llvm/test/CodeGen/RISCV/mem64.ll
+++ b/llvm/test/CodeGen/RISCV/mem64.ll
@@ -257,10 +257,9 @@
 ; RV64I-LABEL: lw_sw_far_local:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
-; RV64I-NEXT:    addiw a2, a2, -8
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    ld a0, 0(a2)
-; RV64I-NEXT:    sd a1, 0(a2)
+; RV64I-NEXT:    ld a0, -8(a2)
+; RV64I-NEXT:    sd a1, -8(a2)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 4095
   %2 = load volatile i64, i64* %1
@@ -273,10 +272,10 @@
 define i64 @lw_really_far_local(i64* %a) {
 ; RV64I-LABEL: lw_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, 524288
-; RV64I-NEXT:    addiw a1, a1, -2048
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    slli a1, a1, 31
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ld a0, -2048(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   %2 = load volatile i64, i64* %1
@@ -288,10 +287,10 @@
 define void @st_really_far_local(i64* %a, i64 %b) {
 ; RV64I-LABEL: st_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    slli a2, a2, 31
 ; RV64I-NEXT:    add a0, a0, a2
-; RV64I-NEXT:    sd a1, 0(a0)
+; RV64I-NEXT:    sd a1, -2048(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   store i64 %b, i64* %1
@@ -303,11 +302,11 @@
 define i64 @lw_sw_really_far_local(i64* %a, i64 %b) {
 ; RV64I-LABEL: lw_sw_really_far_local:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    slli a2, a2, 31
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    ld a0, 0(a2)
-; RV64I-NEXT:    sd a1, 0(a2)
+; RV64I-NEXT:    ld a0, -2048(a2)
+; RV64I-NEXT:    sd a1, -2048(a2)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 268435200
   %2 = load volatile i64, i64* %1
diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -13,30 +13,28 @@
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    lui a2, 20
-; RV32I-NEXT:    addi a2, a2, -1920
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    add a0, a0, a2
 ; RV32I-NEXT:    li a2, 2
-; RV32I-NEXT:    sw a2, 0(a0)
+; RV32I-NEXT:    sw a2, -1920(a0)
 ; RV32I-NEXT:    li a3, 1
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a3, 0(a1)
-; RV32I-NEXT:    sw a2, 4(a1)
+; RV32I-NEXT:    sw a3, -1916(a0)
+; RV32I-NEXT:    sw a3, -1920(a1)
+; RV32I-NEXT:    sw a2, -1916(a1)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test1:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    lui a2, 20
-; RV64I-NEXT:    addiw a2, a2, -1920
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    li a2, 2
-; RV64I-NEXT:    sw a2, 0(a0)
+; RV64I-NEXT:    sw a2, -1920(a0)
 ; RV64I-NEXT:    li a3, 1
-; RV64I-NEXT:    sw a3, 4(a0)
-; RV64I-NEXT:    sw a3, 0(a1)
-; RV64I-NEXT:    sw a2, 4(a1)
+; RV64I-NEXT:    sw a3, -1916(a0)
+; RV64I-NEXT:    sw a3, -1920(a1)
+; RV64I-NEXT:    sw a2, -1916(a1)
 ; RV64I-NEXT:    ret
 entry:
   %s = load [65536 x i32]*, [65536 x i32]** %sp
@@ -119,28 +117,27 @@
 }
 
 ; GEPs have been manually split so the base GEP does not get used by any memory
-; instructions. Make sure we use a small offset in each of the stores.
+; instructions. Make sure we use an offset and common base for each of the
+; stores.
 define void @test3([65536 x i32]* %t) {
 ; RV32I-LABEL: test3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lui a1, 20
-; RV32I-NEXT:    addi a1, a1, -1920
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a1, -1916(a0)
 ; RV32I-NEXT:    li a1, 3
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a1, -1912(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test3:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lui a1, 20
-; RV64I-NEXT:    addiw a1, a1, -1920
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    sw a1, 4(a0)
+; RV64I-NEXT:    sw a1, -1916(a0)
 ; RV64I-NEXT:    li a1, 3
-; RV64I-NEXT:    sw a1, 8(a0)
+; RV64I-NEXT:    sw a1, -1912(a0)
 ; RV64I-NEXT:    ret
 entry:
   %0 = bitcast [65536 x i32]* %t to i8*
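
To make the early exits in the new ISD::ADD case concrete, here is a minimal standalone sketch (plain C++, no LLVM dependencies; lo12() and isInt12() are hypothetical stand-ins for SignExtend64<12> and isInt<12> from llvm/Support/MathExtras.h) that classifies an offset the same way. The offsets 16380 and 2147481600 come from the lw_sw_far_local and lw_sw_really_far_local (RV32) tests above; the other values are purely illustrative.

#include <cstdint>
#include <cstdio>

// Sign-extend the low 12 bits of X, like SignExtend64<12>(X).
static int64_t lo12(int64_t X) {
  return static_cast<int64_t>(static_cast<uint64_t>(X) << 52) >> 52;
}

// True if X fits a signed 12-bit immediate, like isInt<12>(X).
static bool isInt12(int64_t X) { return X >= -2048 && X <= 2047; }

// Mirrors the early exits of the ISD::ADD case above.
static const char *strategy(int64_t Offset) {
  if (isInt12(Offset))
    return "ADDI alone";                    // offset fits directly
  if (lo12(Offset) == 0)
    return "LUI+ADD, nothing to fold";      // low 12 bits already zero
  if (isInt12(Offset / 2) && isInt12(Offset - Offset / 2))
    return "pair of ADDIs";                 // two simm12 halves are cheaper
  return "fold low 12 bits into the load/store";
}

int main() {
  for (int64_t Off : {2047, 4000, 16384, 16380, 2147481600}) {
    std::printf("%11lld (lo12 %5lld): %s\n", (long long)Off,
                (long long)lo12(Off), strategy(Off));
  }
  return 0;
}

For 16380 the low 12 bits sign-extend to -4, so the high part becomes 16384 and the tests now expect "lui a2, 4" followed by "lw a0, -4(a2)" instead of a three-instruction address computation.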
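
The same arithmetic reproduces the split-offsets.ll expectations, where a single ADD now serves several stores per base pointer. A worked sketch of the test1 numbers, again plain C++ with the same hypothetical helpers:

#include <cassert>
#include <cstdint>

static int64_t lo12(int64_t X) { // stand-in for SignExtend64<12>
  return static_cast<int64_t>(static_cast<uint64_t>(X) << 52) >> 52;
}
static bool isInt12(int64_t X) { return X >= -2048 && X <= 2047; }

int main() {
  int64_t Offset = 20000 * 4; // shared GEP offset of the stores: 80000 bytes
  int64_t Lo = lo12(Offset);  // -1920
  int64_t Hi = Offset - Lo;   // 81920 == 20 << 12  ->  "lui a2, 20"
  assert(Lo == -1920 && Hi == (20 << 12));

  // The use loop requires every combined offset to stay a simm12; here the
  // stores at +0 and +4 bytes become -1920(a0) and -1916(a0).
  for (int64_t MemOffs : {0, 4})
    assert(isInt12(MemOffs + Lo));
  return 0;
}

This is why the selection loop insists that every use is the base pointer of a reg+simm12 memory instruction: the high part is shared by all users, so the transform only pays off if each user can absorb its share of the low part while keeping the combined offset a legal simm12.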