diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2125,10 +2125,8 @@
   // There is a ADD between ADDI and load/store. We can only fold ADDI that
   // do not have a FrameIndex operand.
   SDValue Add;
-  int AddBaseIdx;
-  if (Base.getMachineOpcode() == RISCV::ADD) {
-    if (!Base.hasOneUse())
-      return false;
+  unsigned AddBaseIdx;
+  if (Base.getMachineOpcode() == RISCV::ADD && Base.hasOneUse()) {
     Add = Base;
     SDValue Op0 = Base.getOperand(0);
     SDValue Op1 = Base.getOperand(1);
@@ -2142,12 +2140,36 @@
                isa<ConstantSDNode>(Op1.getOperand(1))) {
       AddBaseIdx = 0;
       Base = Op1;
+    } else if (Op1.isMachineOpcode() &&
+               Op1.getMachineOpcode() == RISCV::ADDIW &&
+               isa<ConstantSDNode>(Op1.getOperand(1)) &&
+               Op1.getOperand(0).isMachineOpcode() &&
+               Op1.getOperand(0).getMachineOpcode() == RISCV::LUI) {
+      // We found an LUI+ADDIW constant materialization. We might be able to
+      // fold the ADDIW offset if it could be treated as ADDI.
+      // Emulate the constant materialization to see if the result would be
+      // a simm32 if ADDI was used instead of ADDIW.
+
+      // First the LUI.
+      uint64_t Imm = Op1.getOperand(0).getConstantOperandVal(0);
+      Imm <<= 12;
+      Imm = SignExtend64(Imm, 32);
+
+      // Then the ADDI.
+      uint64_t LoImm = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
+      Imm += LoImm;
+
+      // If the result isn't a simm32, we can't do the optimization.
+      if (!isInt<32>(Imm))
+        return false;
+
+      AddBaseIdx = 0;
+      Base = Op1;
     } else
       return false;
-  }
-
-  // If the base is an ADDI, we can merge it in to the load/store.
-  if (Base.getMachineOpcode() != RISCV::ADDI)
+  } else if (Base.getMachineOpcode() == RISCV::ADDI) {
+    // If the base is an ADDI, we can merge it in to the load/store.
+  } else
     return false;
 
   SDValue ImmOperand = Base.getOperand(1);
diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll
--- a/llvm/test/CodeGen/RISCV/mem.ll
+++ b/llvm/test/CodeGen/RISCV/mem.ll
@@ -238,6 +238,45 @@
   ret i32 %2
 }
 
+define i32 @lw_really_far_local(i32* %a) {
+; RV32I-LABEL: lw_really_far_local:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw a0, -2048(a0)
+; RV32I-NEXT:    ret
+  %1 = getelementptr inbounds i32, i32* %a, i32 536870400
+  %2 = load volatile i32, i32* %1
+  ret i32 %2
+}
+
+define void @st_really_far_local(i32* %a, i32 %b) {
+; RV32I-LABEL: st_really_far_local:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a2, 524288
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    sw a1, -2048(a0)
+; RV32I-NEXT:    ret
+  %1 = getelementptr inbounds i32, i32* %a, i32 536870400
+  store i32 %b, i32* %1
+  ret void
+}
+
+define i32 @lw_sw_really_far_local(i32* %a, i32 %b) {
+; RV32I-LABEL: lw_sw_really_far_local:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a2, 524288
+; RV32I-NEXT:    addi a2, a2, -2048
+; RV32I-NEXT:    add a2, a0, a2
+; RV32I-NEXT:    lw a0, 0(a2)
+; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    ret
+  %1 = getelementptr inbounds i32, i32* %a, i32 536870400
+  %2 = load volatile i32, i32* %1
+  store i32 %b, i32* %1
+  ret i32 %2
+}
+
 %struct.quux = type { i32, [0 x i8] }
 
 ; Make sure we don't remove the addi and fold the C from
diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll
--- a/llvm/test/CodeGen/RISCV/mem64.ll
+++ b/llvm/test/CodeGen/RISCV/mem64.ll
@@ -233,9 +233,8 @@
 ; RV64I-LABEL: lw_far_local:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a1, 8
-; RV64I-NEXT:    addiw a1, a1, -8
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ld a0, -8(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 4095
   %2 = load volatile i64, i64* %1
@@ -246,9 +245,8 @@
 ; RV64I-LABEL: st_far_local:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
-; RV64I-NEXT:    addiw a2, a2, -8
 ; RV64I-NEXT:    add a0, a0, a2
-; RV64I-NEXT:    sd a1, 0(a0)
+; RV64I-NEXT:    sd a1, -8(a0)
 ; RV64I-NEXT:    ret
   %1 = getelementptr inbounds i64, i64* %a, i64 4095
   store i64 %b, i64* %1
@@ -270,6 +268,53 @@
   ret i64 %2
 }
 
+; Make sure we don't fold the addiw into the load offset. The sign extend of the
+; addiw is required.
+define i64 @lw_really_far_local(i64* %a) {
+; RV64I-LABEL: lw_really_far_local:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 524288
+; RV64I-NEXT:    addiw a1, a1, -2048
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+  %1 = getelementptr inbounds i64, i64* %a, i64 268435200
+  %2 = load volatile i64, i64* %1
+  ret i64 %2
+}
+
+; Make sure we don't fold the addiw into the store offset. The sign extend of
+; the addiw is required.
+define void @st_really_far_local(i64* %a, i64 %b) {
+; RV64I-LABEL: st_really_far_local:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a2, 524288
+; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    sd a1, 0(a0)
+; RV64I-NEXT:    ret
+  %1 = getelementptr inbounds i64, i64* %a, i64 268435200
+  store i64 %b, i64* %1
+  ret void
+}
+
+; Make sure we don't fold the addiw into the load/store offset. The sign extend
+; of the addiw is required.
+define i64 @lw_sw_really_far_local(i64* %a, i64 %b) {
+; RV64I-LABEL: lw_sw_really_far_local:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a2, 524288
+; RV64I-NEXT:    addiw a2, a2, -2048
+; RV64I-NEXT:    add a2, a0, a2
+; RV64I-NEXT:    ld a0, 0(a2)
+; RV64I-NEXT:    sd a1, 0(a2)
+; RV64I-NEXT:    ret
+  %1 = getelementptr inbounds i64, i64* %a, i64 268435200
+  %2 = load volatile i64, i64* %1
+  store i64 %b, i64* %1
+  ret i64 %2
+}
+
 %struct.quux = type { i32, [0 x i8] }
 
 ; Make sure we don't remove the addi and fold the C from