diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -102,8 +102,7 @@
   if (LoADDI->getOpcode() != RISCV::ADDI ||
       LoADDI->getOperand(2).getTargetFlags() != RISCVII::MO_LO ||
       !LoADDI->getOperand(2).isGlobal() ||
-      LoADDI->getOperand(2).getOffset() != 0 ||
-      !MRI->hasOneUse(LoADDI->getOperand(0).getReg()))
+      LoADDI->getOperand(2).getOffset() != 0)
     return false;
   return true;
 }
@@ -252,108 +251,133 @@
 bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
                                                   MachineInstr &LoADDI) {
   Register DestReg = LoADDI.getOperand(0).getReg();
-  assert(MRI->hasOneUse(DestReg) && "expected one use for LoADDI");
-  // LoADDI has only one use.
-  MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
-  switch (Tail.getOpcode()) {
-  default:
-    LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
-                      << Tail);
-    return false;
-  case RISCV::ADDI: {
-    // Offset is simply an immediate operand.
-    int64_t Offset = Tail.getOperand(2).getImm();
-
-    // We might have two ADDIs in a row.
-    Register TailDestReg = Tail.getOperand(0).getReg();
-    if (MRI->hasOneUse(TailDestReg)) {
-      MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
-      if (TailTail.getOpcode() == RISCV::ADDI) {
-        Offset += TailTail.getOperand(2).getImm();
-        LLVM_DEBUG(dbgs() << "  Offset Instrs: " << Tail << TailTail);
-        DeadInstrs.insert(&Tail);
-        foldOffset(HiLUI, LoADDI, TailTail, Offset);
-        return true;
+
+  // First, look for arithmetic instructions we can get an offset from.
+  // We might be able to remove the arithmetic instructions by folding the
+  // offset into the LUI+ADDI.
+  if (MRI->hasOneUse(DestReg)) {
+    // LoADDI has only one use.
+    MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
+    switch (Tail.getOpcode()) {
+    default:
+      LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
+                        << Tail);
+      break;
+    case RISCV::ADDI: {
+      // Offset is simply an immediate operand.
+      int64_t Offset = Tail.getOperand(2).getImm();
+
+      // We might have two ADDIs in a row.
+      Register TailDestReg = Tail.getOperand(0).getReg();
+      if (MRI->hasOneUse(TailDestReg)) {
+        MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
+        if (TailTail.getOpcode() == RISCV::ADDI) {
+          Offset += TailTail.getOperand(2).getImm();
+          LLVM_DEBUG(dbgs() << "  Offset Instrs: " << Tail << TailTail);
+          DeadInstrs.insert(&Tail);
+          foldOffset(HiLUI, LoADDI, TailTail, Offset);
+          return true;
+        }
       }
-    }
-    LLVM_DEBUG(dbgs() << "  Offset Instr: " << Tail);
-    foldOffset(HiLUI, LoADDI, Tail, Offset);
-    return true;
-  }
-  case RISCV::ADD: {
-    // The offset is too large to fit in the immediate field of ADDI.
-    // This can be in two forms:
-    // 1) LUI hi_Offset followed by:
-    //    ADDI lo_offset
-    //    This happens in case the offset has non zero bits in
-    //    both hi 20 and lo 12 bits.
-    // 2) LUI (offset20)
-    //    This happens in case the lower 12 bits of the offset are zeros.
-    int64_t Offset;
-    if (!matchLargeOffset(Tail, DestReg, Offset))
-      return false;
-    foldOffset(HiLUI, LoADDI, Tail, Offset);
-    return true;
+      LLVM_DEBUG(dbgs() << "  Offset Instr: " << Tail);
+      foldOffset(HiLUI, LoADDI, Tail, Offset);
+      return true;
+    }
+    case RISCV::ADD: {
+      // The offset is too large to fit in the immediate field of ADDI.
+      // This can be in two forms:
+      // 1) LUI hi_Offset followed by:
+      //    ADDI lo_offset
+      //    This happens in case the offset has non zero bits in
+      //    both hi 20 and lo 12 bits.
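+      //    e.g. an offset of 0x12345678 is materialized as LUI 0x12345
+      //    plus ADDI 0x678 and then ADDed to DestReg (illustrative values).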
+      // 2) LUI (offset20)
+      //    This happens in case the lower 12 bits of the offset are zeros.
+      int64_t Offset;
+      if (!matchLargeOffset(Tail, DestReg, Offset))
+        return false;
+      foldOffset(HiLUI, LoADDI, Tail, Offset);
+      return true;
+    }
+    case RISCV::SH1ADD:
+    case RISCV::SH2ADD:
+    case RISCV::SH3ADD: {
+      // The offset is too large to fit in the immediate field of ADDI.
+      // It may be encoded as (SH2ADD (ADDI X0, C), DestReg) or
+      // (SH3ADD (ADDI X0, C), DestReg).
+      int64_t Offset;
+      if (!matchShiftedOffset(Tail, DestReg, Offset))
+        return false;
+      foldOffset(HiLUI, LoADDI, Tail, Offset);
+      return true;
+    }
+    }
   }
-  case RISCV::SH1ADD:
-  case RISCV::SH2ADD:
-  case RISCV::SH3ADD: {
-    // The offset is too large to fit in the immediate field of ADDI.
-    // It may be encoded as (SH2ADD (ADDI X0, C), DestReg) or
-    // (SH3ADD (ADDI X0, C), DestReg).
-    int64_t Offset;
-    if (!matchShiftedOffset(Tail, DestReg, Offset))
+
+  // We didn't find an arithmetic instruction. If all the uses are memory ops
+  // with the same offset, we can transform
+  // HiLUI:  lui vreg1, %hi(foo)          --->  lui vreg1, %hi(foo+8)
+  // LoADDI: addi vreg2, vreg1, %lo(foo)  --->  lw vreg3, lo(foo+8)(vreg1)
+  // Tail:   lw vreg3, 8(vreg2)
+
+  Optional<int64_t> CommonOffset;
+  for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
+    switch (UseMI.getOpcode()) {
+    default:
+      LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);
       return false;
-    foldOffset(HiLUI, LoADDI, Tail, Offset);
-    return true;
+    case RISCV::LB:
+    case RISCV::LH:
+    case RISCV::LW:
+    case RISCV::LBU:
+    case RISCV::LHU:
+    case RISCV::LWU:
+    case RISCV::LD:
+    case RISCV::FLH:
+    case RISCV::FLW:
+    case RISCV::FLD:
+    case RISCV::SB:
+    case RISCV::SH:
+    case RISCV::SW:
+    case RISCV::SD:
+    case RISCV::FSH:
+    case RISCV::FSW:
+    case RISCV::FSD: {
+      if (UseMI.getOperand(1).isFI())
+        return false;
+      // Register defined by LoADDI should not be the value register.
+      if (DestReg == UseMI.getOperand(0).getReg())
+        return false;
+      assert(DestReg == UseMI.getOperand(1).getReg() &&
+             "Expected base address use");
+      // All load/store instructions must use the same offset.
+      int64_t Offset = UseMI.getOperand(2).getImm();
+      if (CommonOffset && Offset != CommonOffset)
+        return false;
+      CommonOffset = Offset;
+    }
+    }
   }
-  case RISCV::LB:
-  case RISCV::LH:
-  case RISCV::LW:
-  case RISCV::LBU:
-  case RISCV::LHU:
-  case RISCV::LWU:
-  case RISCV::LD:
-  case RISCV::FLH:
-  case RISCV::FLW:
-  case RISCV::FLD:
-  case RISCV::SB:
-  case RISCV::SH:
-  case RISCV::SW:
-  case RISCV::SD:
-  case RISCV::FSH:
-  case RISCV::FSW:
-  case RISCV::FSD: {
-    // Transforms the sequence:            Into:
-    // HiLUI:  lui vreg1, %hi(foo)          --->  lui vreg1, %hi(foo+8)
-    // LoADDI: addi vreg2, vreg1, %lo(foo)  --->  lw vreg3, lo(foo+8)(vreg1)
-    // Tail:   lw vreg3, 8(vreg2)
-    if (Tail.getOperand(1).isFI())
-      return false;
-    // Register defined by LoADDI should be used in the base part of the
-    // load\store instruction. Otherwise, no folding possible.
-    Register BaseAddrReg = Tail.getOperand(1).getReg();
-    if (DestReg != BaseAddrReg)
-      return false;
-    MachineOperand &TailImmOp = Tail.getOperand(2);
-    int64_t Offset = TailImmOp.getImm();
-    // Update the offsets in global address lowering.
-    HiLUI.getOperand(1).setOffset(Offset);
-    // Update the immediate in the Tail instruction to add the offset.
-    Tail.removeOperand(2);
-    MachineOperand &ImmOp = LoADDI.getOperand(2);
-    ImmOp.setOffset(Offset);
-    Tail.addOperand(ImmOp);
+
+  // We found a common offset.
+  // Update the offsets in global address lowering.
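+  // Note that both relocation halves must carry the same offset: the final
+  // address is %hi(sym+off) + %lo(sym+off), which only reassembles to
+  // sym+off when the two offsets agree.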
+  HiLUI.getOperand(1).setOffset(*CommonOffset);
+  MachineOperand &ImmOp = LoADDI.getOperand(2);
+  ImmOp.setOffset(*CommonOffset);
+
+  // Update the immediate in the load/store instructions to add the offset.
+  for (MachineInstr &UseMI :
+       llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {
+    UseMI.removeOperand(2);
+    UseMI.addOperand(ImmOp);
     // Update the base reg in the Tail instruction to feed from LUI.
     // Output of HiLUI is only used in LoADDI, no need to use
     // MRI->replaceRegWith().
-    Tail.getOperand(1).setReg(HiLUI.getOperand(0).getReg());
-    DeadInstrs.insert(&LoADDI);
-    return true;
-  }
+    UseMI.getOperand(1).setReg(HiLUI.getOperand(0).getReg());
   }
-  return false;
+
+  DeadInstrs.insert(&LoADDI);
+  return true;
 }
 
 bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
@@ -371,9 +395,8 @@
       MachineInstr *LoADDI = nullptr;
       if (!detectLuiAddiGlobal(HiLUI, LoADDI))
         continue;
-      LLVM_DEBUG(dbgs() << "  Found lowered global address with one use: "
+      LLVM_DEBUG(dbgs() << "  Found lowered global address: "
                         << *LoADDI->getOperand(2).getGlobal() << "\n");
-      // If the use count is only one, merge the offset
       MadeChange |= detectAndFoldOffset(HiLUI, *LoADDI);
     }
   }
diff --git a/llvm/test/CodeGen/RISCV/hoist-global-addr-base.ll b/llvm/test/CodeGen/RISCV/hoist-global-addr-base.ll
--- a/llvm/test/CodeGen/RISCV/hoist-global-addr-base.ll
+++ b/llvm/test/CodeGen/RISCV/hoist-global-addr-base.ll
@@ -261,6 +261,77 @@
 ; CHECK-NEXT:    ret
   ret i8* getelementptr inbounds ([0 x i8], [0 x i8]* @bar, i32 0, i64 12848)
 }
+
+define dso_local void @read_modify_write() local_unnamed_addr nounwind {
+; RV32-LABEL: read_modify_write:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    lui a0, %hi(s+160)
+; RV32-NEXT:    lw a1, %lo(s+160)(a0)
+; RV32-NEXT:    addi a1, a1, 10
+; RV32-NEXT:    sw a1, %lo(s+160)(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: read_modify_write:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    lui a0, %hi(s+160)
+; RV64-NEXT:    lw a1, %lo(s+160)(a0)
+; RV64-NEXT:    addiw a1, a1, 10
+; RV64-NEXT:    sw a1, %lo(s+160)(a0)
+; RV64-NEXT:    ret
+entry:
+  %x = load i32, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1), align 4
+  %y = add i32 %x, 10
+  store i32 %y, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1), align 4
+  ret void
+}
+
+define dso_local void @rmw_with_control_flow() nounwind {
+; CHECK-LABEL: rmw_with_control_flow:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a0, %hi(s+164)
+; CHECK-NEXT:    lw a1, %lo(s+164)(a0)
+; CHECK-NEXT:    blez a1, .LBB17_2
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    li a1, 10
+; CHECK-NEXT:    sw a1, %lo(s+164)(a0)
+; CHECK-NEXT:  .LBB17_2: # %if.end
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i32, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 2), align 4
+  %cmp = icmp sgt i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 10, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 2), align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+%struct.foo = type { i32, %struct.foo* }
+
+@f = global %struct.foo zeroinitializer, align 8
+
+; Test the case where the store value and base pointer are the same register.
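+; On RV64 the fold is rejected because the sd uses the ADDI result as its
+; store-data operand (operand 0 of the MI), not merely as the base address.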
+define void @self_store() {
+; RV32-LABEL: self_store:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, %hi(f)
+; RV32-NEXT:    addi a1, a0, %lo(f)
+; RV32-NEXT:    sw a1, %lo(f+4)(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: self_store:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, %hi(f)
+; RV64-NEXT:    addi a0, a0, %lo(f)
+; RV64-NEXT:    sd a0, 8(a0)
+; RV64-NEXT:    ret
+  store %struct.foo* @f, %struct.foo** getelementptr inbounds (%struct.foo, %struct.foo* @f, i64 0, i32 1), align 8
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; RV32I: {{.*}}
 ; RV32ZBA: {{.*}}