Index: lib/Target/RISCV/RISCVISelDAGToDAG.cpp =================================================================== --- lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -56,12 +56,14 @@ private: void doPeepholeLoadStoreADDI(); + void doPeepholeGlobalAddiLuiOffset(); void doPeepholeBuildPairF64SplitF64(); }; } void RISCVDAGToDAGISel::PostprocessISelDAG() { doPeepholeLoadStoreADDI(); + doPeepholeGlobalAddiLuiOffset(); doPeepholeBuildPairF64SplitF64(); } @@ -128,6 +130,212 @@ return false; } +// Detect the pattern lui %hi(global) --> ADDI %lo(global) +// HiLUI LoADDI +static bool detectLuiAddiGlobal(SDNode *Tail, unsigned &Idx, SDValue &LoADDI, + SDValue &HiLUI, GlobalAddressSDNode *&GAlo, + GlobalAddressSDNode *&GAhi) { + // Try to detect the pattern on every operand of the tail instruction. + for (Idx = 0; Idx < Tail->getNumOperands(); Idx++) { + LoADDI = Tail->getOperand(Idx); + // LoADDI should only be used by one instruction (Tail). + if (!LoADDI->isMachineOpcode() || + !(LoADDI->getMachineOpcode() == RISCV::ADDI) || + !isa(LoADDI->getOperand(1)) || + !LoADDI->hasOneUse()) + continue; + // Check for existence of %lo target flag. + GAlo = cast(LoADDI->getOperand(1)); + if (!(GAlo->getTargetFlags() == RISCVII::MO_LO) || + !(GAlo->getOffset() == 0)) + return false; + // Check for existence of %hi target flag. + HiLUI = LoADDI->getOperand(0); + if (!HiLUI->isMachineOpcode() || + !(HiLUI->getMachineOpcode() == RISCV::LUI) || + !isa(HiLUI->getOperand(0)) || !HiLUI->hasOneUse()) + return false; + GAhi = cast(HiLUI->getOperand(0)); + if (!(GAhi->getTargetFlags() == RISCVII::MO_HI) || + !(GAhi->getOffset() == 0)) + return false; + return true; + } + return false; +} + +static bool matchLuiOffset(SDValue &OffsetLUI, int64_t &Offset) { + if (!OffsetLUI->isMachineOpcode() || + !(OffsetLUI->getMachineOpcode() == RISCV::LUI) || + !isa(OffsetLUI->getOperand(0))) + return false; + Offset = cast(OffsetLUI->getOperand(0))->getSExtValue(); + Offset = Offset << 12; + LLVM_DEBUG(dbgs() << " Detected \" LUI Offset_hi\"\n"); + return true; +} + +static bool matchAddiLuiOffset(SDValue &OffsetLoADDI, int64_t &Offset) { + // LoADDI should only be used by the tail instruction only. + if (!OffsetLoADDI->isMachineOpcode() || + !(OffsetLoADDI->getMachineOpcode() == RISCV::ADDI) || + !isa(OffsetLoADDI->getOperand(1)) || + !OffsetLoADDI->hasOneUse()) + return false; + int64_t OffLo = + cast(OffsetLoADDI->getOperand(1))->getZExtValue(); + // HiLUI should only be used by the loADDI. + SDValue OffsetHiLUI = (OffsetLoADDI->getOperand(0)); + if (!OffsetHiLUI->isMachineOpcode() || + !(OffsetHiLUI->getMachineOpcode() == RISCV::LUI) || + !isa(OffsetHiLUI->getOperand(0)) || + !OffsetHiLUI->hasOneUse()) + return false; + int64_t OffHi = + cast(OffsetHiLUI->getOperand(0))->getSExtValue(); + Offset = (OffHi << 12) + OffLo; + LLVM_DEBUG(dbgs() << " Detected \" ADDI (LUI Offset_hi), Offset_lo\"\n"); + return true; +} + +static void updateTailInstrUsers(SDNode *Tail, SelectionDAG *CurDAG, + GlobalAddressSDNode *GAhi, + GlobalAddressSDNode *GAlo, + SDValue &GlobalHiLUI, SDValue &GlobalLoADDI, + int64_t Offset) { + // Update the offset in GAhi and GAlo. + SDLoc DL(Tail->getOperand(1)); + SDValue GAHiNew = CurDAG->getTargetGlobalAddress(GAhi->getGlobal(), DL, + GlobalHiLUI.getValueType(), + Offset, RISCVII::MO_HI); + SDValue GALoNew = CurDAG->getTargetGlobalAddress(GAlo->getGlobal(), DL, + GlobalLoADDI.getValueType(), + Offset, RISCVII::MO_LO); + CurDAG->UpdateNodeOperands(GlobalHiLUI.getNode(), GAHiNew); + CurDAG->UpdateNodeOperands(GlobalLoADDI.getNode(), GlobalHiLUI, GALoNew); + // Update all uses of the Tail with the GlobalLoADDI. After + // this Tail will be a dead node. + SDValue From = SDValue(Tail, 0); + CurDAG->ReplaceAllUsesOfValuesWith(&From, &GlobalLoADDI, 1); +} + +// TODO: This transformation might be better implemeted in a Machine Funtion +// Pass as discussed here: https://reviews.llvm.org/D45748. +// +// Merge the offset of address calculation into the offset field +// of a global address node in a global address lowering sequence ("LUI +// %hi(global) --> add %lo(global)") under the following conditions: 1) The +// offset field in the global address lowering sequence is zero. 2) The lowered +// global address is only used in one node, referred to as "Tail". + +// This peephole does the following transformations to merge the offset: + +// 1) ADDI (ADDI (LUI %hi(global)) %lo(global)), offset +// ---> +// ADDI (LUI %hi(global + offset)) %lo(global + offset). +// +// This generates: +// lui a0, hi (global + offset) +// add a0, a0, lo (global + offset) +// Instead of +// lui a0, hi (global) +// addi a0, hi (global) +// addi a0, offset +// This pattern is for cases when the offset is small enough to fit in the +// immediate filed of ADDI (less than 12 bits). + +// 2) ADD ((ADDI (LUI %hi(global)) %lo(global)), (LUI hi_offset)) +// ---> +// offset = hi_offset << 12 +// ADDI (LUI %hi(global + offset)) %lo(global + offset) + +// Which generates the ASM: +// lui a0, hi(global + offset) +// addi a0, lo(global + offset) +// Instead of: +// lui a0, hi(global) +// addi a0, lo(global) +// lui a1, (offset) +// add a0, a0, a1 + +// This pattern is for cases when the offset doesn't fit in an immediate field +// of ADDI but the lower 12 bits are all zeros. + +// 3) ADD ((ADDI (LUI %hi(global)) %lo(global)), (ADDI lo_offset, (LUI +// hi_offset))) +// ---> +// ADDI (LUI %hi(global + offset)) %lo(global + offset) +// Which generates the ASM: +// lui a1, %hi(global + offhi20<<12 + offlo12) +// addi a1, %lo(global + offhi20<<12 + offlo12) +// Instead of: +// lui a0, hi(global) +// addi a0, lo(global) +// lui a1, (offhi20) +// addi a1, (offlo12) +// add a0, a0, a1 +// This pattern is for cases when the offset doesn't fit in an immediate field +// of ADDI and both the lower 1 bits and high 20 bits are non zero. +void RISCVDAGToDAGISel::doPeepholeGlobalAddiLuiOffset() { + SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); + ++Position; + SelectionDAG::allnodes_iterator Begin(CurDAG->allnodes_begin()); + while (Position != Begin) { + SDNode *Tail = &*--Position; + // Skip dead nodes and any non-machine opcodes. + if (Tail->use_empty() || !Tail->isMachineOpcode()) + continue; + // The tail instruction can be an ADD or an ADDI. + if (!Tail->isMachineOpcode() || !(Tail->getMachineOpcode() == RISCV::ADD || + Tail->getMachineOpcode() == RISCV::ADDI)) + continue; + // First detect the global address part of pattern: + // (lui %hi(global) --> Addi %lo(global)) + unsigned GlobalLoADDiIdx; + SDValue GlobalLoADDI; + SDValue GlobalHiLUI; + GlobalAddressSDNode *GAhi; + GlobalAddressSDNode *GAlo; + if (!detectLuiAddiGlobal(Tail, GlobalLoADDiIdx, GlobalLoADDI, GlobalHiLUI, + GAlo, GAhi)) + continue; + LLVM_DEBUG(dbgs() << " Detected \"ADDI LUI %hi(global), %lo(global)\n"); + // Detect the offset part for the address calculation by looking at the + // other operand of the tail instruction: + int64_t Offset; + if (Tail->getMachineOpcode() == RISCV::ADD) { + // If the Tail is an ADD instruction, the offset can be in two forms: + // 1) LUI hi_Offset followed by: + // ADDI lo_offset + // This happens in case the offset has non zero bits in + // both hi 20 and lo 12 bits. + // 2) LUI (offset20) + // This happens in case the lower 12 bits of the offset are zeros. + SDValue OffsetVal = Tail->getOperand(1 - GlobalLoADDiIdx); + if (!matchAddiLuiOffset(OffsetVal, Offset) && + !matchLuiOffset(OffsetVal, Offset)) + continue; + } else + // The Tail is an ADDI instruction: + Offset = cast(Tail->getOperand(1 - GlobalLoADDiIdx)) + ->getSExtValue(); + + LLVM_DEBUG( + dbgs() + << " Fold offset value into global offset of LUI %hi and ADDI %lo\n"); + LLVM_DEBUG(dbgs() << "\tTail:"); + LLVM_DEBUG(Tail->dump(CurDAG)); + LLVM_DEBUG(dbgs() << "\tGlobalHiLUI:"); + LLVM_DEBUG(GlobalHiLUI->dump(CurDAG)); + LLVM_DEBUG(dbgs() << "\tGlobalLoADDI:"); + LLVM_DEBUG(GlobalLoADDI->dump(CurDAG)); + LLVM_DEBUG(dbgs() << "\n"); + updateTailInstrUsers(Tail, CurDAG, GAhi, GAlo, GlobalHiLUI, GlobalLoADDI, + Offset); + } + CurDAG->RemoveDeadNodes(); +} + // Merge an ADDI into the offset of a load/store instruction where possible. // (load (add base, off), 0) -> (load base, off) // (store val, (add base, off)) -> (store val, base, off) Index: test/CodeGen/RISCV/hoist-global-addr-base.ll =================================================================== --- test/CodeGen/RISCV/hoist-global-addr-base.ll +++ test/CodeGen/RISCV/hoist-global-addr-base.ll @@ -4,6 +4,8 @@ %struct.S = type { [40 x i32], i32, i32, i32, [4100 x i32], i32, i32, i32 } @s = common dso_local global %struct.S zeroinitializer, align 4 @foo = global [6 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5, i16 0], align 2 +@g = global [1048576 x i8] zeroinitializer, align 1 + define dso_local void @multiple_stores() local_unnamed_addr { ; CHECK-LABEL: multiple_stores: @@ -21,8 +23,8 @@ ret void } -define dso_local void @control_flow() local_unnamed_addr #0 { -; CHECK-LABEL: control_flow: +define dso_local void @control_flow_with_mem_access() local_unnamed_addr #0 { +; CHECK-LABEL: control_flow_with_mem_access: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(s) ; CHECK-NEXT: addi a0, a0, %lo(s) @@ -47,37 +49,92 @@ ret void } -;TODO: Offset shouln't be separated in this case. We get shorter sequence if it -; is merged in the LUI %hi and the ADDI %lo. +; This test checks for the case where the offset is only an LUI. +; without peephole this generates: +; lui a0, %hi(g) +; addi a0, a0, %lo(g) +; lui a1, 128 ---> offset +; add a0, a0, a1 ---> base + offset. +define i8* @big_offset_lui_tail() { +; CHECK-LABEL: big_offset_lui_tail: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(g+524288) +; CHECK-NEXT: addi a0, a0, %lo(g+524288) +; CHECK-NEXT: ret + ret i8* getelementptr inbounds ([1048576 x i8], [1048576 x i8]* @g, i32 0, i32 524288) +} + define dso_local i32* @big_offset_one_use() local_unnamed_addr { ; CHECK-LABEL: big_offset_one_use: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a0, 4 -; CHECK-NEXT: addi a0, a0, 188 -; CHECK-NEXT: lui a1, %hi(s) -; CHECK-NEXT: addi a1, a1, %lo(s) -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: lui a0, %hi(s+16572) +; CHECK-NEXT: addi a0, a0, %lo(s+16572) ; CHECK-NEXT: ret entry: ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 5) } -;TODO: Offset shouln't be separated in this case. We get shorter sequence if it -; is merged in the LUI %hi and the ADDI %lo. define dso_local i32* @small_offset_one_use() local_unnamed_addr { ; CHECK-LABEL: small_offset_one_use: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a0, %hi(s+160) +; CHECK-NEXT: addi a0, a0, %lo(s+160) +; CHECK-NEXT: ret +entry: + ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1) +} + +; TODO: In this case we get a better sequence if the offset didn't get didn't +; get merged back in %if.end and %if.then. The current peephole is not able to +; detect the shared global address node across blocks. +; Without the peephole we can generate: +;# %bb.0: # %entry +; lui a0, %hi(s) +; addi a0, a0, %lo(s) +; lw a1, 164(a0) +; beqz a1, .LBB0_2 +;# %bb.1: # %if.end +; addi a0, a0, 168 +; ret +;.LBB0_2: # %if.then +; addi a0, a0, 160 +; ret +; Function Attrs: norecurse nounwind optsize readonly +define dso_local i32* @control_flow_no_mem(i32 %n) local_unnamed_addr #1 { +; CHECK-LABEL: control_flow_no_mem: +; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(s) ; CHECK-NEXT: addi a0, a0, %lo(s) -; CHECK-NEXT: addi a0, a0, 160 +; CHECK-NEXT: lw a0, 164(a0) +; CHECK-NEXT: beqz a0, .LBB5_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: lui a0, %hi(s+168) +; CHECK-NEXT: addi a0, a0, %lo(s+168) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_2: # %if.then +; CHECK-NEXT: lui a0, %hi(s+160) +; CHECK-NEXT: addi a0, a0, %lo(s+160) ; CHECK-NEXT: ret entry: + %0 = load i32, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 2), align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end +if.then: ; preds = %entry ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1) +if.end: ; preds = %if.then, %entry + ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 3) } - ;TODO: Offset shouln't be separated in this case. We get shorter sequence if it -; is merged in the LUI %hi and the ADDI %lo. +; is merged in the LUI %hi and the ADDI %lo, the "ADDI" could be folded in the +; immediate part of "lhu" genertating the sequence: +; lui a0, %hi(foo +8) +; lhu a0, %lo(foo+8)(a0) +; instead of: +; lui a0, %hi(foo) +; addi a0, a0, %lo(foo) +; lhu a0, 8(a0) + define dso_local i32 @load_half() nounwind { ; CHECK-LABEL: load_half: ; CHECK: # %bb.0: # %entry @@ -87,13 +144,13 @@ ; CHECK-NEXT: addi a0, a0, %lo(foo) ; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: addi a1, zero, 140 -; CHECK-NEXT: bne a0, a1, .LBB4_2 +; CHECK-NEXT: bne a0, a1, .LBB6_2 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: mv a0, zero ; CHECK-NEXT: lw ra, 12(sp) ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_2: # %if.then +; CHECK-NEXT: .LBB6_2: # %if.then ; CHECK-NEXT: call abort entry: %0 = load i16, i16* getelementptr inbounds ([6 x i16], [6 x i16]* @foo, i32 0, i32 4), align 2