Index: llvm/include/llvm/CodeGen/ModuloSchedule.h =================================================================== --- llvm/include/llvm/CodeGen/ModuloSchedule.h +++ llvm/include/llvm/CodeGen/ModuloSchedule.h @@ -300,6 +300,8 @@ /// State passed from peelKernel to peelPrologAndEpilogs(). std::deque PeeledFront, PeeledBack; + SmallVector IllegalPhisToDelete; + public: PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, LiveIntervals *LIS) @@ -321,6 +323,13 @@ /// Peels one iteration of the rewritten kernel (BB) in the specified /// direction. MachineBasicBlock *peelKernel(LoopPeelDirection LPD); + // Delete instructions whose stage is less than MinStage in the given basic + // block. + void FilterInstructions(MachineBasicBlock *MB, int MinStage); + // Move instructions of the given stage from sourceBB to DestBB. Remap the phi + // instructions to keep a valid IR. + void MoveStageInBB(MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, + int Stage); /// Peel the kernel forwards and backwards to produce prologs and epilogs, /// and stitch them together. void peelPrologAndEpilogs(); Index: llvm/lib/CodeGen/ModuloSchedule.cpp =================================================================== --- llvm/lib/CodeGen/ModuloSchedule.cpp +++ llvm/lib/CodeGen/ModuloSchedule.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ModuloSchedule.h" +#include "third_party/llvm/llvm-project/llvm/include/llvm/CodeGen/Register.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -1582,6 +1583,101 @@ return NewBB; } +void PeelingModuloScheduleExpander::FilterInstructions(MachineBasicBlock *MB, + int MinStage) { + for (auto I = MB->getFirstInstrTerminator()->getReverseIterator(); + I != std::next(MB->getFirstNonPHI()->getReverseIterator());) { + MachineInstr *MI = &*I++; + int Stage = getStage(MI); + if (Stage == -1 || Stage >= MinStage) + continue; + + for (MachineOperand &DefMO : MI->defs()) { + SmallVector, 4> Subs; + for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) { + // Only PHIs can use values from this block by construction. + // Match with the equivalent PHI in B. + assert(UseMI.isPHI()); + Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(), + MI->getParent()); + Subs.emplace_back(&UseMI, Reg); + } + for (auto &Sub : Subs) + Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0, + *MRI.getTargetRegisterInfo()); + } + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + } +} + +void PeelingModuloScheduleExpander::MoveStageInBB(MachineBasicBlock *DestBB, + MachineBasicBlock *SourceBB, + int Stage) { + auto InsertPt = DestBB->getFirstNonPHI(); + DenseMap Remaps; + for (auto I = SourceBB->getFirstNonPHI(); I != SourceBB->end();) { + MachineInstr *MI = &*I++; + if (MI->isPHI()) { + // This is an illegal PHI. If we move any instructions using an illegal + // PHI, we need to create a legal Phi. + Register PhiR = MI->getOperand(0).getReg(); + auto RC = MRI.getRegClass(PhiR); + Register NR = MRI.createVirtualRegister(RC); + MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NR) + .addReg(PhiR) + .addMBB(SourceBB); + BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI; + CanonicalMIs[NI] = CanonicalMIs[MI]; + Remaps[PhiR] = NR; + continue; + } + if (getStage(MI) != Stage) + continue; + MI->removeFromParent(); + DestBB->insert(InsertPt, MI); + auto *KernelMI = CanonicalMIs[MI]; + BlockMIs[{DestBB, KernelMI}] = MI; + BlockMIs.erase({SourceBB, KernelMI}); + } + SmallVector PhiToDelete; + for (MachineInstr &MI : DestBB->phis()) { + assert(MI.getNumOperands() == 3); + MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg()); + // If the instruction referenced by the phi is moved inside the block + // we don't need the phi anymore. + if (getStage(Def) == Stage) { + Register PhiReg = MI.getOperand(0).getReg(); + MRI.replaceRegWith(MI.getOperand(0).getReg(), + Def->getOperand(0).getReg()); + MI.getOperand(0).setReg(PhiReg); + PhiToDelete.push_back(&MI); + } + } + for (auto *P : PhiToDelete) { + P->eraseFromParent(); + } + InsertPt = DestBB->getFirstNonPHI(); + for (MachineInstr &MI : SourceBB->phis()) { + MachineInstr *NewMI = MF.CloneMachineInstr(&MI); + DestBB->insert(InsertPt, NewMI); + Register OrigR = MI.getOperand(0).getReg(); + Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR)); + NewMI->getOperand(0).setReg(R); + NewMI->getOperand(1).setReg(OrigR); + NewMI->getOperand(2).setMBB(*DestBB->pred_begin()); + Remaps[OrigR] = R; + CanonicalMIs[NewMI] = CanonicalMIs[&MI]; + BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NewMI; + } + for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) + for (MachineOperand &MO : I->uses()) + if (MO.isReg() && Remaps.count(MO.getReg())) + MO.setReg(Remaps[MO.getReg()]); +} + void PeelingModuloScheduleExpander::peelPrologAndEpilogs() { BitVector LS(Schedule.getNumStages(), true); BitVector AS(Schedule.getNumStages(), true); @@ -1607,23 +1703,36 @@ // Push out the epilogs, again in reverse order. // We can't assume anything about the minumum loop trip count at this point, - // so emit a fairly complex epilog: - // K[0, 1, 2] // Kernel runs stages 0, 1, 2 - // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0]. - // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2. - // - // This creates a single-successor single-predecessor sequence of blocks for - // each epilog, which are kept this way for simplicity at this stage and - // cleaned up by the optimizer later. + // so emit a fairly complex epilog. + + // We first peel number of stages minus one epilogue. Then we remove dead + // stages and reorder instructions based on there stage. If we have 3 stages + // we generate first: + // E0[3, 2, 1] + // E1[3', 2'] + // E2[3''] + // And then we move instructions based on their stages to have: + // E0[3] + // E1[2, 3'] + // E2[1, 2', 3''] + // The transformation is legal because we only move instructions past + // instructions of a previous loop iteration. for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) { - Epilogs.push_back(nullptr); - for (int J = Schedule.getNumStages() - 1; J >= I; --J) { - LS.reset(); - LS[J] = 1; - Epilogs.back() = peelKernel(LPD_Back); - LiveStages[Epilogs.back()] = LS; - AvailableStages[Epilogs.back()] = AS; + Epilogs.push_back(peelKernel(LPD_Back)); + FilterInstructions(Epilogs.back(), Schedule.getNumStages() - I); + } + for (int I = 0; I < Epilogs.size(); I++) { + LS.reset(); + for (int J = I; J < Epilogs.size(); J++) { + int Iteration = J; + int Stage = Schedule.getNumStages() - 1 + I - J; + // Move stage one block at a time so that Phi nodes are updated correctly. + for (int K = Iteration; K > I; K--) + MoveStageInBB(Epilogs[K - 1], Epilogs[K], Stage); + LS[Stage] = 1; } + LiveStages[Epilogs[I]] = LS; + AvailableStages[Epilogs[I]] = AS; } // Now we've defined all the prolog and epilog blocks as a fallthrough @@ -1659,6 +1768,13 @@ rewriteUsesOf(MI); } } + for (auto *MI : IllegalPhisToDelete) { + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + } + IllegalPhisToDelete.clear(); + // Now all remapping has been done, we're free to optimize the generated code. for (MachineBasicBlock *B : reverse(Blocks)) EliminateDeadPhis(B, MRI, LIS); @@ -1727,9 +1843,10 @@ R = MI->getOperand(1).getReg(); MRI.setRegClass(R, MRI.getRegClass(PhiR)); MRI.replaceRegWith(PhiR, R); - if (LIS) - LIS->RemoveMachineInstrFromMaps(*MI); - MI->eraseFromParent(); + // Postpone deleting the Phi as it may be referenced by BlockMIs and used + // later to figure out how to remap registers. + MI->getOperand(0).setReg(PhiR); + IllegalPhisToDelete.push_back(MI); return; } Index: llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll =================================================================== --- llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,6 +1,4 @@ ; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s -; XFAIL: * -; LSR changes required. ; This version of the conv3x3 test has both loops. This test checks that the ; inner loop has 13 packets. Index: llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll @@ -0,0 +1,54 @@ +; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true < %s | FileCheck %s + +; Test epilogue generation when reading loop-carried dependency from a previous +; stage. The first epilogue should read value from iteration N-1 of the kernel. + +; CHECK: loop0 +; CHECK: r{{[0-9]+}} = add([[REG0:r([0-9]+)]],#8) +; CHECK: [[REG0:r([0-9]+)]] = [[REG1:r([0-9]+)]] +; CHECK: endloop0 +; CHECK: = add([[REG1]],#8) + +; Function Attrs: nounwind +define i32* @f0(i16* nocapture readonly %a0, i32 %a1) #0 { +b0: + %v0 = alloca [129 x i32], align 8 + br i1 undef, label %b1, label %b3 + +b1: ; preds = %b0 + br label %b2 + +b2: ; preds = %b2, %b1 + %v1 = phi i16* [ %a0, %b1 ], [ %v2, %b2 ] + %v2 = phi i16* [ undef, %b1 ], [ %v15, %b2 ] + %v3 = phi i32* [ null, %b1 ], [ %v4, %b2 ] + %v4 = phi i32* [ null, %b1 ], [ %v14, %b2 ] + %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ] + %v6 = phi i16* [ undef, %b1 ], [ %v12, %b2 ] + %v7 = load i16, i16* %v2, align 2 + %v8 = sext i16 %v7 to i32 + %v9 = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %v8, i32 %v8) #2 + %v10 = load i16, i16* %v6, align 2 + %v11 = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %v9, i32 undef, i32 undef) #2 + store i32 %v11, i32* %v4, align 4 + %v12 = getelementptr inbounds i16, i16* %v6, i32 -1 + %v13 = add i32 %v5, 1 + %v14 = getelementptr inbounds i32, i32* %v3, i32 2 + %v15 = getelementptr inbounds i16, i16* %v1, i32 2 + %v16 = icmp slt i32 %v13, %a1 + br i1 %v16, label %b2, label %b3 + +b3: ; preds = %b2, %b0 + %out = phi i32* [ null, %b0 ], [ %v14, %b2 ] + ret i32* %out +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } Index: llvm/test/CodeGen/Hexagon/swp-stages4.ll =================================================================== --- llvm/test/CodeGen/Hexagon/swp-stages4.ll +++ llvm/test/CodeGen/Hexagon/swp-stages4.ll @@ -5,7 +5,6 @@ ; CHECK: = and ; CHECK: = and -; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1) ; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255) ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255) ; CHECK: loop0(.LBB0_[[LOOP:.]],