[AMDGPU] Optionally merge and hoist M0 initializations in SIFixSGPRCopies.

Summary of what this patch contains:
  * SIFixSGPRCopies.cpp: adds the option -amdgpu-enable-merge-m0 (off by
    default), generalizes the predecessor walk into searchPredecessors(), and
    adds isReachable() and hoistAndMergeSGPRInits(). The latter is invoked for
    AMDGPU::M0 at the end of runOnMachineFunction() when the optimization level
    is above CodeGenOpt::None and the option is set.
  * SIRegisterInfo.cpp: reserves M0.
  * test/CodeGen/AMDGPU/merge-m0.mir: new test for the merge.
  * test/CodeGen/AMDGPU/spill-m0.ll: relaxes several TOSMEM checks (see the
    FIXME lines in that hunk).

NOTE(review): this file was recovered from a copy whose newlines, indentation
and template argument lists had been stripped. Line splits were checked against
every hunk header count; indentation, the position of blank context lines and
the restored template arguments are reconstructed and should be confirmed
against the tree (or the patch regenerated) before applying.

Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -81,6 +81,11 @@
 
 #define DEBUG_TYPE "si-fix-sgpr-copies"
 
+static cl::opt<bool> EnableM0Merge(
+  "amdgpu-enable-merge-m0",
+  cl::desc("Merge and hoist M0 initializations"),
+  cl::init(false));
+
 namespace {
 
 class SIFixSGPRCopies : public MachineFunctionPass {
@@ -108,7 +113,7 @@
 
 INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
 
@@ -332,27 +337,186 @@
   return true;
 }
 
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
-                                        const TargetRegisterInfo *TRI) {
-  DenseSet<MachineBasicBlock*> Visited;
+template <class UnaryPredicate>
+bool searchPredecessors(const MachineBasicBlock *MBB,
+                        const MachineBasicBlock *CutOff,
+                        UnaryPredicate Predicate) {
+
+  if (MBB == CutOff)
+    return false;
+
+  DenseSet<const MachineBasicBlock *> Visited;
   SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
                                               MBB->pred_end());
 
   while (!Worklist.empty()) {
-    MachineBasicBlock *mbb = Worklist.back();
-    Worklist.pop_back();
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
 
-    if (!Visited.insert(mbb).second)
+    if (!Visited.insert(MBB).second)
       continue;
-    if (hasTerminatorThatModifiesExec(*mbb, *TRI))
+    if (MBB == CutOff)
+      continue;
+    if (Predicate(MBB))
       return true;
 
-    Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end());
+    Worklist.append(MBB->pred_begin(), MBB->pred_end());
   }
 
   return false;
 }
 
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+                                        const TargetRegisterInfo *TRI) {
+  return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+           return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
+// Checks if there is a potential path from instruction From to instruction To.
+// If CutOff is specified and it sits in the middle of that path we ignore
+// a higher portion of the path and report it is not reachable.
+static bool isReachable(const MachineInstr *From,
+                        const MachineInstr *To,
+                        const MachineBasicBlock *CutOff,
+                        MachineDominatorTree &MDT) {
+  // If either From block dominates To block or instructions are in the same
+  // block and From is higher.
+  if (MDT.dominates(From, To))
+    return true;
+
+  const MachineBasicBlock *MBBFrom = From->getParent();
+  const MachineBasicBlock *MBBTo = To->getParent();
+  if (MBBFrom == MBBTo)
+    return false;
+
+  // Instructions are in different blocks, do predecessor search.
+  // We should almost never get here since we do not usually produce M0 stores
+  // other than -1.
+  return searchPredecessors(MBBTo, CutOff, [MBBFrom]
+           (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
+}
+
+// Hoist and merge identical SGPR initializations into a common predecessor.
+// This is intended to combine M0 initializations, but can work with any
+// SGPR. A VGPR cannot be processed since we cannot guarantee vector
+// execution.
+static bool hoistAndMergeSGPRInits(unsigned Reg,
+                                   const MachineRegisterInfo &MRI,
+                                   MachineDominatorTree &MDT) {
+  // List of inits by immediate value.
+  typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap;
+  InitListMap Inits;
+  // List of clobbering instructions.
+  SmallVector<MachineInstr*, 8> Clobbers;
+  bool Changed = false;
+
+  for (auto &MI : MRI.def_instructions(Reg)) {
+    MachineOperand *Imm = nullptr;
+    for (auto &MO: MI.operands()) {
+      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
+          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
+        Imm = nullptr;
+        break;
+      } else if (MO.isImm())
+        Imm = &MO;
+    }
+    if (Imm)
+      Inits[Imm->getImm()].push_front(&MI);
+    else
+      Clobbers.push_back(&MI);
+  }
+
+  for (auto &Init : Inits) {
+    auto &Defs = Init.second;
+
+    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
+      MachineInstr *MI1 = *I1;
+
+      for (auto I2 = std::next(I1); I2 != E; ) {
+        MachineInstr *MI2 = *I2;
+
+        // Check any possible interference
+        auto intereferes = [&](MachineBasicBlock::iterator From,
+                               MachineBasicBlock::iterator To) -> bool {
+
+          assert(MDT.dominates(&*To, &*From));
+
+          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
+            const MachineBasicBlock *MBBFrom = From->getParent();
+            const MachineBasicBlock *MBBTo = To->getParent();
+            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
+            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
+            if (!MayClobberFrom && !MayClobberTo)
+              return false;
+            if ((MayClobberFrom && !MayClobberTo) ||
+                (!MayClobberFrom && MayClobberTo))
+              return true;
+            // Both can clobber, this is not an interference only if both are
+            // dominated by Clobber and belong to the same block or if Clobber
+            // properly dominates To, given that To >> From, so it dominates
+            // both and is located in a common dominator.
+            return !((MBBFrom == MBBTo &&
+                      MDT.dominates(Clobber, &*From) &&
+                      MDT.dominates(Clobber, &*To)) ||
+                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
+          };
+
+          return (any_of(Clobbers, interferes)) ||
+                 (any_of(Inits, [&](InitListMap::value_type &C) {
+                    return C.first != Init.first && any_of(C.second, interferes);
+                  }));
+        };
+
+        if (MDT.dominates(MI1, MI2)) {
+          if (!intereferes(MI2, MI1)) {
+            DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber()
+                         << " " << *MI2);
+            MI2->eraseFromParent();
+            Defs.erase(I2++);
+            Changed = true;
+            continue;
+          }
+        } else if (MDT.dominates(MI2, MI1)) {
+          if (!intereferes(MI1, MI2)) {
+            DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+                         << " " << *MI1);
+            MI1->eraseFromParent();
+            Defs.erase(I1++);
+            Changed = true;
+            break;
+          }
+        } else {
+          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
+                                                     MI2->getParent());
+          if (!MBB) {
+            ++I2;
+            continue;
+          }
+
+          MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+          if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
+            DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+                         << " " << *MI1 << "and moving from BB#"
+                         << MI2->getParent()->getNumber() << " to BB#"
+                         << I->getParent()->getNumber() << " " << *MI2);
+            I->getParent()->splice(I, MI2->getParent(), MI2);
+            MI1->eraseFromParent();
+            Defs.erase(I1++);
+            Changed = true;
+            break;
+          }
+        }
+        ++I2;
+      }
+      ++I1;
+    }
+  }
+
+  if (Changed)
+    MRI.clearKillFlags(Reg);
+
+  return Changed;
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -485,5 +649,8 @@
     }
   }
 
+  if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
+    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+
   return true;
 }
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -146,6 +146,9 @@
   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
 
+  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
+  reserveRegisterTuples(Reserved, AMDGPU::M0);
+
   // Reserve the memory aperture registers.
   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
Index: test/CodeGen/AMDGPU/merge-m0.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/merge-m0.mir
@@ -0,0 +1,132 @@
+# RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN: bb.0.entry:
+# GCN: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.1:
+# GCN: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.2:
+# GCN: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.3:
+# GCN: SI_INIT_M0 3
+
+# GCN: bb.4:
+# GCN-NOT: SI_INIT_M0
+# GCN: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 4
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.5:
+# GCN-NOT: SI_INIT_M0
+# GCN: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 4
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.6:
+# GCN: SI_INIT_M0 -1,
+# GCN-NEXT: DS_WRITE_B32
+# GCN: SI_INIT_M0 %2
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 %2
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+
+---
+name: test
+alignment: 0
+exposesReturnsTwice: false
+noVRegs: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: sreg_32_xm0 }
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    SI_INIT_M0 -1, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 -1, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 65536, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 65536, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 -1, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 65536, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    S_CBRANCH_VCCZ %bb.1, implicit undef %vcc
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+    SI_INIT_M0 -1, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 -1, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    SI_INIT_M0 65536, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4, %bb.5
+    S_CBRANCH_VCCZ %bb.4, implicit undef %vcc
+    S_BRANCH %bb.5
+
+  bb.4:
+    successors: %bb.6
+    SI_INIT_M0 3, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 4, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.6
+
+  bb.5:
+    successors: %bb.6
+    SI_INIT_M0 3, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 4, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.6
+
+  bb.6:
+    successors: %bb.0.entry, %bb.6
+    SI_INIT_M0 -1, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    %2 = IMPLICIT_DEF
+    SI_INIT_M0 %2, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 %2, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    SI_INIT_M0 -1, implicit-def %m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+    S_CBRANCH_VCCZ %bb.6, implicit undef %vcc
+    S_BRANCH %bb.0.entry
+
+...
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -69,19 +69,20 @@
 ; TOSMEM-NOT: s_m0
 ; TOSMEM: s_add_u32 m0, s7, 0x100
 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
 
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s7, 0x200
 ; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 
 ; TOSMEM: s_mov_b64 exec,
 ; TOSMEM: s_cbranch_execz
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_add_u32 m0, s7, 0x200
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 
@@ -130,7 +131,7 @@
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 ; GCN-NOT: v_readlane_b32 m0
@@ -159,13 +160,14 @@
 ; GCN-LABEL: {{^}}restore_m0_lds:
 ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
 ; TOSMEM: s_cmp_eq_u32
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
 ; TOSMEM: s_mov_b32 m0, -1
@@ -178,10 +180,10 @@
 
 ; TOSMEM: ds_write_b64
 
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_mov_b32 m0, s0