Index: llvm/lib/Target/AMDGPU/SIPostRABundler.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -48,6 +48,9 @@ SmallSet Defs; + void collectUsedRegUnits(const MachineInstr &MI, + BitVector &UsedRegUnits) const; + bool isBundleCandidate(const MachineInstr &MI) const; bool isDependentLoad(const MachineInstr &MI) const; bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const; @@ -85,6 +88,21 @@ return false; } +void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI, + BitVector &UsedRegUnits) const { + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg() || !Op.readsReg()) + continue; + + Register Reg = Op.getReg(); + assert(!Op.getSubReg() && + "subregister indexes should not be present after RA"); + + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + UsedRegUnits.set(*Units); + } +} + bool SIPostRABundler::isBundleCandidate(const MachineInstr &MI) const { const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags; return IMemFlags != 0 && MI.mayLoadOrStore() && !MI.isBundled(); @@ -107,6 +125,9 @@ return false; TRI = MF.getSubtarget().getRegisterInfo(); + BitVector BundleUsedRegUnits(TRI->getNumRegUnits()); + BitVector KillUsedRegUnits(TRI->getNumRegUnits()); + bool Changed = false; for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::instr_iterator Next; @@ -150,6 +171,33 @@ Next = std::next(BundleEnd); if (ClauseLength > 1) { Changed = true; + + // Before register allocation, kills are inserted after potential soft + // clauses to hint register allocation. Look for kills that look like + // this, and erase them. + if (Next != E && Next->isKill()) { + MachineInstr &Kill = *Next; + + for (const MachineInstr &BundleMI : make_range(BundleStart, Next)) + collectUsedRegUnits(BundleMI, BundleUsedRegUnits); + collectUsedRegUnits(Kill, KillUsedRegUnits); + + BundleUsedRegUnits.flip(); + KillUsedRegUnits &= BundleUsedRegUnits; + + // Erase the kill if it's a subset of the used registers. + // + // TODO: Should we just remove all kills? Is there any real reason to + // keep them after RA? + if (KillUsedRegUnits.none()) { + ++Next; + Kill.eraseFromParent(); + } + + BundleUsedRegUnits.reset(); + KillUsedRegUnits.reset(); + } + finalizeBundle(MBB, BundleStart, Next); } Index: llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -220,3 +220,61 @@ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec ... + +# Before register allocation, KILL hints are inserted after potential soft +# clauses to hint the register allocator to not clobber the input +# registers. Kills that look like this should be erased. +--- +name: post_bundle_kill +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + + ; GCN-LABEL: name: post_bundle_kill + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6 +... + +# Kill some other register not used in the bundle, should not be touched. +--- +name: post_bundle_kill_other +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN-LABEL: name: post_bundle_kill_other + ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: KILL killed $vgpr7 + $vgpr7 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr7 +... + +# Kill some other register not used in the bundle, but also some +# from the bundle. +--- +name: post_bundle_kill_plus_other +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN-LABEL: name: post_bundle_kill_plus_other + ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: KILL killed $vgpr7, killed $vgpr3 + $vgpr7 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr7, killed $vgpr3 +...