diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -103,7 +103,7 @@
 static cl::opt<bool> EnableM0Merge(
   "amdgpu-enable-merge-m0",
   cl::desc("Merge and hoist M0 initializations"),
-  cl::init(false));
+  cl::init(true));
 
 namespace {
 
@@ -452,18 +452,34 @@
            (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
 }
 
+// Return the first instruction in MBB that is neither a PHI nor a basic
+// block prologue instruction, or MBB->end() if there is none.
+static MachineBasicBlock::iterator
+getFirstNonProlog(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
+  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
+    ++I;
+
+  return I;
+}
+
 // Hoist and merge identical SGPR initializations into a common predecessor.
 // This is intended to combine M0 initializations, but can work with any
 // SGPR. A VGPR cannot be processed since we cannot guarantee vector
-// executioon.
+// execution.
 static bool hoistAndMergeSGPRInits(unsigned Reg,
                                    const MachineRegisterInfo &MRI,
-                                   MachineDominatorTree &MDT) {
+                                   MachineDominatorTree &MDT,
+                                   const TargetInstrInfo *TII) {
   // List of inits by immediate value.
   using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
   InitListMap Inits;
   // List of clobbering instructions.
   SmallVector<MachineInstr*, 8> Clobbers;
+  // Redundant inits found while merging. They are deleted only after the
+  // merge loops, so nothing is destroyed while Inits is still being walked.
+  SmallSet<MachineInstr*, 8> MergedInstrs;
+
   bool Changed = false;
 
   for (auto &MI : MRI.def_instructions(Reg)) {
@@ -492,7 +508,7 @@
         MachineInstr *MI2 = *I2;
 
         // Check any possible interference
-        auto intereferes = [&](MachineBasicBlock::iterator From,
+        auto interferes = [&](MachineBasicBlock::iterator From,
                                MachineBasicBlock::iterator To) -> bool {
 
           assert(MDT.dominates(&*To, &*From));
@@ -525,23 +541,23 @@
         };
 
         if (MDT.dominates(MI1, MI2)) {
-          if (!intereferes(MI2, MI1)) {
+          if (!interferes(MI2, MI1)) {
             LLVM_DEBUG(dbgs()
                        << "Erasing from "
                        << printMBBReference(*MI2->getParent()) << " " << *MI2);
-            MI2->eraseFromParent();
-            Defs.erase(I2++);
+            MergedInstrs.insert(MI2);
             Changed = true;
+            ++I2;
             continue;
           }
         } else if (MDT.dominates(MI2, MI1)) {
-          if (!intereferes(MI1, MI2)) {
+          if (!interferes(MI1, MI2)) {
             LLVM_DEBUG(dbgs()
                        << "Erasing from "
                        << printMBBReference(*MI1->getParent()) << " " << *MI1);
-            MI1->eraseFromParent();
-            Defs.erase(I1++);
+            MergedInstrs.insert(MI1);
             Changed = true;
+            ++I1;
             break;
           }
         } else {
@@ -552,8 +568,8 @@
             continue;
           }
 
-          MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
-          if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
+          MachineBasicBlock::iterator I = getFirstNonProlog(MBB, TII);
+          if (!interferes(MI1, I) && !interferes(MI2, I)) {
             LLVM_DEBUG(dbgs()
                        << "Erasing from "
                        << printMBBReference(*MI1->getParent()) << " " << *MI1
@@ -561,9 +577,9 @@
                        << printMBBReference(*MI2->getParent()) << " to "
                        << printMBBReference(*I->getParent()) << " " << *MI2);
             I->getParent()->splice(I, MI2->getParent(), MI2);
-            MI1->eraseFromParent();
-            Defs.erase(I1++);
+            MergedInstrs.insert(MI1);
             Changed = true;
+            ++I1;
             break;
           }
         }
@@ -573,6 +589,11 @@
     }
   }
 
+  // Merging is finished; delete the redundant inits. eraseFromParent() both
+  // unlinks and destroys each instruction.
+  for (MachineInstr *MI : MergedInstrs)
+    MI->eraseFromParent();
+
   if (Changed)
     MRI.clearKillFlags(Reg);
 
@@ -723,7 +744,7 @@
   }
 
   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
-    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII);
 
   return true;
 }
diff --git 
a/llvm/test/CodeGen/AMDGPU/merge-m0.mir b/llvm/test/CodeGen/AMDGPU/merge-m0.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-m0.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-m0.mir
@@ -47,7 +47,7 @@
 # GCN-NEXT: DS_WRITE_B32
 
 ---
-name: test
+name: merge-m0-many-init
 alignment: 0
 exposesReturnsTwice: false
 legalized: false
@@ -129,3 +129,121 @@
     S_BRANCH %bb.0.entry
 
 ...
+
+# Expect the duplicate 65536 init in bb.1 to be removed, and the -1 inits in
+# bb.2 and bb.3 to stay in place rather than be hoisted into bb.1.
+# GCN: bb.0.entry:
+# GCN: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.1:
+# GCN-NOT: SI_INIT_M0 65536
+# GCN-NOT: SI_INIT_M0 -1
+
+# GCN: bb.2:
+# GCN: SI_INIT_M0 -1
+
+# GCN: bb.3:
+# GCN: SI_INIT_M0 -1
+
+---
+name: merge-m0-dont-hoist-past-init-with-different-initializer
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: sreg_32_xm0 }
+body: |
+  bb.0.entry:
+    successors: %bb.1
+
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    SI_INIT_M0 65536, implicit-def $m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+
+    SI_INIT_M0 65536, implicit-def $m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.4
+
+    SI_INIT_M0 -1, implicit-def $m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    successors: %bb.4
+
+    SI_INIT_M0 -1, implicit-def $m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM 0
+...
+
+# Expect the two -1 inits from bb.1 and bb.2 to be replaced by a single init
+# in bb.0, placed directly after the S_OR_B64 that writes $exec.
+# GCN: bb.0.entry:
+# GCN-NOT: SI_INIT_M0
+# GCN: S_OR_B64
+# GCN-NEXT: SI_INIT_M0
+
+# GCN: bb.1:
+# GCN-NOT: SI_INIT_M0 -1
+
+# GCN: bb.2:
+# GCN-NOT: SI_INIT_M0 -1
+
+---
+name: merge-m0-after-prolog
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: sreg_32_xm0 }
+body: |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0_sgpr1
+
+    $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    S_CBRANCH_VCCZ %bb.1, implicit undef $vcc
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    SI_INIT_M0 -1, implicit-def $m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+
+    SI_INIT_M0 -1, implicit-def $m0
+    DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM 0
+...