Index: llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -466,6 +466,7 @@ // executioon. static bool hoistAndMergeSGPRInits(unsigned Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo *TRI, MachineDominatorTree &MDT, const TargetInstrInfo *TII) { // List of inits by immediate value. @@ -480,7 +481,7 @@ for (auto &MI : MRI.def_instructions(Reg)) { MachineOperand *Imm = nullptr; - for (auto &MO: MI.operands()) { + for (auto &MO : MI.operands()) { if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { Imm = nullptr; @@ -585,8 +586,46 @@ } } - for (auto MI : MergedInstrs) - MI->removeFromParent(); + // Remove initializations that were merged into another. + for (auto &Init : Inits) { + auto &Defs = Init.second; + // erase() already returns the iterator following the removed element, so + // only advance manually when the current element is kept. + for (auto I = Defs.begin(); I != Defs.end();) { + if (MergedInstrs.count(*I)) { + (*I)->eraseFromParent(); + I = Defs.erase(I); + } else { + ++I; + } + } + } + + // Try to schedule SGPR initializations as early as possible in the MBB. + for (auto &Init : Inits) { + auto &Defs = Init.second; + for (auto MI : Defs) { + auto MBB = MI->getParent(); + MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); + MachineBasicBlock::reverse_iterator B(BoundaryMI); + // Check if B should actually be a boundary. If not set the previous + // instruction as the boundary instead. + if (!TII->isBasicBlockPrologue(*B)) + ++B; + + auto R = std::next(MI->getReverseIterator()); + const unsigned Threshold = 50; + // Search until B or Threshold for a place to insert the initialization. + for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) + if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || + TII->isSchedulingBoundary(*R, MBB, *MBB->getParent())) + break; + + // Move to directly after R. 
+ if (&*--R != MI) + MBB->splice(*R, MBB, MI); + } + } if (Changed) MRI.clearKillFlags(Reg); @@ -755,7 +794,7 @@ } if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) - hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII); + hoistAndMergeSGPRInits(AMDGPU::M0, MRI, TRI, *MDT, TII); return true; } Index: llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -26,12 +26,12 @@ ; CI: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 ; CI-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6 -; CI-NEXT: v_lshr_b32_e64 v0, [[SUB0]], 6 -; CI-NEXT: v_add_i32_e64 v1, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]] +; CI-DAG: v_lshr_b32_e64 v0, [[SUB0]], 6 +; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6 ; CI-NOT: v_mov ; CI: ds_write_b32 v0, v0 -; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]] +; CI-NEXT: ds_write_b32 v0, v0 ; GFX9: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 ; GFX9-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 Index: llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir +++ llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir @@ -1,7 +1,10 @@ # RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s +# GCN-LABEL: name: merge-m0-many-init # GCN: bb.0.entry: # GCN: SI_INIT_M0 -1 +# GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: DS_WRITE_B32 # GCN-NEXT: DS_WRITE_B32 # GCN-NEXT: SI_INIT_M0 65536 @@ -45,9 +48,8 @@ # GCN-NEXT: DS_WRITE_B32 # GCN-NEXT: SI_INIT_M0 -1 # GCN-NEXT: DS_WRITE_B32 - --- -name: merge-m0-many-init +name: 
merge-m0-many-init registers: - { id: 0, class: vgpr_32 } - { id: 1, class: vgpr_32 } @@ -124,22 +126,24 @@ ... +# GCN-LABEL: name: merge-m0-dont-hoist-past-init-with-different-initializer # GCN: bb.0.entry: # GCN: SI_INIT_M0 65536 +# GCN-NEXT: IMPLICIT_DEF +# GCN-NEXT: IMPLICIT_DEF # GCN-NEXT: DS_WRITE_B32 -#GCN: bb.1: -#GCN-NOT: SI_INIT_M0 65536 -#GCN-NOT: SI_INIT_M0 -1 - -#GCN: bb.2: -#GCN: SI_INIT_M0 -1 +# GCN: bb.1: +# GCN-NOT: SI_INIT_M0 65536 +# GCN-NOT: SI_INIT_M0 -1 -#GCN: bb.3: -#GCN: SI_INIT_M0 -1 +# GCN: bb.2: +# GCN: SI_INIT_M0 -1 +# GCN: bb.3: +# GCN: SI_INIT_M0 -1 --- -name: merge-m0-dont-hoist-past-init-with-different-initializer +name: merge-m0-dont-hoist-past-init-with-different-initializer registers: - { id: 0, class: vgpr_32 } - { id: 1, class: vgpr_32 } @@ -179,19 +183,19 @@ S_ENDPGM 0 ... +# GCN-LABEL: name: merge-m0-after-prologue # GCN: bb.0.entry: # GCN-NOT: SI_INIT_M0 # GCN: S_OR_B64 # GCN-NEXT: SI_INIT_M0 -#GCN: bb.1: -#GCN-NOT: SI_INIT_M0 -1 - -#GCN: bb.2: -#GCN-NOT: SI_INIT_MO -1 +# GCN: bb.1: +# GCN-NOT: SI_INIT_M0 -1 +# GCN: bb.2: +# GCN-NOT: SI_INIT_M0 -1 --- -name: merge-m0-after-prologue +name: merge-m0-after-prologue registers: - { id: 0, class: vgpr_32 } - { id: 1, class: vgpr_32 } @@ -223,3 +227,71 @@ bb.3: S_ENDPGM 0 ... + +# GCN-LABEL: name: move-m0-avoid-hazard +# GCN: $m0 = S_MOV_B32 -1 +# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec +--- +name: move-m0-avoid-hazard +body: | + bb.0: + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $m0 = S_MOV_B32 -1 + DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec +... 
+ +# GCN-LABEL: name: move-m0-with-prologue +# GCN: $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc +# GCN: $m0 = S_MOV_B32 -1 +# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec +--- +name: move-m0-with-prologue +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $m0 = S_MOV_B32 -1 + DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec +... + +# GCN-LABEL: name: move-m0-different-initializer +# GCN: SI_INIT_M0 -1 +# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF +# GCN: SI_INIT_M0 65536 +# GCN-NEXT: S_NOP +--- +name: move-m0-different-initializer +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + SI_INIT_M0 -1, implicit-def $m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec + S_NOP 0 + SI_INIT_M0 65536, implicit-def $m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec +... + +# GCN-LABEL: name: move-m0-schedule-boundary +# GCN: S_SETREG +# GCN-NEXT: SI_INIT_M0 -1 +--- +name: move-m0-schedule-boundary +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + S_SETREG_IMM32_B32 0, 1 + SI_INIT_M0 -1, implicit-def $m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec +...