Index: llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -78,16 +78,22 @@
   // An error margin is necessary because of poor performance of the generic RP
   // tracker and can be adjusted up for tuning heuristics to try and more
   // aggressively reduce register pressure.
-  const unsigned DefaultErrorMargin = 3;
+  unsigned ErrorMargin = 3;
 
-  const unsigned HighRPErrorMargin = 10;
+  // Bias for SGPR limits under a high register pressure.
+  const unsigned HighRPSGPRBias = 7;
 
-  unsigned ErrorMargin = DefaultErrorMargin;
+  // Bias for VGPR limits under a high register pressure.
+  const unsigned HighRPVGPRBias = 7;
 
   unsigned SGPRCriticalLimit;
 
   unsigned VGPRCriticalLimit;
 
+  unsigned SGPRLimitBias = 0;
+
+  unsigned VGPRLimitBias = 0;
+
   GCNSchedStrategy(const MachineSchedContext *C);
 
   SUnit *pickNode(bool &IsTopNode) override;
Index: llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -73,12 +73,18 @@
   VGPRCriticalLimit =
       std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
 
-  // Subtract error margin from register limits and avoid overflow.
+  // Subtract error margin and bias from register limits and avoid overflow.
+  SGPRCriticalLimit =
+      std::min(SGPRCriticalLimit - SGPRLimitBias, SGPRCriticalLimit);
   SGPRCriticalLimit =
       std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit);
+  VGPRCriticalLimit =
+      std::min(VGPRCriticalLimit - VGPRLimitBias, VGPRCriticalLimit);
   VGPRCriticalLimit =
       std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit);
+  SGPRExcessLimit = std::min(SGPRExcessLimit - SGPRLimitBias, SGPRExcessLimit);
   SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
+  VGPRExcessLimit = std::min(VGPRExcessLimit - VGPRLimitBias, VGPRExcessLimit);
   VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
 }
 
@@ -670,7 +676,8 @@
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressivly try to reduce register pressure in the unclustered high RP
   // stage. Temporarily increase occupancy target in the region.
-  S.ErrorMargin = S.HighRPErrorMargin;
+  S.SGPRLimitBias = S.HighRPSGPRBias;
+  S.VGPRLimitBias = S.HighRPVGPRBias;
   if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
     MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
 
@@ -735,7 +742,7 @@
 
 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
   SavedMutations.swap(DAG.Mutations);
-  S.ErrorMargin = S.DefaultErrorMargin;
+  S.SGPRLimitBias = S.VGPRLimitBias = 0;
   if (DAG.MinOccupancy > InitialOccupancy) {
     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
       DAG.RegionsWithMinOcc[IDX] =
Index: llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -18,8 +18,8 @@
   ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
   ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+  ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
@@ -43,8 +43,8 @@
   ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
   ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
+  ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
   ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
Index: llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -8,9 +8,9 @@
   ; GCN-NEXT: liveins: $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
+  ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
-  ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1769482 /* regdef:VGPR_32 */, def undef %22.sub0
   ; GCN-NEXT: undef %24.sub0:av_64 = COPY %22.sub0
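
The std::min(Limit - Bias, Limit) pattern used in the GCNSchedStrategy.cpp hunk above relies on unsigned wraparound to avoid underflow: when the bias (or error margin) exceeds the limit, the subtraction wraps to a very large value, so std::min keeps the original limit unchanged. A minimal standalone sketch of that idiom follows; applyBias and the sample values are illustrative only, not part of the patch.

#include <algorithm>
#include <cstdio>

// Lower an unsigned limit by Bias without wrapping past zero: if Bias > Limit
// the subtraction wraps to a huge value and std::min keeps Limit unchanged.
// (Hypothetical helper for illustration; not from the patch.)
static unsigned applyBias(unsigned Limit, unsigned Bias) {
  return std::min(Limit - Bias, Limit);
}

int main() {
  std::printf("%u\n", applyBias(256, 7)); // prints 249: normal case
  std::printf("%u\n", applyBias(3, 7));   // prints 3: bias exceeds limit, limit kept
  return 0;
}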