diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1644,6 +1644,33 @@ if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + + // The epilog restore of a wwm-scratch register can cause undesired + // optimization during machine-cp post PrologEpilogInserter if the same + // register was assigned for return value ABI lowering with a COPY + // instruction. As given below, with the epilog reload, the earlier COPY + // appeared to be dead during machine-cp. + // ... + // v0 in WWM operation, needs the WWM spill at prolog/epilog. + // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 + // ... + // Epilog block: + // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 + // ... + // WWM spill restore to preserve the inactive lanes of v0. + // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 + // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 + // $exec = S_MOV_B64 killed $sgpr4_sgpr5 + // ... + // SI_RETURN implicit $vgpr0 + // ... + // To fix it, mark the same reg as a tied op for such restore instructions + // so that it marks a usage for the preceding COPY. 
+ if (!IsStore && MI != MBB.end() && MI->isReturn() && + MI->readsRegister(SubReg, this)) { + MIB.addReg(SubReg, RegState::Implicit); + MIB->tieOperands(0, MIB->getNumOperands() - 1); + } } if (ScratchOffsetRegDelta != 0) { diff --git a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir --- a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir +++ b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir @@ -28,7 +28,7 @@ ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0 ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0 renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# The COPY that moves the return value to VGPR0 should not be removed during machine-cp. The spill restore of the same register that follows is +# meant to only reload its inactive lanes. 
Marking the reg itself as the tied-op in the spill reload prevents the undesired optimization. + +--- +name: wwm_scratch_reg_spill_reload_of_outgoing_reg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr0'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_reg + ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + ; GCN-NEXT: $vgpr0 = COPY killed renamable $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0 + $vgpr0 = IMPLICIT_DEF + $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + $vgpr0 = COPY killed renamable $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0 +... + +# The reload of vgpr0 requires the tied-op as it is a subreg in the outgoing tuple register vgpr0_vgpr1. +# The vgpr2 doesn't need the tied-op in the reload as it isn't holding any return value. 
+--- +name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr0', '$vgpr2'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $sgpr21, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg + ; GCN: liveins: $sgpr20, $sgpr21, $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2 + ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1 + $vgpr0 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2 + $vgpr0 = COPY $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0_vgpr1 +... 
+ +# Tied op not required in the spill reload of vgpr2. + +--- +name: wwm_scratch_reg_spill_reload_different_outgoing_reg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr2'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_different_outgoing_reg + ; GCN: liveins: $sgpr20, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2 + ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1 + $vgpr2 = IMPLICIT_DEF + $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2 + $vgpr0 = COPY $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0_vgpr1 +... + +# Tied op not required in the spill reload of vgpr40 which is in the CSR range. 
+--- +name: wwm_csr_spill_reload +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr40'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_csr_spill_reload + ; GCN: liveins: $sgpr20, $vgpr1, $vgpr40 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr40 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40 + ; GCN-NEXT: $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY killed $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0 + $vgpr40 = IMPLICIT_DEF + $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40 + $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec + $vgpr0 = COPY killed $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0 +...