diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1287,6 +1287,20 @@
   return LoadStoreOp;
 }
 
+// Returns true if \p Reg is used for returning a value from this function.
+static bool isReturnValueCCReg(MachineInstr &MI, Register Reg) {
+  if (!MI.isReturn())
+    return false;
+
+  // Check the operand list of the return instruction for the CC regs used
+  // for outgoing values.
+  for (MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.getReg() == Reg)
+      return true;
+  }
+  return false;
+}
+
 void SIRegisterInfo::buildSpillLoadStore(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
     unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
@@ -1644,6 +1658,32 @@
 
     if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+
+    // The epilog restore of a wwm-scratch register can cause an undesired
+    // optimization during machine-cp after the PrologEpilogInserter, when the
+    // same register was also chosen by ABI lowering to hold the return value,
+    // set through a COPY instruction. As shown below, the epilog reload makes
+    // the earlier COPY appear dead to machine-cp. To fix it, add the register
+    // as a tied implicit operand of such restore instructions; this records a
+    // use for the preceding COPY.
+    //    ...
+    //    v0 is used in a WWM operation; it needs the WWM spill at prolog/epilog.
+    //    $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
+    //    ...
+    //    Epilog block:
+    //    $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
+    //    ...
+    //    WWM spill restore to preserve the inactive lanes of v0.
+    //    $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
+    //    $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
+    //    $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    //    ...
+    //    SI_RETURN implicit $vgpr0
+    //    ...
+    if (!IsStore && MBB.isReturnBlock() && isReturnValueCCReg(*MI, SubReg)) {
+      MIB.addReg(SubReg, RegState::Implicit);
+      MIB->tieOperands(0, MIB->getNumOperands() - 1);
+    }
   }
 
   if (ScratchOffsetRegDelta != 0) {
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
--- a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
+++ b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
@@ -28,7 +28,7 @@
     ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0
     ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
     ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
     ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0
     renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# The COPY that moves the return value into VGPR0 should not be removed during machine-cp. The spill restore of the same register
+# that follows is meant to reload only its inactive lanes. Marking the reg itself as a tied-op in the spill reload prevents the undesired optimization.
+
+---
+name: wwm_scratch_reg_spill_reload
+tracksRegLiveness: true
+machineFunctionInfo:
+  wwmReservedRegs: ['$vgpr0']
+  isEntryFunction: false
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body: |
+  bb.0:
+    liveins: $sgpr20, $vgpr1
+    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload
+    ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    ; GCN-NEXT: $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: SI_RETURN implicit $vgpr0
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
+    SI_RETURN implicit $vgpr0
+...
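
Note: the tied-operand technique the patch applies can be sketched in
isolation as below. This is a minimal sketch, not part of the patch: the
free-standing helper attachTiedUse() is hypothetical, while
MachineInstrBuilder::addReg() and MachineInstr::tieOperands() are the same
calls used in buildSpillLoadStore above.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Illustrative helper, not LLVM API: append Reg as an implicit use of the
// reload instruction and tie it to the reload's def (operand 0). machine-cp
// then sees a read of the register's previous value, so an earlier COPY
// defining the same register cannot be treated as dead and erased.
static void attachTiedUse(MachineInstrBuilder &MIB, Register Reg) {
  MIB.addReg(Reg, RegState::Implicit);            // implicit use of Reg
  MIB->tieOperands(0, MIB->getNumOperands() - 1); // tie that use to the def
}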