Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -720,10 +720,24 @@ if (UseMI->isCopy() && OpToFold.isReg() && UseMI->getOperand(0).getReg().isVirtual() && !UseMI->getOperand(1).getSubReg()) { - LLVM_DEBUG(dbgs() << "Folding " << OpToFold - << "\n into " << *UseMI << '\n'); + unsigned Size = TII->getOpSize(*UseMI, 1); Register UseReg = OpToFold.getReg(); + Register DstReg = UseMI->getOperand(0).getReg(); + + const bool DstIsAGPR = TRI->isAGPR(*MRI, DstReg); + + SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs; + bool IsAGPRRegSeqInit = + Size > 4 && DstIsAGPR && + getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII, + *MRI); + if (DstIsAGPR && !IsAGPRRegSeqInit && + !TRI->isVGPR(*MRI, OpToFold.getReg())) { + LLVM_DEBUG(dbgs() << "Not folding copy to AGPR\n"); + return; + } + UseMI->getOperand(1).setReg(UseReg); UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); @@ -734,10 +748,7 @@ // can only accept VGPR or inline immediate. Recreate a reg_sequence with // its initializers right here, so we will rematerialize immediates and // avoid copies via different reg classes. 
- SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs; - if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && - getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII, - *MRI)) { + if (IsAGPRRegSeqInit) { const DebugLoc &DL = UseMI->getDebugLoc(); MachineBasicBlock &MBB = *UseMI->getParent(); @@ -811,8 +822,7 @@ if (Size != 4) return; - if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && - TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg())) + if (DstIsAGPR && TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg())) UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32)); else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -169,6 +169,13 @@ return hasAGPRs(RC) && !hasVGPRs(RC); } + /// \returns true if this class contains only VGPR registers FIXME: This works + /// correctly for classes obtained from VGPR vregs, but will be incorrect for + /// the combined VS_* classes. + bool isVGPRClass(const TargetRegisterClass *RC) const { + return hasVGPRs(RC); + } + /// \returns true if this class contains VGPR registers. 
bool hasVGPRs(const TargetRegisterClass *RC) const; Index: llvm/test/CodeGen/AMDGPU/fold-operands-agpr-copies.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/fold-operands-agpr-copies.mir @@ -0,0 +1,184 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -run-pass=si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GFX908 %s + +--- +name: no_fold_s32_to_v32_to_a32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: no_fold_s32_to_v32_to_a32 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %1:sreg_64, 0, 0, 0 :: (load 4, addrspace 4) + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX908: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]] + ; GFX908: S_ENDPGM 0, implicit [[COPY1]] + %0:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %3:sreg_64, 0, 0, 0 :: (load 4, addrspace 4) + %1:vgpr_32 = COPY %0 + %2:agpr_32 = COPY %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: no_fold_a32_to_v32_to_a32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0 + + ; GFX908-LABEL: name: no_fold_a32_to_v32_to_a32 + ; GFX908: liveins: $agpr0 + ; GFX908: [[COPY:%[0-9]+]]:agpr_32 = COPY $agpr0 + ; GFX908: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX908: [[COPY2:%[0-9]+]]:agpr_32 = COPY [[COPY1]] + ; GFX908: S_ENDPGM 0, implicit [[COPY2]] + %0:agpr_32 = COPY $agpr0 + %1:vgpr_32 = COPY %0 + %2:agpr_32 = COPY %1 + S_ENDPGM 0, implicit %2 + +... 
+ +--- +name: fold_imm_a32_to_v32_to_a32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: fold_imm_a32_to_v32_to_a32 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: S_ENDPGM 0, implicit [[V_ACCVGPR_WRITE_B32_]] + %0:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + %1:vgpr_32 = COPY %0 + %2:agpr_32 = COPY %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: fold_imm_s32_to_v32_to_a32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: fold_imm_s32_to_v32_to_a32 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: S_ENDPGM 0, implicit [[V_ACCVGPR_WRITE_B32_]] + %0:sreg_32 = S_MOV_B32 0 + %1:vgpr_32 = COPY %0 + %2:agpr_32 = COPY %1 + S_ENDPGM 0, implicit %2 + +... + + +# Make sure we still fold immediate materializes into the AGPR moves. +--- +name: reg_sequence_imm_s64_to_a64_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: reg_sequence_imm_s64_to_a64_0 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX908: [[COPY:%[0-9]+]]:areg_64 = COPY [[REG_SEQUENCE]] + ; GFX908: S_ENDPGM 0, implicit [[COPY]] + %0:sreg_32_xm0_xexec = S_MOV_B32 0 + %1:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 + %2:areg_64 = COPY %1 + S_ENDPGM 0, implicit %2 + +... 
+ +--- +name: reg_sequence_imm_s64_to_a64_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: reg_sequence_imm_s64_to_a64_1 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[V_ACCVGPR_WRITE_B32_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:areg_64 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_1]], %subreg.sub1 + ; GFX908: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:sreg_32_xm0_xexec = S_MOV_B32 0 + %1:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 + %2:sreg_64 = COPY %1 + %3:areg_64 = COPY %2 + S_ENDPGM 0, implicit %3 + +... + +--- +name: reg_sequence_imm_s64_to_v64_to_a64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: reg_sequence_imm_s64_to_v64_to_a64 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[V_ACCVGPR_WRITE_B32_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:areg_64 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_1]], %subreg.sub1 + ; GFX908: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:sreg_32_xm0_xexec = S_MOV_B32 0 + %1:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 + %2:vreg_64 = COPY %1 + %3:areg_64 = COPY %2 + S_ENDPGM 0, implicit %3 + +... 
+ +--- +name: reg_sequence_imm_v64_to_a64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: reg_sequence_imm_v64_to_a64 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[V_ACCVGPR_WRITE_B32_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:areg_64 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_1]], %subreg.sub1 + ; GFX908: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 + %2:vreg_64 = COPY %1 + %3:areg_64 = COPY %2 + S_ENDPGM 0, implicit %3 + +... + +--- +name: reg_squence_imm_a64_to_v64_to_a64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GFX908-LABEL: name: reg_squence_imm_a64_to_v64_to_a64 + ; GFX908: liveins: $sgpr0 + ; GFX908: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[V_ACCVGPR_WRITE_B32_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:agpr_32 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_1]], %subreg.sub1 + ; GFX908: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + %1:areg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 + %2:vreg_64 = COPY %1 + %3:agpr_32 = COPY %2 + S_ENDPGM 0, implicit %3 + +...