Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -875,8 +875,8 @@ return; } - if (RC->hasSuperClassEq(&AMDGPU::VReg_64RegClass) && - !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); + if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { if (ST.hasPackedFP32Ops()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) .addImm(SISrcMods::OP_SEL_1) @@ -895,7 +895,7 @@ const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); if (RI.isSGPRClass(RC)) { - if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { + if (!RI.isSGPRClass(SrcRC)) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } @@ -906,12 +906,13 @@ unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.hasAGPRs(RC)) { - Opcode = (RI.hasVGPRs(RI.getPhysRegClass(SrcReg))) ? + Opcode = (RI.hasVGPRs(SrcRC)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; - } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && - !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + (RI.isProperlyAlignedRC(*RC) && + (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. if (ST.hasPackedFP32Ops()) { Opcode = AMDGPU::V_PK_MOV_B32; @@ -3831,10 +3832,7 @@ } // Check that this is the aligned version of the class. 
- if (!RC || ((IsVGPR && !RC->hasSuperClassEq(RI.getVGPRClassForBitWidth( - RI.getRegSizeInBits(*RC)))) || - (IsAGPR && !RC->hasSuperClassEq(RI.getAGPRClassForBitWidth( - RI.getRegSizeInBits(*RC)))))) { + if (!RC || !RI.isProperlyAlignedRC(*RC)) { ErrInfo = "Subtarget requires even aligned vector registers"; return false; } Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -326,6 +326,10 @@ // \returns \p Reg otherwise. MCPhysReg get32BitRegister(MCPhysReg Reg) const; + /// \returns true if the register class \p RC is properly aligned for the + /// subtarget; always true when the subtarget does not need aligned VGPRs. + bool isProperlyAlignedRC(const TargetRegisterClass &RC) const; + /// Return all SGPR128 which satisfy the waves per execution unit requirement /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2344,6 +2344,18 @@ return AMDGPU::NoRegister; } +bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { + if (!ST.needsAlignedVGPRs()) + return true; + + if (hasVGPRs(&RC)) + return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); + if (hasAGPRs(&RC)) + return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); + + return true; +} + bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { switch (PhysReg) { case AMDGPU::SGPR_NULL: Index: llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -158,3 +158,193 @@ ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit 
$exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... + +--- +name: copy_v64_to_v64_unaligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr2_vgpr3 + ; GFX908-LABEL: name: copy_v64_to_v64_unaligned + ; GFX908: liveins: $vgpr2_vgpr3 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX90A-LABEL: name: copy_v64_to_v64_unaligned + ; GFX90A: liveins: $vgpr2_vgpr3 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + $vgpr1_vgpr2 = COPY killed $vgpr2_vgpr3, implicit $exec +... + +--- +name: copy_v64_unaligned_to_v64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr3_vgpr4 + ; GFX908-LABEL: name: copy_v64_unaligned_to_v64 + ; GFX908: liveins: $vgpr3_vgpr4 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX90A-LABEL: name: copy_v64_unaligned_to_v64 + ; GFX90A: liveins: $vgpr3_vgpr4 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + $vgpr0_vgpr1 = COPY killed $vgpr3_vgpr4, implicit $exec +... 
+ +--- +name: copy_v128_to_v128_unaligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX908-LABEL: name: copy_v128_to_v128_unaligned + ; GFX908: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX908: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX90A-LABEL: name: copy_v128_to_v128_unaligned + ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX90A: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec +... 
+ +--- +name: copy_v128_unaligned_to_v128 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX908-LABEL: name: copy_v128_unaligned_to_v128 + ; GFX908: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A-LABEL: name: copy_v128_unaligned_to_v128 + ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec +... 
+ +--- +name: copy_s64_to_v64_unaligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8_sgpr9 + ; GFX908-LABEL: name: copy_s64_to_v64_unaligned + ; GFX908: liveins: $sgpr8_sgpr9 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX90A-LABEL: name: copy_s64_to_v64_unaligned + ; GFX90A: liveins: $sgpr8_sgpr9 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + $vgpr1_vgpr2 = COPY killed $sgpr8_sgpr9, implicit $exec +... + +--- +name: copy_s128_to_v128_unaligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX908-LABEL: name: copy_s128_to_v128_unaligned + ; GFX908: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX908: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX90A-LABEL: name: copy_s128_to_v128_unaligned + ; GFX90A: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX90A: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + $vgpr1_vgpr2_vgpr3_vgpr4 = 
COPY killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec +... + +--- +name: copy_v96_to_v96_unaligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr8_vgpr9_vgpr10 + ; GFX908-LABEL: name: copy_v96_to_v96_unaligned + ; GFX908: liveins: $vgpr8_vgpr9_vgpr10 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A-LABEL: name: copy_v96_to_v96_unaligned + ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + $vgpr1_vgpr2_vgpr3 = COPY killed $vgpr8_vgpr9_vgpr10, implicit $exec +... 
+ +--- +name: copy_v96_unaligned_to_v96 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr7_vgpr8_vgpr9 + ; GFX908-LABEL: name: copy_v96_unaligned_to_v96 + ; GFX908: liveins: $vgpr7_vgpr8_vgpr9 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX90A-LABEL: name: copy_v96_unaligned_to_v96 + ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr7_vgpr8_vgpr9, implicit $exec +... + +--- +name: copy_s96_to_v96 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2 + ; GFX908-LABEL: name: copy_s96_to_v96 + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-LABEL: name: copy_s96_to_v96 + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + $vgpr0_vgpr1_vgpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec +... 
+ +--- +name: copy_s96_to_v96_unaligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2 + ; GFX908-LABEL: name: copy_s96_to_v96_unaligned + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-LABEL: name: copy_s96_to_v96_unaligned + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + $vgpr1_vgpr2_vgpr3 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec +...