Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2985,6 +2985,10 @@ // Only the first lane is executes, so readfirstlane is safe. substituteSimpleCopyRegs(OpdMapper, 1); constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + if (Subtarget.needsAlignedVGPRs()) { + Register DataReg = MI.getOperand(1).getReg(); + constrainGenericRegister(DataReg, AMDGPU::VReg_32_Align2RegClass, MRI); + } return; } case Intrinsic::amdgcn_ds_gws_sema_v: Index: llvm/lib/Target/AMDGPU/DSInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/DSInstructions.td +++ llvm/lib/Target/AMDGPU/DSInstructions.td @@ -373,6 +373,7 @@ let has_gws_data0 = 1; let hasSideEffects = 1; + let hasPostISelHook = 1; } class DS_VOID : DS_PseudoisMIMG(MI) && !MI.mayStore()) AddIMGInit(MI); + + if (Subtarget->needsAlignedVGPRs()) { + switch (MI.getOpcode()) { + default: + break; + case AMDGPU::DS_GWS_INIT: + case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_BARRIER: { + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + Register Reg = TII->getNamedOperand(MI, AMDGPU::OpName::data0)->getReg(); + MRI.setRegClass(Reg, TRI->isAGPR(MRI, Reg) + ? &AMDGPU::AReg_32_Align2RegClass + : &AMDGPU::VReg_32_Align2RegClass); + break; + } + } + } } static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4675,7 +4675,21 @@ assert(DefinedRC); if (!isLegalRegOperand(MRI, OpInfo, *MO)) return false; - bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); + const MachineOperand &CurOp = MI.getOperand(OpIdx); + const auto *RC = RI.getRegClassForReg(MRI, MO->getReg()); + if (ST.needsAlignedVGPRs() && CurOp.isReg() && &CurOp != MO) { + const auto *CurRC = RI.getRegClassForReg(MRI, CurOp.getReg()); + if (RI.isAlignedRC(CurRC) && RI.hasVectorRegisters(RC)) { + unsigned Sub = MO->getSubReg(); + if (Sub && + !RI.getCompatibleSubRegClass(RI.getEquivalentVGPRClass(RC), + RI.getEquivalentVGPRClass(CurRC), Sub)) + return false; + if (!Sub && !RI.isAlignedRC(RC)) + return false; + } + } + bool IsAGPR = RI.hasAGPRs(RC); if (IsAGPR && !ST.hasMAIInsts()) return false; unsigned Opc = MI.getOpcode(); Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -143,6 +143,9 @@ LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); + LLVM_READONLY + bool isAlignedRC(const TargetRegisterClass *RC) const; + /// Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. const TargetRegisterClass *getPhysRegClass(MCRegister Reg) const; Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1886,6 +1886,8 @@ static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 32) + return &AMDGPU::VReg_32_Align2RegClass; if (BitWidth <= 64) return &AMDGPU::VReg_64_Align2RegClass; if (BitWidth <= 96) @@ -1942,6 +1944,8 @@ static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 32) + return &AMDGPU::AReg_32_Align2RegClass; if (BitWidth <= 64) return &AMDGPU::AReg_64_Align2RegClass; if (BitWidth <= 96) @@ -1998,6 +2002,35 @@ return nullptr; } +bool SIRegisterInfo::isAlignedRC(const TargetRegisterClass *RC) const { + if (!RC) + return false; + + switch (RC->getID()) { + default: + return false; + case AMDGPU::VReg_32_Align2RegClassID: + case AMDGPU::VReg_64_Align2RegClassID: + case AMDGPU::VReg_96_Align2RegClassID: + case AMDGPU::VReg_128_Align2RegClassID: + case AMDGPU::VReg_160_Align2RegClassID: + case AMDGPU::VReg_192_Align2RegClassID: + case AMDGPU::VReg_256_Align2RegClassID: + case AMDGPU::VReg_512_Align2RegClassID: + case AMDGPU::VReg_1024_Align2RegClassID: + case AMDGPU::AReg_32_Align2RegClassID: + case AMDGPU::AReg_64_Align2RegClassID: + case AMDGPU::AReg_96_Align2RegClassID: + case AMDGPU::AReg_128_Align2RegClassID: + case AMDGPU::AReg_160_Align2RegClassID: + case AMDGPU::AReg_192_Align2RegClassID: + case AMDGPU::AReg_256_Align2RegClassID: + case AMDGPU::AReg_512_Align2RegClassID: + case AMDGPU::AReg_1024_Align2RegClassID: + return true; + } +} + // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. const TargetRegisterClass * @@ -2011,6 +2044,8 @@ &AMDGPU::SReg_32RegClass, &AMDGPU::AGPR_32RegClass, &AMDGPU::AGPR_32RegClass, + &AMDGPU::VReg_32_Align2RegClass, + &AMDGPU::AReg_32_Align2RegClass, &AMDGPU::VReg_64_Align2RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, @@ -2105,7 +2140,9 @@ const TargetRegisterClass * SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { unsigned Size = getRegSizeInBits(*SRC); - const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); + const TargetRegisterClass *VRC = isAlignedRC(SRC) + ? getAlignedVGPRClassForBitWidth(Size) + : getVGPRClassForBitWidth(Size); assert(VRC && "Invalid register class size"); return VRC; } @@ -2113,7 +2150,9 @@ const TargetRegisterClass * SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { unsigned Size = getRegSizeInBits(*SRC); - const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); + const TargetRegisterClass *ARC = isAlignedRC(SRC) + ? getAlignedAGPRClassForBitWidth(Size) + : getAGPRClassForBitWidth(Size); assert(ARC && "Invalid register class size"); return ARC; } Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -806,6 +806,8 @@ def _Align2 : VRegClassBase; } +def VReg_32_Align2 : VRegClassBase<1, [OtherVT], (decimate VGPR_32, 2)>; + defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], (add VGPR_64)>; defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; @@ -827,6 +829,8 @@ } } +def AReg_32_Align2 : VRegClassBase<1, [OtherVT], (decimate AGPR_32, 2)>; + defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], (add AGPR_64)>; defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1604,6 +1604,8 @@ case AMDGPU::SReg_32RegClassID: case AMDGPU::SReg_32_XM0RegClassID: case AMDGPU::SRegOrLds_32RegClassID: + case AMDGPU::VReg_32_Align2RegClassID: + case AMDGPU::AReg_32_Align2RegClassID: return 32; case AMDGPU::SGPR_64RegClassID: case AMDGPU::VS_64RegClassID: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -138,7 +138,7 @@ ; CHECK: bb.1 (%ir-block.0): ; CHECK: liveins: $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2 + ; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 3014666 /* regdef:VReg_64 */, def %2 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %2 ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) Index: llvm/test/CodeGen/AMDGPU/ds_gws_align.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/ds_gws_align.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s + +; GCN-LABEL: {{^}}gws_init_odd_reg: +; GFX908-DAG: ds_gws_init v1 gds +; GFX90A-DAG: ds_gws_init v2 gds +; GCN-DAG: ds_gws_init v0 gds +define amdgpu_ps void @gws_init_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_sema_br_odd_reg: +; GFX908-DAG: ds_gws_sema_br v1 gds +; GFX90A-DAG: ds_gws_sema_br v2 gds +; GCN-DAG: ds_gws_sema_br v0 gds +define amdgpu_ps void @gws_sema_br_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_barrier_odd_reg: +; GFX908-DAG: ds_gws_barrier v1 gds +; GFX90A-DAG: ds_gws_barrier v2 gds +; GCN-DAG: ds_gws_barrier v0 gds +define amdgpu_ps void @gws_barrier_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_init_odd_agpr: +; GFX908-COUNT-2: ds_gws_init v{{[0-9]+}} gds +; GFX90A: ds_gws_init a{{[0-9]*[02468]}} gds +; GFX90A: ds_gws_init v{{[0-9]*[02468]}} gds +define amdgpu_ps void @gws_init_odd_agpr(<4 x i32> %arg) { +bb: + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0) + %agpr.0 = extractelement <4 x i32> %mai, i32 0 + %agpr.1 = extractelement <4 x i32> %mai, i32 1 + call void @llvm.amdgcn.ds.gws.init(i32 %agpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.init(i32 %agpr.1, i32 0) + ret void +} + +declare void @llvm.amdgcn.ds.gws.init(i32, i32) +declare void @llvm.amdgcn.ds.gws.sema.br(i32, i32) +declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) Index: llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4 + ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5046282 /* regdef:SGPR_128 */, def %4 ; GFX908: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:SGPR_128 */, [[COPY]] ; GFX908: S_ENDPGM 0 ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4 + ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5046282 /* regdef:SGPR_128 */, def %4 ; GFX90A: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -26,15 +26,15 @@ define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4390922 /* regdef:VReg_128 */, def %4 + ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4653066 /* regdef:VReg_128 */, def %4 ; GFX908: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 - ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4390921 /* reguse:VReg_128 */, [[COPY]] + ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4653065 /* reguse:VReg_128 */, [[COPY]] ; GFX908: S_ENDPGM 0 ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4521994 /* regdef:VReg_128_Align2 */, def %4 + ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:VReg_128_Align2 */, def %4 ; GFX90A: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 - ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -44,15 +44,15 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4325386 /* regdef:AReg_128 */, def %4 + ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4587530 /* regdef:AReg_128 */, def %4 ; GFX908: [[COPY:%[0-9]+]]:areg_128 = COPY %4 - ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_128 */, [[COPY]] + ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_128 */, [[COPY]] ; GFX908: S_ENDPGM 0 ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4456458 /* regdef:AReg_128_Align2 */, def %4 + ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4718602 /* regdef:AReg_128_Align2 */, def %4 ; GFX90A: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 - ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4456457 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4718601 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val)