Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4222,11 +4222,35 @@ return BB; } case AMDGPU::DS_GWS_INIT: - case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_BARRIER: + if (Subtarget->needsAlignedVGPRs()) { + // Add implicit aligned super-reg to force alignment on the data operand. + const DebugLoc &DL = MI.getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + Register DataReg = Op->getReg(); + bool IsAGPR = TRI->isAGPR(MRI, DataReg); + Register Undef = MRI.createVirtualRegister( + IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass + : &AMDGPU::VReg_64_Align2RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(DataReg) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + Op->setReg(NewVR); + Op->setSubReg(AMDGPU::sub0); + MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); + } + LLVM_FALLTHROUGH; + case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: - case AMDGPU::DS_GWS_BARRIER: // A s_waitcnt 0 is required to be the instruction immediately following. 
if (getSubtarget()->hasGWSAutoReplay()) { bundleInstWithWaitcnt(MI); Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4344,6 +4344,31 @@ } } + if (ST.needsAlignedVGPRs() && !MRI.isSSA() && + (MI.getOpcode() == AMDGPU::DS_GWS_INIT || + MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { + // Register class of these instructions is fixed in the finalize lowering, + // thus do not check it before we are done with SSA which is guaranteed past + // that point. + const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); + Register Reg = Op->getReg(); + bool Aligned = true; + if (Reg.isPhysical()) { + Aligned = !(RI.getHWRegIndex(Reg) & 1); + } else { + const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && + !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); + } + + if (!Aligned) { + ErrInfo = "Subtarget requires even aligned vector registers " + "for DS_GWS instructions"; + return false; + } + } + return true; } Index: llvm/test/CodeGen/AMDGPU/ds_gws_align.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/ds_gws_align.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s + +; GCN-LABEL: {{^}}gws_init_odd_reg: +; GFX908-DAG: ds_gws_init v1 gds 
+; GFX90A-DAG: ds_gws_init v2 gds +; GCN-DAG: ds_gws_init v0 gds +define amdgpu_ps void @gws_init_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_sema_br_odd_reg: +; GFX908-DAG: ds_gws_sema_br v1 gds +; GFX90A-DAG: ds_gws_sema_br v2 gds +; GCN-DAG: ds_gws_sema_br v0 gds +define amdgpu_ps void @gws_sema_br_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_barrier_odd_reg: +; GFX908-DAG: ds_gws_barrier v1 gds +; GFX90A-DAG: ds_gws_barrier v2 gds +; GCN-DAG: ds_gws_barrier v0 gds +define amdgpu_ps void @gws_barrier_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_init_odd_agpr: +; GFX908-COUNT-2: ds_gws_init v{{[0-9]+}} gds +; GFX90A-COUNT-2: ds_gws_init {{[va][0-9]?[02468]}} gds +define amdgpu_ps void @gws_init_odd_agpr(<4 x i32> %arg) { +bb: + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0) + %agpr.0 = extractelement <4 x i32> %mai, i32 0 + %agpr.1 = extractelement <4 x i32> %mai, i32 1 + call void @llvm.amdgcn.ds.gws.init(i32 %agpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.init(i32 %agpr.1, i32 0) + ret void +} + +declare void @llvm.amdgcn.ds.gws.init(i32, i32) +declare void @llvm.amdgcn.ds.gws.sema.br(i32, i32) +declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 
x i32>, i32, i32, i32) Index: llvm/test/CodeGen/AMDGPU/ds_gws_align_agpr.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/ds_gws_align_agpr.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -verify-machineinstrs -run-pass=finalize-isel -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: gws_init_odd_agpr +body: | + bb.0: + + ; GCN-LABEL: name: gws_init_odd_agpr + ; GCN: $m0 = S_MOV_B32 0 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[DEF1:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF + ; GCN: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 killed [[DEF]], killed [[DEF]], killed [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN: [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE [[V_MFMA_I32_4X4X4I8_e64_]], %subreg.sub0, [[DEF2]], %subreg.sub1 + ; GCN: BUNDLE implicit killed [[REG_SEQUENCE]], implicit $m0, implicit $exec { + ; GCN: DS_GWS_INIT killed [[REG_SEQUENCE]].sub0, 0, implicit $m0, implicit $exec, implicit [[REG_SEQUENCE]] :: (store 4 into custom "GWSResource") + ; GCN: S_WAITCNT 0 + ; GCN: } + ; GCN: [[DEF3:%[0-9]+]]:agpr_32 = IMPLICIT_DEF + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE [[V_MFMA_I32_4X4X4I8_e64_]], %subreg.sub0, [[DEF3]], %subreg.sub1 + ; GCN: BUNDLE implicit killed [[REG_SEQUENCE1]], implicit $m0, implicit $exec { + ; GCN: DS_GWS_INIT killed [[REG_SEQUENCE1]].sub0, 0, implicit $m0, implicit $exec, implicit [[REG_SEQUENCE1]] :: (store 4 into custom "GWSResource") + ; GCN: S_WAITCNT 0 + ; GCN: } + ; GCN: S_ENDPGM 0 + SI_INIT_M0 0, implicit-def dead $m0 + %0:vgpr_32 = IMPLICIT_DEF + %1:areg_128_align2 = IMPLICIT_DEF + %2:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 killed %0, killed %0, killed %1, 0, 0, 0, implicit $mode, implicit $exec + DS_GWS_INIT killed %2.sub0, 
0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + DS_GWS_INIT killed %2.sub1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir @@ -0,0 +1,38 @@ +# RUN: not --crash llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX90A-ERR %s + +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT killed %0.sub1:areg_128_align2, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT killed %0.sub3:areg_128_align2, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_SEMA_BR killed %1.sub1:vreg_64_align2, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_BARRIER killed %2.sub0:vreg_64, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT killed %3:vgpr_32, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT $vgpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** 
Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT $agpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +--- +name: gws_odd_vgpr +tracksRegLiveness: true +body: | + bb.0: + %0:areg_128_align2 = IMPLICIT_DEF + DS_GWS_INIT killed %0.sub1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %0:areg_128_align2 = IMPLICIT_DEF + DS_GWS_INIT killed %0.sub3, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %1:vreg_64_align2 = IMPLICIT_DEF + DS_GWS_SEMA_BR killed %1.sub1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %2:vreg_64 = IMPLICIT_DEF + DS_GWS_BARRIER killed %2.sub0, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %3:vgpr_32 = IMPLICIT_DEF + DS_GWS_INIT killed %3, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + $vgpr1 = IMPLICIT_DEF + DS_GWS_INIT $vgpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + $agpr1 = IMPLICIT_DEF + DS_GWS_INIT $agpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + S_ENDPGM 0 + +...