Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1390,7 +1390,24 @@ if (HasVSrc) { Register VSrc = MI.getOperand(1).getReg(); - MIB.addReg(VSrc); + + if (STI.needsAlignedVGPRs()) { + // Add implicit aligned super-reg to force alignment on the data operand. + Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(VSrc, 0, MI.getOperand(1).getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + MIB.addReg(NewVR, 0, AMDGPU::sub0); + MIB.addReg(NewVR, RegState::Implicit); + } else { + MIB.addReg(VSrc); + } + if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) return false; } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4222,11 +4222,35 @@ return BB; } case AMDGPU::DS_GWS_INIT: - case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_BARRIER: + if (Subtarget->needsAlignedVGPRs()) { + // Add implicit aligned super-reg to force alignment on the data operand. + const DebugLoc &DL = MI.getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + Register DataReg = Op->getReg(); + bool IsAGPR = TRI->isAGPR(MRI, DataReg); + Register Undef = MRI.createVirtualRegister( + IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass + : &AMDGPU::VReg_64_Align2RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(DataReg, 0, Op->getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + Op->setReg(NewVR); + Op->setSubReg(AMDGPU::sub0); + MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); + } + LLVM_FALLTHROUGH; + case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: - case AMDGPU::DS_GWS_BARRIER: // A s_waitcnt 0 is required to be the instruction immediately following. if (getSubtarget()->hasGWSAutoReplay()) { bundleInstWithWaitcnt(MI); Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4344,6 +4344,28 @@ } } + if (ST.needsAlignedVGPRs() && + (MI.getOpcode() == AMDGPU::DS_GWS_INIT || + MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { + const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); + Register Reg = Op->getReg(); + bool Aligned = true; + if (Reg.isPhysical()) { + Aligned = !(RI.getHWRegIndex(Reg) & 1); + } else { + const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && + !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); + } + + if (!Aligned) { + ErrInfo = "Subtarget requires even aligned vector registers " + "for DS_GWS instructions"; + return false; + } + } + return true; } Index: llvm/test/CodeGen/AMDGPU/ds_gws_align.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/ds_gws_align.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s + +; GCN-LABEL: {{^}}gws_init_odd_reg: +; GFX908-DAG: ds_gws_init v1 gds +; GFX90A-DAG: ds_gws_init v2 gds +; GCN-DAG: ds_gws_init v0 gds +define amdgpu_ps void @gws_init_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.init(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_sema_br_odd_reg: +; GFX908-DAG: ds_gws_sema_br v1 gds +; GFX90A-DAG: ds_gws_sema_br v2 gds +; GCN-DAG: ds_gws_sema_br v0 gds +define amdgpu_ps void @gws_sema_br_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.sema.br(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_barrier_odd_reg: +; GFX908-DAG: ds_gws_barrier v1 gds +; GFX90A-DAG: ds_gws_barrier v2 gds +; GCN-DAG: ds_gws_barrier v0 gds +define amdgpu_ps void @gws_barrier_odd_reg(<2 x i32> %arg) { + %vgpr.0 = extractelement <2 x i32> %arg, i32 0 + %vgpr.1 = extractelement <2 x i32> %arg, i32 1 + call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.barrier(i32 %vgpr.1, i32 0) + ret void +} + +; GCN-LABEL: {{^}}gws_init_odd_agpr: +; GFX908-COUNT-2: ds_gws_init v{{[0-9]+}} gds +; GFX90A-COUNT-2: ds_gws_init {{[va][0-9]?[02468]}} gds +define amdgpu_ps void @gws_init_odd_agpr(<4 x i32> %arg) { +bb: + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0) + %agpr.0 = extractelement <4 x i32> %mai, i32 0 + %agpr.1 = extractelement <4 x i32> %mai, i32 1 + call void @llvm.amdgcn.ds.gws.init(i32 %agpr.0, i32 0) + call void @llvm.amdgcn.ds.gws.init(i32 %agpr.1, i32 0) + ret void +} + +declare void @llvm.amdgcn.ds.gws.init(i32, i32) +declare void @llvm.amdgcn.ds.gws.sema.br(i32, i32) +declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) Index: llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/verify-ds-gws-align.mir @@ -0,0 +1,37 @@ +# RUN: not --crash llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX90A-ERR %s + +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT killed %0.sub1:areg_128_align2, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT killed %0.sub3:areg_128_align2, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_SEMA_BR killed %1.sub1:vreg_64_align2, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_BARRIER killed %2.sub0:vreg_64, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT killed %3:vgpr_32, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT $vgpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +# GFX90A-ERR: *** Bad machine code: Subtarget requires even aligned vector registers for DS_GWS instructions *** +# GFX90A-ERR: DS_GWS_INIT $agpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") +--- +name: gws_odd_vgpr +body: | + bb.0: + %0:areg_128_align2 = IMPLICIT_DEF + DS_GWS_INIT killed %0.sub1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %0:areg_128_align2 = IMPLICIT_DEF + DS_GWS_INIT killed %0.sub3, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %1:vreg_64_align2 = IMPLICIT_DEF + DS_GWS_SEMA_BR killed %1.sub1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %2:vreg_64 = IMPLICIT_DEF + DS_GWS_BARRIER killed %2.sub0, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + %3:vgpr_32 = IMPLICIT_DEF + DS_GWS_INIT killed %3, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + $vgpr1 = IMPLICIT_DEF + DS_GWS_INIT $vgpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + $agpr1 = IMPLICIT_DEF + DS_GWS_INIT $agpr1, 0, implicit $m0, implicit $exec :: (store 4 into custom "GWSResource") + S_ENDPGM 0 + +...