diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1866,7 +1866,7 @@
     return DAG.getUNDEF(VT);
   }
 
-  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
+  return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
 }
 
 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
@@ -2181,11 +2181,16 @@
                                            SIMachineFunctionInfo &Info,
                                            CallingConv::ID CallConv,
                                            bool IsShader) const {
+  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
   if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
     // Note: user SGPRs are handled by the front-end for graphics shaders
     // Pad up the used user SGPRs with dead inputs.
-    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
+    // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
+    // before enabling architected SGPRs for workgroup IDs.
+    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
+
+    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
     // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
     // rely on it to reach 16 since if we end up having no stack usage, it will
     // not really be added.
@@ -2201,20 +2206,26 @@
   }
 
   if (Info.hasWorkGroupIDX()) {
-    Register Reg = Info.addWorkGroupIDX();
-    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+    Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
+    if (!HasArchitectedSGPRs)
+      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+
     CCInfo.AllocateReg(Reg);
   }
 
   if (Info.hasWorkGroupIDY()) {
-    Register Reg = Info.addWorkGroupIDY();
-    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+    Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
+    if (!HasArchitectedSGPRs)
+      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+
     CCInfo.AllocateReg(Reg);
   }
 
   if (Info.hasWorkGroupIDZ()) {
-    Register Reg = Info.addWorkGroupIDZ();
-    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+    Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
+    if (!HasArchitectedSGPRs)
+      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+
     CCInfo.AllocateReg(Reg);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -693,21 +693,32 @@
   }
 
   // Add system SGPRs.
-  Register addWorkGroupIDX() {
-    ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
-    NumSystemSGPRs += 1;
+  Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
+    Register Reg = HasArchitectedSGPRs ? AMDGPU::TTMP9 : getNextSystemSGPR();
+    ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
+    if (!HasArchitectedSGPRs)
+      NumSystemSGPRs += 1;
+
     return ArgInfo.WorkGroupIDX.getRegister();
   }
 
-  Register addWorkGroupIDY() {
-    ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
-    NumSystemSGPRs += 1;
+  Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
+    Register Reg = HasArchitectedSGPRs ? AMDGPU::TTMP7 : getNextSystemSGPR();
+    unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
+    ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
+    if (!HasArchitectedSGPRs)
+      NumSystemSGPRs += 1;
+
     return ArgInfo.WorkGroupIDY.getRegister();
   }
 
-  Register addWorkGroupIDZ() {
-    ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
-    NumSystemSGPRs += 1;
+  Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
+    Register Reg = HasArchitectedSGPRs ? AMDGPU::TTMP7 : getNextSystemSGPR();
+    unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
+    ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
+    if (!HasArchitectedSGPRs)
+      NumSystemSGPRs += 1;
+
     return ArgInfo.WorkGroupIDZ.getRegister();
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GCN-GISEL %s
+
+define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
+; GCN-SDAG-LABEL: workgroup_id_x:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-SDAG-NEXT:    s_endpgm
+;
+; GCN-GISEL-LABEL: workgroup_id_x:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-GISEL-NEXT:    s_endpgm
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  store i32 %idx, ptr addrspace(1) %ptrx
+
+  ret void
+}
+
+define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry) {
+; GCN-SDAG-LABEL: workgroup_id_xy:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp7
+; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GCN-SDAG-NEXT:    s_endpgm
+;
+; GCN-GISEL-LABEL: workgroup_id_xy:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, ttmp7
+; GCN-GISEL-NEXT:    global_store_dword v0, v1, s[2:3]
+; GCN-GISEL-NEXT:    s_endpgm
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  store i32 %idx, ptr addrspace(1) %ptrx
+  %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+  store i32 %idy, ptr addrspace(1) %ptry
+
+  ret void
+}
+
+define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) {
+; GCN-SDAG-LABEL: workgroup_id_xyz:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, ttmp9
+; GCN-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[2:3]
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-SDAG-NEXT:    global_store_dword v0, v1, s[6:7]
+; GCN-SDAG-NEXT:    s_endpgm
+;
+; GCN-GISEL-LABEL: workgroup_id_xyz:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GCN-GISEL-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-GISEL-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-GISEL-NEXT:    global_store_dword v1, v0, s[6:7]
+; GCN-GISEL-NEXT:    s_endpgm
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  store i32 %idx, ptr addrspace(1) %ptrx
+  %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+  store i32 %idy, ptr addrspace(1) %ptry
+  %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+  store i32 %idz, ptr addrspace(1) %ptrz
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
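
Note on the register layout the masks encode (not part of the patch): with +architected-sgprs, the workgroup ID X occupies all of TTMP9, while Y and Z share TTMP7, Y in bits [15:0] and Z in bits [31:16]. That is what the `0xffff` and `0xffff << 16` masks passed to `ArgDescriptor::createRegister` express, and it is why the `workgroup_id_xyz` checks contain an `s_and_b32`/`s_lshr_b32` pair, while `workgroup_id_xy` reads ttmp7 unmasked (Y keeps the full-register mask `~0u` whenever the Z ID is unused). Below is a minimal standalone sketch of that decoding, assuming the bit layout above; the helper names are hypothetical and exist only to illustrate the masks.

```cpp
#include <cassert>
#include <cstdint>

// Assumed packing on architected-SGPR targets: Y occupies the low half of
// TTMP7, Z the high half. X is a full 32-bit value in TTMP9 (no mask needed).
static uint32_t decodeWorkGroupIDY(uint32_t Ttmp7) {
  return Ttmp7 & 0xffffu; // mask 0xffff -> lowered to s_and_b32
}

static uint32_t decodeWorkGroupIDZ(uint32_t Ttmp7) {
  return Ttmp7 >> 16;     // mask 0xffff << 16 -> lowered to s_lshr_b32
}

int main() {
  uint32_t Ttmp7 = (7u << 16) | 5u; // pretend Z == 7, Y == 5
  assert(decodeWorkGroupIDY(Ttmp7) == 5u);
  assert(decodeWorkGroupIDZ(Ttmp7) == 7u);
  return 0;
}
```

In the patch itself no explicit decode helper is needed: the mask is carried in the ArgDescriptor, and both selectors emit the extraction when materializing the argument.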