Index: llvm/lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPU.td
+++ llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -177,6 +177,12 @@
   "VI SGPR initialization bug requiring a fixed SGPR allocation size"
 >;
 
+def FeatureUserSGPRInit16Bug : SubtargetFeature<"user-sgpr-init16-bug",
+  "UserSGPRInit16Bug",
+  "true",
+  "Bug requiring at least 16 user+system SGPRs to be enabled"
+>;
+
 def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
   "LDSMisalignedBug",
   "true",
@@ -1262,11 +1268,11 @@
 // Features for GFX 11.0.0 and 11.0.1
 def FeatureISAVersion11_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
-    [])>;
+    [FeatureUserSGPRInit16Bug])>;
 
 def FeatureISAVersion11_0_2 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
-    [])>;
+    [FeatureUserSGPRInit16Bug])>;
 
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -900,8 +900,14 @@
   else if (MFI->hasWorkItemIDY())
     TIDIGCompCnt = 1;
 
+  // The private segment wave byte offset is the last of the system SGPRs. We
+  // initially assumed it was allocated, and may have used it. It shouldn't
+  // harm anything to disable it if we know the stack isn't used here. We may
+  // still have emitted code reading it to initialize scratch, but if that's
+  // unused, reading garbage should be OK.
+  const bool EnablePrivateSegmentWaveOffset = ProgInfo.ScratchBlocks > 0;
   ProgInfo.ComputePGMRSrc2 =
-      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+      S_00B84C_SCRATCH_EN(EnablePrivateSegmentWaveOffset) |
       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
       // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
       S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -107,6 +107,7 @@
   bool GFX10_3Insts = false;
   bool GFX7GFX8GFX9Insts = false;
   bool SGPRInitBug = false;
+  bool UserSGPRInit16Bug = false;
   bool NegativeScratchOffsetBug = false;
   bool NegativeUnalignedScratchOffsetBug = false;
   bool HasSMemRealTime = false;
@@ -928,6 +929,10 @@
     return SGPRInitBug;
   }
 
+  bool hasUserSGPRInit16Bug() const {
+    return UserSGPRInit16Bug;
+  }
+
   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
 
   bool hasNegativeUnalignedScratchOffsetBug() const {
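For reference on the AMDGPUAsmPrinter.cpp hunk above: SCRATCH_EN and the user SGPR count are adjacent fields at the bottom of COMPUTE_PGM_RSRC2, which is why the workaround only needs to flip one bit and write one 5-bit count. Below is a minimal sketch of that packing, assuming the AMDHSA kernel descriptor layout (ENABLE_PRIVATE_SEGMENT in bit 0, USER_SGPR_COUNT in bits 1-5); the helper name is made up for illustration, and the patch itself uses the existing S_00B84C_* macros rather than anything like this.

  #include <cstdint>

  // Illustrative sketch, not part of the patch: pack the two
  // COMPUTE_PGM_RSRC2 fields the hunk above touches. The 5-bit
  // USER_SGPR_COUNT field is what carries the padded user SGPR count.
  static uint32_t packRsrc2ScratchAndUserSGPRs(bool ScratchEn,
                                               unsigned NumUserSGPRs) {
    return (ScratchEn ? 1u : 0u)             // bit 0: ENABLE_PRIVATE_SEGMENT
           | ((NumUserSGPRs & 0x1fu) << 1);  // bits 1-5: USER_SGPR_COUNT
  }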
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2099,6 +2099,24 @@
                                            SIMachineFunctionInfo &Info,
                                            CallingConv::ID CallConv,
                                            bool IsShader) const {
+  if (Subtarget->hasUserSGPRInit16Bug()) {
+    // Pad up the used user SGPRs with dead inputs.
+    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
+
+    // Note we do not count the PrivateSegmentWaveByteOffset. We do not want
+    // to rely on it to reach 16 since if we end up having no stack usage, it
+    // will not really be added.
+    unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
+                                      Info.hasWorkGroupIDY() +
+                                      Info.hasWorkGroupIDZ() +
+                                      Info.hasWorkGroupInfo();
+    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
+      Register Reg = Info.addReservedUserSGPR();
+      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+      CCInfo.AllocateReg(Reg);
+    }
+  }
+
   if (Info.hasWorkGroupIDX()) {
     Register Reg = Info.addWorkGroupIDX();
     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
@@ -2143,6 +2161,8 @@
     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
   }
+
+  assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16);
 }
 
 static void reservePrivateMemoryRegs(const TargetMachine &TM,
Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -609,6 +609,13 @@
   Register addFlatScratchInit(const SIRegisterInfo &TRI);
   Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
 
+  /// Increment user SGPRs used for padding the argument list only.
+  Register addReservedUserSGPR() {
+    Register Next = getNextUserSGPR();
+    ++NumUserSGPRs;
+    return Next;
+  }
+
   // Add system SGPRs.
   Register addWorkGroupIDX() {
     ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
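The SIISelLowering.cpp change above reduces to one piece of arithmetic: count the user SGPRs already in use plus the system SGPRs that are guaranteed to be enabled, then reserve dead user SGPRs for the difference up to 16. A standalone sketch of that computation (the function name is hypothetical; the real code reserves the registers one at a time in the loop so each is also marked live-in and allocated):

  // Illustrative sketch, not part of the patch: how many dead user SGPRs
  // must be reserved so user + guaranteed system SGPRs reach 16 on the
  // affected gfx11 parts. PrivateSegmentWaveByteOffset is deliberately
  // left out of the system count since it is dropped again when the
  // kernel ends up using no stack.
  static unsigned userSGPRPaddingFor16(unsigned NumUserSGPRs,
                                       unsigned NumRequiredSystemSGPRs) {
    unsigned Preloaded = NumUserSGPRs + NumRequiredSystemSGPRs;
    return Preloaded < 16 ? 16 - Preloaded : 0;
  }

Plugging in the kernels from the test below: 0 user + 1 system SGPR gives 15 padding registers, 2 + 1 gives 13, and 8 + 3 gives 5, matching the .amdhsa_user_sgpr_count values of 15, 15, and 13 checked there.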
Index: llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -0,0 +1,162 @@
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s
+
+; There aren't any stack objects, so the private segment stays disabled;
+; instead we pad the user SGPR count to reach 16 initialized input SGPRs,
+; and the workgroup ID ends up in s15.
+
+; 15 padding user SGPRs + workgroup_id_x = 16
+
+; GCN-LABEL: {{^}}minimal_kernel_inputs:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
+; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
+
+; GCN: .amdhsa_kernel minimal_kernel_inputs
+; GCN: .amdhsa_user_sgpr_count 15
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
+define amdgpu_kernel void @minimal_kernel_inputs() {
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}minimal_kernel_inputs_with_stack:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
+
+; GCN: .amdhsa_kernel minimal_kernel_inputs_with_stack
+; GCN: .amdhsa_user_sgpr_count 15
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
+define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
+  %alloca = alloca i32, addrspace(5)
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id, i32 addrspace(1)* undef
+  store volatile i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
+; GCN-LABEL: {{^}}queue_ptr:
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
+
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
+; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
+
+; GCN: .amdhsa_kernel queue_ptr
+; GCN: .amdhsa_user_sgpr_count 15
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
+define amdgpu_kernel void @queue_ptr() {
+  %queue.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %load = load volatile i8, i8 addrspace(4)* %queue.ptr
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}all_inputs:
+; GCN: v_mov_b32_e32 [[V_X:v[0-9]+]], s13
+; GCN: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14
+; GCN: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15
+
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[2:3]
+; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
+
+; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6
+; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7
+
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_X]], off
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Y]], off
+; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Z]], off
+; GCN: global_store_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[DISPATCH_LO]]:[[DISPATCH_HI]]{{\]}}, off
+
+; GCN: .amdhsa_kernel all_inputs
+; GCN: .amdhsa_user_sgpr_count 13
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; GCN-NEXT: .amdhsa_wavefront_size32 1
+; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13
+define amdgpu_kernel void @all_inputs() {
+  %alloca = alloca i32, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %alloca
+
+  %dispatch.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %load.dispatch = load volatile i8, i8 addrspace(4)* %dispatch.ptr
+
+  %queue.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
+  %load.queue = load volatile i8, i8 addrspace(4)* %queue.ptr
+
+  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %load.implicitarg = load volatile i8, i8 addrspace(4)* %implicitarg.ptr
+
+  %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %id.x, i32 addrspace(1)* undef
+
+  %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %id.y, i32 addrspace(1)* undef
+
+  %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %id.z, i32 addrspace(1)* undef
+
+  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %dispatch.id, i64 addrspace(1)* undef
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i64 @llvm.amdgcn.dispatch.id() #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
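Reading the checks together, the per-kernel accounting under the workaround works out as follows (user SGPRs are real inputs plus padding; the system SGPRs start immediately after them):

  minimal_kernel_inputs:            0 inputs + 15 padding = 15 user SGPRs; workgroup_id_x in s15; 16 total
  minimal_kernel_inputs_with_stack: 0 inputs + 15 padding = 15 user SGPRs; workgroup_id_x in s15, plus the wave byte offset since the stack is used
  queue_ptr:                        2 (queue_ptr) + 13 padding = 15 user SGPRs; workgroup_id_x in s15; 16 total
  all_inputs:                       8 (dispatch_ptr, queue_ptr, kernarg_segment_ptr, dispatch_id) + 5 padding = 13 user SGPRs; workgroup_id x/y/z in s13-s15; 16 total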