Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -73,6 +73,9 @@
 defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
 defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                <"__builtin_amdgcn_workgroup_id">;
 
+defm int_amdgcn_workgroup_count : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
+                                  <"__builtin_amdgcn_workgroup_count">;
+
 def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrConvergent]>;
Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -178,6 +178,10 @@
     { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
     { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
 
+    { "llvm.amdgcn.workgroup.count.x", "amdgpu-work-group-count-x" },
+    { "llvm.amdgcn.workgroup.count.y", "amdgpu-work-group-count-y" },
+    { "llvm.amdgcn.workgroup.count.z", "amdgpu-work-group-count-z" },
+
     { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
     { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -726,6 +726,28 @@
     CCInfo.AllocateReg(FlatScratchInitReg);
   }
 
+  // Each requested grid workgroup count gets its own 32-bit user SGPR.
+  if (Info->hasGridWorkgroupCountX()) {
+    assert(Info->getNumUserSGPRs() < 16);
+    unsigned WorkGroupCountXReg = Info->addWorkGroupCountX(*TRI);
+    MF.addLiveIn(WorkGroupCountXReg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(WorkGroupCountXReg);
+  }
+
+  if (Info->hasGridWorkgroupCountY()) {
+    assert(Info->getNumUserSGPRs() < 16);
+    unsigned WorkGroupCountYReg = Info->addWorkGroupCountY(*TRI);
+    MF.addLiveIn(WorkGroupCountYReg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(WorkGroupCountYReg);
+  }
+
+  if (Info->hasGridWorkgroupCountZ()) {
+    assert(Info->getNumUserSGPRs() < 16);
+    unsigned WorkGroupCountZReg = Info->addWorkGroupCountZ(*TRI);
+    MF.addLiveIn(WorkGroupCountZReg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(WorkGroupCountZReg);
+  }
+
   AnalyzeFormalArguments(CCInfo, Splits);
 
   SmallVector Chains;
@@ -2076,6 +2098,15 @@
     // Really only 2 bits.
     return lowerImplicitZextParam(DAG, Op, MVT::i8,
                                   getImplicitParameterOffset(MFI, GRID_DIM));
+  case Intrinsic::amdgcn_workgroup_count_x:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_COUNT_X), VT);
+  case Intrinsic::amdgcn_workgroup_count_y:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_COUNT_Y), VT);
+  case Intrinsic::amdgcn_workgroup_count_z:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_COUNT_Z), VT);
   case Intrinsic::amdgcn_workgroup_id_x:
   case Intrinsic::r600_read_tgid_x:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -144,6 +144,9 @@
   unsigned addQueuePtr(const SIRegisterInfo &TRI);
   unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
   unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
+  unsigned addWorkGroupCountX(const SIRegisterInfo &TRI);
+  unsigned addWorkGroupCountY(const SIRegisterInfo &TRI);
+  unsigned addWorkGroupCountZ(const SIRegisterInfo &TRI);
 
   // Add system SGPRs.
   unsigned addWorkGroupIDX() {
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -107,6 +107,15 @@
   if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
     WorkItemIDZ = true;
 
+  if (F->hasFnAttribute("amdgpu-work-group-count-x"))
+    GridWorkgroupCountX = true;
+
+  if (F->hasFnAttribute("amdgpu-work-group-count-y"))
+    GridWorkgroupCountY = true;
+
+  if (F->hasFnAttribute("amdgpu-work-group-count-z"))
+    GridWorkgroupCountZ = true;
+
   // X, XY, and XYZ are the only supported combinations, so make sure Y is
   // enabled if Z is.
   if (WorkItemIDZ)
@@ -181,6 +190,26 @@
   return FlatScratchInitUserSGPR;
 }
 
+// The grid workgroup counts each take one user SGPR. \p TRI is unused; it is
+// kept so the signature matches the neighbouring add*() helpers in the header.
+unsigned SIMachineFunctionInfo::addWorkGroupCountX(const SIRegisterInfo &TRI) {
+  GridWorkGroupCountXUserSGPR = getNextUserSGPR();
+  NumUserSGPRs += 1;
+  return GridWorkGroupCountXUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addWorkGroupCountY(const SIRegisterInfo &TRI) {
+  GridWorkGroupCountYUserSGPR = getNextUserSGPR();
+  NumUserSGPRs += 1;
+  return GridWorkGroupCountYUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addWorkGroupCountZ(const SIRegisterInfo &TRI) {
+  GridWorkGroupCountZUserSGPR = getNextUserSGPR();
+  NumUserSGPRs += 1;
+  return GridWorkGroupCountZUserSGPR;
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
                                                        MachineFunction *MF,
                                                        unsigned FrameIndex,
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -164,12 +164,15 @@
     WORKGROUP_ID_Y = 11,
     WORKGROUP_ID_Z = 12,
     PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+    WORKGROUP_COUNT_X = 15,
+    WORKGROUP_COUNT_Y = 16,
+    WORKGROUP_COUNT_Z = 17,
 
     // VGPRS:
-    FIRST_VGPR_VALUE = 15,
+    FIRST_VGPR_VALUE = 32,
     WORKITEM_ID_X = FIRST_VGPR_VALUE,
-    WORKITEM_ID_Y = 16,
-    WORKITEM_ID_Z = 17
+    WORKITEM_ID_Y = 33,
+    WORKITEM_ID_Z = 34
   };
 
   /// \brief Returns the physical register that \p Value is stored in.
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -912,6 +912,15 @@
   const SISubtarget &ST = MF.getSubtarget();
   (void)ST;
   switch (Value) {
+  case SIRegisterInfo::WORKGROUP_COUNT_X:
+    assert(MFI->hasGridWorkgroupCountX());
+    return MFI->GridWorkGroupCountXUserSGPR;
+  case SIRegisterInfo::WORKGROUP_COUNT_Y:
+    assert(MFI->hasGridWorkgroupCountY());
+    return MFI->GridWorkGroupCountYUserSGPR;
+  case SIRegisterInfo::WORKGROUP_COUNT_Z:
+    assert(MFI->hasGridWorkgroupCountZ());
+    return MFI->GridWorkGroupCountZUserSGPR;
   case SIRegisterInfo::WORKGROUP_ID_X:
     assert(MFI->hasWorkGroupIDX());
     return MFI->WorkGroupIDXSystemSGPR;
Index: test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.count.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.count.ll
@@ -0,0 +1,145 @@
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
+
+; Each requested grid workgroup count is expected in its own user SGPR,
+; assigned in x, y, z order starting at s6.
+
+declare i32 @llvm.amdgcn.workgroup.count.x() #0
+declare i32 @llvm.amdgcn.workgroup.count.y() #0
+declare i32 @llvm.amdgcn.workgroup.count.z() #0
+
+; ALL-LABEL: {{^}}test_workgroup_count_x:
+; MESA: .amd_kernel_code_t
+; MESA: compute_pgm_rsrc2_user_sgpr = 7
+; MESA: enable_sgpr_grid_workgroup_count_x = 1
+; MESA: enable_sgpr_grid_workgroup_count_y = 0
+; MESA: enable_sgpr_grid_workgroup_count_z = 0
+; MESA: .end_amd_kernel_code_t
+
+; MESA: v_mov_b32_e32 [[VALUE:v[0-9]+]], s6
+; MESA: buffer_store_dword [[VALUE]]
+
+define void @test_workgroup_count_x(i32 addrspace(1)* %out) #1 {
+  %id = call i32 @llvm.amdgcn.workgroup.count.x()
+  store i32 %id, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_count_y:
+; MESA: .amd_kernel_code_t
+; MESA: compute_pgm_rsrc2_user_sgpr = 7
+; MESA: enable_sgpr_grid_workgroup_count_x = 0
+; MESA: enable_sgpr_grid_workgroup_count_y = 1
+; MESA: enable_sgpr_grid_workgroup_count_z = 0
+; MESA: .end_amd_kernel_code_t
+
+; MESA: v_mov_b32_e32 [[VALUE:v[0-9]+]], s6
+; MESA: buffer_store_dword [[VALUE]]
+
+define void @test_workgroup_count_y(i32 addrspace(1)* %out) #1 {
+  %id = call i32 @llvm.amdgcn.workgroup.count.y()
+  store i32 %id, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_count_z:
+; MESA: .amd_kernel_code_t
+; MESA: compute_pgm_rsrc2_user_sgpr = 7
+; MESA: enable_sgpr_grid_workgroup_count_x = 0
+; MESA: enable_sgpr_grid_workgroup_count_y = 0
+; MESA: enable_sgpr_grid_workgroup_count_z = 1
+; MESA: .end_amd_kernel_code_t
+
+; MESA: v_mov_b32_e32 [[VALUE:v[0-9]+]], s6
+; MESA: buffer_store_dword [[VALUE]]
+
+define void @test_workgroup_count_z(i32 addrspace(1)* %out) #1 {
+  %id = call i32 @llvm.amdgcn.workgroup.count.z()
+  store i32 %id, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_count_xy:
+; MESA: .amd_kernel_code_t
+; MESA: compute_pgm_rsrc2_user_sgpr = 8
+; MESA: enable_sgpr_grid_workgroup_count_x = 1
+; MESA: enable_sgpr_grid_workgroup_count_y = 1
+; MESA: enable_sgpr_grid_workgroup_count_z = 0
+; MESA: .end_amd_kernel_code_t
+
+; MESA: s_add_i32 [[S_VAL:s[0-9]+]], s6, s7
+; MESA: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; MESA: buffer_store_dword [[V_VAL]]
+
+define void @test_workgroup_count_xy(i32 addrspace(1)* %out) #1 {
+  %x = call i32 @llvm.amdgcn.workgroup.count.x()
+  %y = call i32 @llvm.amdgcn.workgroup.count.y()
+  %val = add i32 %x, %y
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_count_xz:
+; MESA: .amd_kernel_code_t
+; MESA: compute_pgm_rsrc2_user_sgpr = 8
+; MESA: enable_sgpr_grid_workgroup_count_x = 1
+; MESA: enable_sgpr_grid_workgroup_count_y = 0
+; MESA: enable_sgpr_grid_workgroup_count_z = 1
+; MESA: .end_amd_kernel_code_t
+
+; MESA: s_add_i32 [[S_VAL:s[0-9]+]], s6, s7
+; MESA: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; MESA: buffer_store_dword [[V_VAL]]
+
+define void @test_workgroup_count_xz(i32 addrspace(1)* %out) #1 {
+  %x = call i32 @llvm.amdgcn.workgroup.count.x()
+  %z = call i32 @llvm.amdgcn.workgroup.count.z()
+  %val = add i32 %x, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_count_yz:
+; MESA: .amd_kernel_code_t
+; MESA: compute_pgm_rsrc2_user_sgpr = 8
+; MESA: enable_sgpr_grid_workgroup_count_x = 0
+; MESA: enable_sgpr_grid_workgroup_count_y = 1
+; MESA: enable_sgpr_grid_workgroup_count_z = 1
+; MESA: .end_amd_kernel_code_t
+
+; MESA: s_add_i32 [[S_VAL:s[0-9]+]], s6, s7
+; MESA: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; MESA: buffer_store_dword [[V_VAL]]
+
+define void @test_workgroup_count_yz(i32 addrspace(1)* %out) #1 {
+  %y = call i32 @llvm.amdgcn.workgroup.count.y()
+  %z = call i32 @llvm.amdgcn.workgroup.count.z()
+  %val = add i32 %y, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}test_workgroup_count_xyz:
+; MESA: .amd_kernel_code_t
+; MESA: compute_pgm_rsrc2_user_sgpr = 9
+; MESA: enable_sgpr_grid_workgroup_count_x = 1
+; MESA: enable_sgpr_grid_workgroup_count_y = 1
+; MESA: enable_sgpr_grid_workgroup_count_z = 1
+; MESA: .end_amd_kernel_code_t
+
+; MESA: s_add_i32 [[XY:s[0-9]+]], s6, s7
+; MESA: s_add_i32 [[S_VAL:s[0-9]+]], [[XY]], s8
+; MESA: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; MESA: buffer_store_dword [[V_VAL]]
+
+define void @test_workgroup_count_xyz(i32 addrspace(1)* %out) #1 {
+  %x = call i32 @llvm.amdgcn.workgroup.count.x()
+  %y = call i32 @llvm.amdgcn.workgroup.count.y()
+  %z = call i32 @llvm.amdgcn.workgroup.count.z()
+  %xy = add i32 %x, %y
+  %xyz = add i32 %xy, %z
+  store i32 %xyz, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }