Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2876,6 +2876,8 @@ Register AndMaskSrc = LiveIn; + // TODO: Avoid clearing the high bits if we know workitem id y/z are always + // 0. if (Shift != 0) { auto ShiftAmt = B.buildConstant(S32, Shift); AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); @@ -4954,6 +4956,12 @@ return true; } +static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { + B.buildConstant(MI.getOperand(0).getReg(), C); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; @@ -5057,12 +5065,20 @@ case Intrinsic::amdgcn_implicitarg_ptr: return legalizeImplicitArgPtr(MI, MRI, B); case Intrinsic::amdgcn_workitem_id_x: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0) + return replaceWithConstant(B, MI, 0); return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_X); case Intrinsic::amdgcn_workitem_id_y: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0) + return replaceWithConstant(B, MI, 0); + return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_Y); case Intrinsic::amdgcn_workitem_id_z: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0) + return replaceWithConstant(B, MI, 0); + return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_Z); case Intrinsic::amdgcn_workgroup_id_x: Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6760,14 +6760,23 @@ return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); case Intrinsic::amdgcn_workitem_id_x: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -1,9 +1,9 @@ -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s ; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -125,5 +125,75 @@ ret void } +; FIXME: Should be able to avoid enabling in kernel inputs +; FIXME: Packed tid should avoid the and +; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only: +; CO-V2: enable_vgpr_workitem_id = 2 + +; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 + +; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0 +; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] + +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.x, i32* %out + store volatile i32 %id.y, i32* %out + store volatile i32 %id.z, i32* %out + ret void +} + +; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only: +; CO-V2: enable_vgpr_workitem_id = 2 + +; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] + +; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 + +; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10 +; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] + +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.x, i32* %out + store volatile i32 %id.y, i32* %out + store volatile i32 %id.z, i32* %out + ret void +} + +; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only: +; CO-V2: enable_vgpr_workitem_id = 2 + +; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] + +; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2 + +; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20 +; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.x, i32* %out + store volatile i32 %id.y, i32* %out + store volatile i32 %id.z, i32* %out + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } + +!0 = !{i32 64, i32 1, i32 1} +!1 = !{i32 1, i32 64, i32 1} +!2 = !{i32 1, i32 1, i32 64} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -63,5 +63,75 @@ ret void } +; FIXME: Should be able to avoid enabling in kernel inputs +; FIXME: Packed tid should avoid the and +; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only: +; CO-V2: enable_vgpr_workitem_id = 2 + +; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 + +; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0 +; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] + +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.x, i32* %out + store volatile i32 %id.y, i32* %out + store volatile i32 %id.z, i32* %out + ret void +} + +; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only: +; CO-V2: enable_vgpr_workitem_id = 2 + +; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] + +; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 + +; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10 +; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] + +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.x, i32* %out + store volatile i32 %id.y, i32* %out + store volatile i32 %id.z, i32* %out + ret void +} + +; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only: +; CO-V2: enable_vgpr_workitem_id = 2 + +; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] +; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] + +; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2 + +; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20 +; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %id.x, i32* %out + store volatile i32 %id.y, i32* %out + store volatile i32 %id.z, i32* %out + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } + +!0 = !{i32 64, i32 1, i32 1} +!1 = !{i32 1, i32 64, i32 1} +!2 = !{i32 1, i32 1, i32 64}