diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -32,18 +32,27 @@ unsigned StackOffset; }; + // Bitmask to locate argument within the register. + unsigned Mask; + bool IsStack : 1; bool IsSet : 1; - ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false) - : Register(Val), IsStack(IsStack), IsSet(IsSet) {} public: - static ArgDescriptor createRegister(unsigned Reg) { - return ArgDescriptor(Reg, false, true); + ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, + bool IsStack = false, bool IsSet = false) + : Register(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + + static ArgDescriptor createRegister(unsigned Reg, unsigned Mask = ~0u) { + return ArgDescriptor(Reg, Mask, false, true); + } + + static ArgDescriptor createStack(unsigned Reg, unsigned Mask = ~0u) { + return ArgDescriptor(Reg, Mask, true, true); } - static ArgDescriptor createStack(unsigned Reg) { - return ArgDescriptor(Reg, true, true); + static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { + return ArgDescriptor(Arg.Register, Mask, Arg.IsStack, Arg.IsSet); } bool isSet() const { @@ -68,6 +77,14 @@ return StackOffset; } + unsigned getMask() const { + return Mask; + } + + bool isMasked() const { + return Mask != ~0u; + } + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -9,6 +9,7 @@ #include "AMDGPU.h" #include "AMDGPUArgumentUsageInfo.h" #include "SIRegisterInfo.h" +#include "llvm/Support/NativeFormatting.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -26,9 +27,16 @@ } if (isRegister()) - OS << "Reg " << 
printReg(getRegister(), TRI) << '\n'; +    OS << "Reg " << printReg(getRegister(), TRI); else -    OS << "Stack offset " << getStackOffset() << '\n'; +    OS << "Stack offset " << getStackOffset(); + +  if (isMasked()) { +    OS << " & "; +    llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower); +  } + +  OS << '\n'; } char AMDGPUArgumentUsageInfo::ID = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4233,9 +4233,19 @@ const ArgDescriptor &Arg) const { assert(Arg && "Attempting to load missing argument"); - if (Arg.isRegister()) - return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); - return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); + SDValue V = Arg.isRegister() ? + CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : + loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); + + if (!Arg.isMasked()) + return V; + + unsigned Mask = Arg.getMask(); + unsigned Shift = countTrailingZeros(Mask); + V = DAG.getNode(ISD::SRL, SL, VT, V, + DAG.getShiftAmountConstant(Shift, VT, SL)); + return DAG.getNode(ISD::AND, SL, VT, V, + DAG.getConstant(Mask >> Shift, SL, VT)); } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1585,7 +1585,13 @@ // Try to allocate a VGPR at the end of the argument list, or if no argument // VGPRs are left allocating a stack slot. -static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) { +// If \p Mask is given it indicates bitfield position in the register. +// If \p Arg is given use it with new \p Mask instead of allocating a new one. 
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, + ArgDescriptor Arg = ArgDescriptor()) { + if (Arg.isSet()) + return ArgDescriptor::createArg(Arg, Mask); + ArrayRef ArgVGPRs = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); @@ -1593,7 +1599,7 @@ // Spill to stack required. int64_t Offset = CCInfo.AllocateStack(4, 4); - return ArgDescriptor::createStack(Offset); + return ArgDescriptor::createStack(Offset, Mask); } unsigned Reg = ArgVGPRs[RegIdx]; @@ -1602,7 +1608,7 @@ MachineFunction &MF = CCInfo.getMachineFunction(); MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - return ArgDescriptor::createRegister(Reg); + return ArgDescriptor::createRegister(Reg, Mask); } static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, @@ -1634,14 +1640,21 @@ MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { - if (Info.hasWorkItemIDX()) - Info.setWorkItemIDX(allocateVGPR32Input(CCInfo)); + const unsigned Mask = 0x3ff; + ArgDescriptor Arg; - if (Info.hasWorkItemIDY()) - Info.setWorkItemIDY(allocateVGPR32Input(CCInfo)); + if (Info.hasWorkItemIDX()) { + Arg = allocateVGPR32Input(CCInfo, Mask); + Info.setWorkItemIDX(Arg); + } + + if (Info.hasWorkItemIDY()) { + Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg); + Info.setWorkItemIDY(Arg); + } if (Info.hasWorkItemIDZ()) - Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo)); + Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } static void allocateSpecialInputSGPRs(CCState &CCInfo, @@ -2387,9 +2400,6 @@ AMDGPUFunctionArgInfo::WORKGROUP_ID_X, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, - AMDGPUFunctionArgInfo::WORKITEM_ID_X, - AMDGPUFunctionArgInfo::WORKITEM_ID_Y, - AMDGPUFunctionArgInfo::WORKITEM_ID_Z, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR }; @@ -2429,6 +2439,71 @@ MemOpChains.push_back(ArgStore); } } + + // Pack workitem IDs into a single register or pass it as is if 
already + // packed. + const ArgDescriptor *OutgoingArg; + const TargetRegisterClass *ArgRC; + + std::tie(OutgoingArg, ArgRC) = + CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); + if (!OutgoingArg) + std::tie(OutgoingArg, ArgRC) = + CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + if (!OutgoingArg) + std::tie(OutgoingArg, ArgRC) = + CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + if (!OutgoingArg) + return; + + const ArgDescriptor *IncomingArgX + = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first; + const ArgDescriptor *IncomingArgY + = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first; + const ArgDescriptor *IncomingArgZ + = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first; + + SDValue InputReg; + SDLoc SL; + + // If incoming ids are not packed we need to pack them. + if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX) + InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); + + if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) { + SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); + Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, + DAG.getShiftAmountConstant(10, MVT::i32, SL)); + InputReg = InputReg.getNode() ? + DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y; + } + + if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) { + SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); + Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, + DAG.getShiftAmountConstant(20, MVT::i32, SL)); + InputReg = InputReg.getNode() ? + DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z; + } + + if (!InputReg.getNode()) { + // Workitem ids are already packed, any of present incoming arguments + // will carry all required fields. + ArgDescriptor IncomingArg = ArgDescriptor::createArg( + IncomingArgX ? 
*IncomingArgX : + IncomingArgY ? *IncomingArgY : + *IncomingArgZ, ~0u); + InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); + } + + if (OutgoingArg->isRegister()) { + RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + } else { + unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4); + SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, + SpecialArgOffset); + MemOpChains.push_back(ArgStore); + } } static bool canGuaranteeTCO(CallingConv::ID CC) { diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -65,6 +65,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt +; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: s_setpc_b64 define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,8 +1,9 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_x() #1 { @@ -13,7 +14,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_y: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 
10 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_y() #1 { @@ -24,7 +26,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_z: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_z() #1 { @@ -35,8 +38,10 @@ ; GCN-LABEL: {{^}}use_workitem_id_xy: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xy() #1 { @@ -49,9 +54,12 @@ ; GCN-LABEL: {{^}}use_workitem_id_xyz: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v2 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xyz() #1 { @@ -66,8 +74,10 @@ ; GCN-LABEL: {{^}}use_workitem_id_xz: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] +; GCN-DAG: flat_store_dword 
v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xz() #1 { @@ -80,8 +90,10 @@ ; GCN-LABEL: {{^}}use_workitem_id_yz: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 -; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] +; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_yz() #1 { @@ -108,7 +120,7 @@ ; GCN-NOT: v0 ; GCN-NOT: v1 -; GCN: v_mov_b32_e32 v0, v1 +; GCN: v_lshlrev_b32_e32 v0, 10, v1 ; GCN-NOT: v0 ; GCN-NOT: v1 ; GCN: s_swappc_b64 @@ -122,15 +134,72 @@ ; GCN-NOT: v0 ; GCN-NOT: v2 -; GCN: v_mov_b32_e32 v0, v2 +; GCN: v_lshlrev_b32_e32 v0, 20, v2 ; GCN-NOT: v0 -; GCN-NOT: v2 +; GCN-NOT: v1 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { call void @use_workitem_id_z() ret void } +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; GCN: v_or_b32_e32 v0, v0, [[IDY]] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { + call void @use_workitem_id_xy() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz: +; GCN-NOT: v0 +; GCN-NOT: v2 +; GCN: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 v0, v0, [[IDZ]] +; GCN-NOT: v0 +; GCN-NOT: v2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { + call void @use_workitem_id_xz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: +; GCN-NOT: v1 +; GCN-NOT: v2 +; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; GCN: v_or_b32_e32 v0, [[IDY]], [[IDZ]] +; GCN-NOT: v1 +; GCN-NOT: v2 
+; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 { + call void @use_workitem_id_yz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN-NOT: v2 +; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; GCN-DAG: v_or_b32_e32 v0, v0, [[IDY]] +; GCN-DAG: v_or_b32_e32 v0, v0, [[IDZ]] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN-NOT: v2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 { + call void @use_workitem_id_xyz() + ret void +} + ; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x: ; GCN-NOT: v0 ; GCN: s_swappc_b64 @@ -160,8 +229,9 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 +; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %arg0, i32 addrspace(1)* undef @@ -171,8 +241,9 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 +; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %arg0, i32 addrspace(1)* undef @@ -182,8 +253,9 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: ; GCN: s_waitcnt -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 +; GCN-DAG: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workitem.id.z() store volatile i32 %arg0, i32 addrspace(1)* undef @@ -207,6 +279,7 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: ; GCN: enable_vgpr_workitem_id = 1 +; GCN: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NOT: v1 ; GCN: v_mov_b32_e32 v0, 0x22b ; GCN-NOT: v1 @@ -221,7 +294,7 @@ ; GCN: enable_vgpr_workitem_id = 2 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: v_mov_b32_e32 v1, v2 +; GCN-DAG: v_lshlrev_b32_e32 v1, 20, v2 ; GCN: s_swappc_b64 ; GCN-NOT: v0 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { @@ -232,6 +305,7 @@ ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: v_and_b32_e32 v32, 0x3ff, v32 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -357,6 +431,7 @@ ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt +; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 ; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}} ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -469,15 +544,18 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: v_and_b32_e32 v32, 0x3ff, v32 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_load_dword v32, off, 
s[0:3], s32{{$}} +; GCN: v_bfe_u32 v32, v32, 10, 10 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: v_bfe_u32 v32, v32, 20, 10 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_xyz( @@ -531,19 +609,19 @@ ret void } -; frame[0] = ID X -; frame[1] = ID Y -; frame[2] = ID Z +; frame[0] = ID { Z, Y, X } ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 -; GCN: s_mov_b32 s33, s7 -; GCN: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s33, s7 +; GCN-DAG: s_mov_b32 s32, s33 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-DAG: v_or_b32_e32 v0, v0, v1 +; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-DAG: v_or_b32_e32 v0, v0, v2 +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { call void @too_many_args_use_workitem_id_xyz( @@ -560,19 +638,19 @@ ; workitem ID X in register, yz on stack ; v31 = workitem ID X -; frame[0] = workitem Y -; frame[1] = workitem Z +; frame[0] = workitem { Z, Y, X } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s32{{$}} -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}} -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: 
flat_store_dword v[0:1], [[IDX]] +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]] +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]] ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 -; GCN: ScratchSize: 12 +; GCN: ScratchSize: 8 define void @too_many_args_use_workitem_id_x_stack_yz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -623,18 +701,18 @@ ret void } -; frame[0] = ID Y -; frame[1] = ID Z - ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 ; GCN: s_mov_b32 s33, s7 -; GCN: s_mov_b32 s32, s33 -; GCN-DAG: v_mov_b32_e32 v31, v0 -; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GCN-NOT: v0 +; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-DAG: v_or_b32_e32 v0, v0, v1 +; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-DAG: v_or_b32_e32 v31, v0, v2 + +; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { call void @too_many_args_use_workitem_id_x_stack_yz( diff --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll b/llvm/test/CodeGen/AMDGPU/zext-lid.ll --- a/llvm/test/CodeGen/AMDGPU/zext-lid.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll @@ -1,8 +1,9 @@ -; RUN: llc -march=amdgcn < %s | FileCheck %s -; RUN: llc -O0 -march=amdgcn < %s | FileCheck %s +; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,O2 %s +; RUN: llc -O0 -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s -; CHECK-NOT: and_b32 +; GCN-LABEL: {{^}}zext_grp_size_128: +; GCN-NOT: and_b32 ; OPT-LABEL: @zext_grp_size_128 ; OPT: tail 
call i32 @llvm.amdgcn.workitem.id.x(), !range !0 @@ -24,6 +25,9 @@ ret void } +; GCN-LABEL: {{^}}zext_grp_size_32x4x1: +; GCN-NOT: and_b32 + ; OPT-LABEL: @zext_grp_size_32x4x1 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !3 @@ -44,6 +48,9 @@ ret void } +; GCN-LABEL: {{^}}zext_grp_size_1x1x1: +; GCN-NOT: and_b32 + ; When EarlyCSE is not run this call produces a range max with 0 active bits, ; which is a special case as an AssertZext from width 0 is invalid. ; OPT-LABEL: @zext_grp_size_1x1x1 @@ -55,6 +62,9 @@ ret void } +; GCN-LABEL: {{^}}zext_grp_size_512: +; GCN-NOT: and_b32 + ; OPT-LABEL: @zext_grp_size_512 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !6 @@ -75,6 +85,11 @@ ret void } +; GCN-LABEL: {{^}}func_test_workitem_id_x_known_max_range: +; O2-NOT: and_b32 +; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff, +; O2-NOT: and_b32 + ; OPT-LABEL: @func_test_workitem_id_x_known_max_range( ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 define void @func_test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { @@ -85,6 +100,11 @@ ret void } +; GCN-LABEL: {{^}}func_test_workitem_id_x_default_range: +; O2-NOT: and_b32 +; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff, +; O2-NOT: and_b32 + ; OPT-LABEL: @func_test_workitem_id_x_default_range( ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !7 define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 {