Index: llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -40,19 +40,22 @@ bool IsSet : 1; public: - ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, + constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, bool IsSet = false) : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} - static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { + static constexpr ArgDescriptor createRegister(Register Reg, + unsigned Mask = ~0u) { return ArgDescriptor(Reg, Mask, false, true); } - static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { + static constexpr ArgDescriptor createStack(unsigned Offset, + unsigned Mask = ~0u) { return ArgDescriptor(Offset, Mask, true, true); } - static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { + static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, + unsigned Mask) { return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); } @@ -141,7 +144,7 @@ ArgDescriptor ImplicitArgPtr; // Input registers for non-HSA ABI - ArgDescriptor ImplicitBufferPtr = 0; + ArgDescriptor ImplicitBufferPtr; // VGPRs inputs. These are always v0, v1 and v2 for entry functions. 
ArgDescriptor WorkItemIDX; @@ -150,11 +153,14 @@ std::pair getPreloadedValue(PreloadedValue Value) const; + + static constexpr AMDGPUFunctionArgInfo fixedABILayout(); }; class AMDGPUArgumentUsageInfo : public ImmutablePass { private: static const AMDGPUFunctionArgInfo ExternFunctionInfo; + static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; DenseMap ArgInfoMap; public: @@ -175,15 +181,7 @@ ArgInfoMap[&F] = ArgInfo; } - const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const { - auto I = ArgInfoMap.find(&F); - if (I == ArgInfoMap.end()) { - assert(F.isDeclaration()); - return ExternFunctionInfo; - } - - return I->second; - } + const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; }; } // end namespace llvm Index: llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -8,6 +8,8 @@ #include "AMDGPU.h" #include "AMDGPUArgumentUsageInfo.h" +#include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" #include "llvm/Support/NativeFormatting.h" #include "llvm/Support/raw_ostream.h" @@ -43,6 +45,10 @@ const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; +// Hardcoded registers from fixed function ABI +const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo + = AMDGPUFunctionArgInfo::fixedABILayout(); + bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) { return false; } @@ -133,3 +139,41 @@ } llvm_unreachable("unexpected preloaded value type"); } + +constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { + AMDGPUFunctionArgInfo AI; + AI.PrivateSegmentBuffer = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + AI.DispatchPtr = AMDGPU::SGPR4_SGPR5; + AI.QueuePtr = AMDGPU::SGPR6_SGPR7; + + // Do not pass kernarg segment pointer, only pass increment version in its + // 
place. + AI.ImplicitArgPtr = AMDGPU::SGPR8_SGPR9; + AI.DispatchID = AMDGPU::SGPR10_SGPR11; + + // Skip FlatScratchInit/PrivateSegmentSize + AI.WorkGroupIDX = AMDGPU::SGPR12; + AI.WorkGroupIDY = AMDGPU::SGPR13; + AI.WorkGroupIDZ = AMDGPU::SGPR14; + + const unsigned Mask = 0x3ff; + AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask); + AI.WorkItemIDY = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 10); + AI.WorkItemIDZ = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 20); + return AI; +} + +const AMDGPUFunctionArgInfo & +AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { + auto I = ArgInfoMap.find(&F); + if (I == ArgInfoMap.end()) { + if (AMDGPUTargetMachine::EnableFixedFunctionABI) + return FixedABIFunctionInfo; + + // Without the fixed ABI, we assume no function has special inputs. + assert(F.isDeclaration()); + return ExternFunctionInfo; + } + + return I->second; +} Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -39,6 +39,7 @@ public: static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; + static bool EnableFixedFunctionABI; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -139,6 +139,13 @@ cl::init(true), cl::Hidden); +static cl::opt EnableAMDGPUFixedFunctionABIOpt( + "amdgpu-fixed-function-abi", + cl::desc("Enable all implicit function arguments"), + cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), + cl::init(false), + cl::Hidden); + // Enable lib calls simplifications static cl::opt EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ 
-372,6 +379,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; +bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -439,6 +439,10 @@ MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; + void allocateSpecialInputVGPRsFixed(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; }; } // End namespace llvm Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1688,10 +1688,11 @@ return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) const { +/// Allocate implicit function VGPR arguments at the end of allocated user +/// arguments. +void SITargetLowering::allocateSpecialInputVGPRs( + CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { const unsigned Mask = 0x3ff; ArgDescriptor Arg; @@ -1709,6 +1710,20 @@ Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } +/// Allocate implicit function VGPR arguments in fixed registers. 
+void SITargetLowering::allocateSpecialInputVGPRsFixed( + CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { + Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); + if (!Reg) + report_fatal_error("failed to allocated VGPR for implicit arguments"); + + const unsigned Mask = 0x3ff; + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10)); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20)); +} + void SITargetLowering::allocateSpecialInputSGPRs( CCState &CCInfo, MachineFunction &MF, @@ -2091,6 +2106,10 @@ if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + } else { + // For the fixed ABI, pass workitem IDs in the last argument register. + if (AMDGPUTargetMachine::EnableFixedFunctionABI) + allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } if (IsKernel) { @@ -2202,7 +2221,7 @@ InVals.push_back(Val); } - if (!IsEntryFunc) { + if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { // Special inputs come after user arguments. 
allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); } @@ -2483,6 +2502,8 @@ if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + if (!CCInfo.AllocateReg(OutgoingArg->getRegister())) + report_fatal_error("failed to allocate implicit input argument"); } else { unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4); SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, @@ -2549,6 +2570,7 @@ if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + CCInfo.AllocateReg(OutgoingArg->getRegister()); } else { unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4); SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, @@ -2723,12 +2745,19 @@ } const SIMachineFunctionInfo *Info = MF.getInfo(); + SmallVector, 8> RegsToPass; + SmallVector MemOpChains; // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); + if (AMDGPUTargetMachine::EnableFixedFunctionABI) { + // With a fixed ABI, allocate fixed registers before user arguments. + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + } + CCInfo.AnalyzeCallOperands(Outs, AssignFn); // Get a count of how many bytes are to be pushed on the stack. @@ -2747,7 +2776,6 @@ // arguments to begin at SP+0. Completely unused for non-tail calls. int32_t FPDiff = 0; MachineFrameInfo &MFI = MF.getFrameInfo(); - SmallVector, 8> RegsToPass; // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass @@ -2764,7 +2792,6 @@ Chain = DAG.getTokenFactor(DL, CopyFromChains); } - SmallVector MemOpChains; MVT PtrVT = MVT::i32; // Walk the register/memloc assignments, inserting copies/loads. @@ -2860,8 +2887,10 @@ } } - // Copy special input registers after user input arguments. 
- passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + if (!AMDGPUTargetMachine::EnableFixedFunctionABI) { + // Copy special input registers after user input arguments. + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -8,6 +8,7 @@ #include "SIMachineFunctionInfo.h" #include "AMDGPUArgumentUsageInfo.h" +#include "AMDGPUTargetMachine.h" #include "AMDGPUSubtarget.h" #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -60,6 +61,11 @@ // calls. const bool HasCalls = FrameInfo.hasCalls() || F.hasFnAttribute("amdgpu-calls"); + // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't + // have any calls. 
+ const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && + (!isEntryFunction() || HasCalls); + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { if (!F.arg_empty()) KernargSegmentPtr = true; @@ -94,23 +100,33 @@ } } - if (F.hasFnAttribute("amdgpu-work-group-id-x")) + if (UseFixedABI) { WorkGroupIDX = true; - - if (F.hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; - - if (F.hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; - - if (F.hasFnAttribute("amdgpu-work-item-id-x")) WorkItemIDX = true; - - if (F.hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; - - if (F.hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; + ImplicitArgPtr = true; + } else { + if (F.hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; + + if (F.hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F.hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F.hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; + + if (F.hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F.hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + } bool HasStackObjects = FrameInfo.hasStackObjects(); @@ -133,19 +149,27 @@ if (isAmdHsaOrMesa) { PrivateSegmentBuffer = true; - if (F.hasFnAttribute("amdgpu-dispatch-ptr")) + if (UseFixedABI) { DispatchPtr = true; - - if (F.hasFnAttribute("amdgpu-queue-ptr")) QueuePtr = true; - if (F.hasFnAttribute("amdgpu-dispatch-id")) + // FIXME: We don't need this? 
DispatchID = true; + } else { + if (F.hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + + if (F.hasFnAttribute("amdgpu-queue-ptr")) + QueuePtr = true; + + if (F.hasFnAttribute("amdgpu-dispatch-id")) + DispatchID = true; + } } else if (ST.isMesaGfxShader(F)) { ImplicitBufferPtr = true; } - if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) + if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -0,0 +1,343 @@ +; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s + +; GCN-LABEL: {{^}}use_dispatch_ptr: +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +define hidden void @use_dispatch_ptr() #1 { + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}use_queue_ptr: +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +define hidden void @use_queue_ptr() #1 { + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %value = load volatile 
i32, i32 addrspace(4)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}use_kernarg_segment_ptr: +; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 0 +; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0 +define hidden void @use_kernarg_segment_ptr() #1 { + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}use_implicitarg_ptr: +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +define hidden void @use_implicitarg_ptr() #1 { + %implicit.arg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %implicit.arg.ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}use_dispatch_id: +; GCN: ; use s[10:11] +define hidden void @use_dispatch_id() #1 { + %id = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %id) + ret void +} +; GCN-LABEL: {{^}}use_workgroup_id_x: +; GCN: s_waitcnt +; GCN: ; use s12 +define hidden void @use_workgroup_id_x() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_stack_workgroup_id_x: +; GCN: s_waitcnt +; GCN-NOT: s32 +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN: ; use s12 +; GCN: s_setpc_b64 +define hidden void @use_stack_workgroup_id_x() #1 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %val = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_y: +; GCN: s_waitcnt +; GCN: ; use s13 +define hidden void @use_workgroup_id_y() #1 { + %val = call i32 
@llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_z: +; GCN: s_waitcnt +; GCN: ; use s14 +define hidden void @use_workgroup_id_z() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xy: +; GCN: ; use s12 +; GCN: ; use s13 +define hidden void @use_workgroup_id_xy() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xyz: +; GCN: ; use s12 +; GCN: ; use s13 +; GCN: ; use s14 +define hidden void @use_workgroup_id_xyz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.y() + %val2 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + call void asm sideeffect "; use $0", "s"(i32 %val2) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xz: +; GCN: ; use s12 +; GCN: ; use s14 +define hidden void @use_workgroup_id_xz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_yz: +; GCN: ; use s13 +; GCN: ; use s14 +define hidden void @use_workgroup_id_yz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.y() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; Argument is in right place already +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x: +; GCN-NOT: s12 +; GCN-NOT: s13 +; GCN-NOT: s14 +; GCN: 
v_readlane_b32 s4, v32, 0 +define hidden void @func_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: +; GCN-NOT: s4 +; GCN: v_readlane_b32 s4, v32, 0 +define hidden void @func_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: +; GCN-NOT: s4 +; GCN: v_readlane_b32 s4, v32, 0 +define hidden void @func_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x: +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s12 +define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y: +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s13 +define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z: +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s14 +define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_every_sgpr_input: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: ; use s[10:11] +; GCN: ; use s12 +; GCN: ; use s13 +; GCN: ; use s14 +define hidden void @use_every_sgpr_input() #1 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc + + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 + %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input: +; GCN: s_mov_b32 s33, s17 +; GCN: s_mov_b32 s12, s14 +; GCN: s_mov_b32 s13, s15 +; GCN: s_mov_b32 s14, s16 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 + +; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN: .amdhsa_user_sgpr_dispatch_ptr 1 +; GCN: .amdhsa_user_sgpr_queue_ptr 1 +; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GCN: .amdhsa_user_sgpr_dispatch_id 1 
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +; GCN: .amdhsa_user_sgpr_private_segment_size 0 +; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +; GCN: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN: .amdhsa_system_sgpr_workgroup_id_y 1 +; GCN: .amdhsa_system_sgpr_workgroup_id_z 1 +; GCN: .amdhsa_system_sgpr_workgroup_info 0 +; GCN: .amdhsa_system_vgpr_workitem_id 2 +define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { + call void @use_every_sgpr_input() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_every_sgpr_input: +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN-NOT: s8 +; GCN-NOT: s9 +; GCN-NOT: s10 +; GCN-NOT: s11 +; GCN-NOT: s12 +; GCN-NOT: s13 +; GCN-NOT: s[6:7] +; GCN-NOT: s[8:9] +; GCN-NOT: s[10:11] +; GCN-NOT: s[12:13] +; GCN-NOT: s14 +; GCN: s_or_saveexec_b64 s[16:17], -1 +define hidden void @func_indirect_use_every_sgpr_input() #1 { + call void @use_every_sgpr_input() + ret void +} + +; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz: +; GCN-NOT: s12 +; GCN-NOT: s13 +; GCN-NOT: s14 +; GCN: ; use s[10:11] +; GCN: ; use s12 +; GCN: ; use s13 +; GCN: ; use s14 + +; GCN: s_swappc_b64 +define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc + + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* 
%kernarg_segment_ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + call void @use_workgroup_id_xyz() + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind noinline } Index: llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,8 +1,10 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s +; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 +; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 
0x3ff, v0 +; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -14,7 +16,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_y: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 +; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 +; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -26,7 +29,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_z: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 +; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 +; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -38,8 +42,12 @@ ; GCN-LABEL: {{^}}use_workitem_id_xy: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 + +; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 + ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-NEXT: s_waitcnt @@ -54,9 +62,16 @@ ; GCN-LABEL: {{^}}use_workitem_id_xyz: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 + +; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 + +; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 + + ; GCN-DAG: 
{{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] @@ -74,8 +89,12 @@ ; GCN-LABEL: {{^}}use_workitem_id_xz: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 + +; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 + ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt @@ -90,8 +109,12 @@ ; GCN-LABEL: {{^}}use_workitem_id_yz: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 + +; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 + ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt @@ -105,24 +128,39 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: -; GCN: enable_vgpr_workitem_id = 0 +; VARABI: enable_vgpr_workitem_id = 0 +; FIXEDABI: enable_vgpr_workitem_id = 2 -; GCN-NOT: v0 +; FIXEDA-NOT: v0 +; VARABI-NOT: v31 ; GCN: s_swappc_b64 -; GCN-NOT: v0 +; FIXEDABI-NOT: v0 +; VARABI-NOT: v31 define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { call void @use_workitem_id_x() ret void } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: -; GCN: enable_vgpr_workitem_id = 1 +; VARABI: enable_vgpr_workitem_id = 1 +; FIXEDABI: enable_vgpr_workitem_id = 2 + +; FIXEDABI-NOT: v0 +; FIXEDABI-NOT: v1 + +; VARABI-NOT: v31 +; VARABI: 
v_lshlrev_b32_e32 v0, 10, v1 + + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + +; FIXEDABI-NOT: v0 +; FIXEDABI-NOT: v1 +; VARABI-NOT: v31 -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: v_lshlrev_b32_e32 v0, 10, v1 -; GCN-NOT: v0 -; GCN-NOT: v1 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { call void @use_workitem_id_y() @@ -132,11 +170,17 @@ ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: ; GCN: enable_vgpr_workitem_id = 2 -; GCN-NOT: v0 -; GCN-NOT: v2 -; GCN: v_lshlrev_b32_e32 v0, 20, v2 -; GCN-NOT: v0 -; GCN-NOT: v1 +; VARABI-NOT: v0 +; VARABI-NOT: v2 +; VARABI: v_lshlrev_b32_e32 v0, 20, v2 +; VARABI-NOT: v0 +; VARABI-NOT: v1 + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { call void @use_workitem_id_z() @@ -144,12 +188,18 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; GCN: v_or_b32_e32 v0, v0, [[IDY]] -; GCN-NOT: v0 -; GCN-NOT: v1 +; VARABI-NOT: v0 +; VARABI-NOT: v1 +; VARABI: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; VARABI: v_or_b32_e32 v0, v0, [[IDY]] +; VARABI-NOT: v0 +; VARABI-NOT: v1 + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { call void @use_workitem_id_xy() @@ -157,12 +207,19 @@ } ; GCN-LABEL: 
{{^}}kern_indirect_use_workitem_id_xz: -; GCN-NOT: v0 -; GCN-NOT: v2 -; GCN: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 v0, v0, [[IDZ]] -; GCN-NOT: v0 -; GCN-NOT: v2 +; VARABI-NOT: v0 +; VARABI-NOT: v2 +; VARABI: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; VARABI: v_or_b32_e32 v0, v0, [[IDZ]] +; VARABI-NOT: v0 +; VARABI-NOT: v2 + + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 { call void @use_workitem_id_xz() @@ -170,13 +227,20 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: -; GCN-NOT: v1 -; GCN-NOT: v2 -; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; GCN: v_or_b32_e32 v0, [[IDY]], [[IDZ]] -; GCN-NOT: v1 -; GCN-NOT: v2 +; VARABI-NOT: v1 +; VARABI-NOT: v2 +; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; VARABI: v_or_b32_e32 v0, [[IDY]], [[IDZ]] +; VARABI-NOT: v1 +; VARABI-NOT: v2 + + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 { call void @use_workitem_id_yz() @@ -184,16 +248,22 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN-NOT: v2 -; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; GCN-DAG: v_or_b32_e32 v0, v0, [[IDY]] -; GCN-DAG: v_or_b32_e32 v0, v0, [[IDZ]] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN-NOT: v2 +; VARABI-NOT: v0 +; VARABI-NOT: v1 +; VARABI-NOT: v2 +; VARABI-DAG: 
v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDY]] +; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDZ]] +; VARABI-NOT: v0 +; VARABI-NOT: v1 +; VARABI-NOT: v2 + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 { call void @use_workitem_id_xyz() @@ -229,7 +299,9 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 +; VARABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 +; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 + ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { @@ -241,7 +313,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 +; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 +; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { @@ -253,7 +326,8 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 +; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 +; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { @@ -265,10 +339,17 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: -; GCN: 
enable_vgpr_workitem_id = 0 +; VARABI: enable_vgpr_workitem_id = 0 +; FIXEDABI: enable_vgpr_workitem_id = 2 + +; VARABI: v_mov_b32_e32 v1, v0 +; VARABI: v_mov_b32_e32 v0, 0x22b + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; GCN: v_mov_b32_e32 v1, v0 -; GCN: v_mov_b32_e32 v0, 0x22b ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { call void @other_arg_use_workitem_id_x(i32 555) @@ -277,14 +358,20 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: -; GCN: enable_vgpr_workitem_id = 1 - -; GCN: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NOT: v1 -; GCN: v_mov_b32_e32 v0, 0x22b -; GCN-NOT: v1 -; GCN: s_swappc_b64 -; GCN-NOT: v0 +; VARABI: enable_vgpr_workitem_id = 1 + +; VARABI: v_lshlrev_b32_e32 v1, 10, v1 +; VARABI-NOT: v1 +; VARABI: v_mov_b32_e32 v0, 0x22b +; VARABI-NOT: v1 +; VARABI: s_swappc_b64 +; VARABI-NOT: v0 + +; FIXEDABI: enable_vgpr_workitem_id = 2 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { call void @other_arg_use_workitem_id_y(i32 555) ret void @@ -293,24 +380,33 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: ; GCN: enable_vgpr_workitem_id = 2 -; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: v_lshlrev_b32_e32 v1, 20, v2 -; GCN: s_swappc_b64 -; GCN-NOT: v0 +; VARABI-DAG: v_mov_b32_e32 v0, 0x22b +; VARABI-DAG: v_lshlrev_b32_e32 v1, 20, v2 +; VARABI: s_swappc_b64 +; VARABI-NOT: v0 + + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, 
[[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { call void @other_arg_use_workitem_id_z(i32 555) ret void } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GCN: v_and_b32_e32 v32, 0x3ff, v32 -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 +; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} +; VARABI: v_and_b32_e32 v32, 0x3ff, v32 +; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 +; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VARABI-NEXT: s_waitcnt +; VARABI-NEXT: s_setpc_b64 + +; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 +; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} define void @too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -359,12 +455,25 @@ } ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: -; GCN: enable_vgpr_workitem_id = 0 +; VARABI: enable_vgpr_workitem_id = 0 -; GCN: s_mov_b32 s33, s7 -; GCN: s_mov_b32 s32, s33 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN: s_swappc_b64 +; VARABI: s_mov_b32 s33, s7 +; VARABI: s_mov_b32 s32, s33 +; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}} +; VARABI: s_swappc_b64 + + +; FIXEDABI: enable_vgpr_workitem_id = 2 +; FIXEDABI: s_mov_b32 s33, s17 +; FIXEDABI-DAG: s_mov_b32 s32, s33 +; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; 
FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} + +; FIXEDABI: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { call void @too_many_args_use_workitem_id_x( i32 10, i32 20, i32 30, i32 40, @@ -379,8 +488,16 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: -; GCN: s_mov_b32 s34, s32 -; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} +; VARABI: s_mov_b32 s34, s32 +; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}} + +; Touching the workitem id register is not necessary. +; FIXEDABI-NOT: v31 +; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} +; FIXEDABI-NOT: v31 +; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; FIXEDABI-NOT: v31 + ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { store volatile i32 %arg0, i32 addrspace(1)* undef @@ -422,20 +539,29 @@ ret void } -; stack layout: +; var abi stack layout: ; frame[0] = byval arg32 ; frame[1] = stack passed workitem ID x ; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 -; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}} -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN: s_setpc_b64 +; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VARABI-NEXT: s_waitcnt +; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32 +; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 +; VARABI: 
buffer_load_dword v0, off, s[0:3], s32{{$}} +; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VARABI: s_setpc_b64 + + +; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 +; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 + +; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}} +; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4{{$}} +; FIXEDABI: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -483,24 +609,46 @@ ret void } +; var abi stack layout: ; sp[0] = byval ; sp[1] = ?? ; sp[2] = stack passed workitem ID x ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: -; GCN: enable_vgpr_workitem_id = 0 -; GCN-DAG: s_mov_b32 s33, s7 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 -; GCN: s_add_u32 s32, s33, 0x400{{$}} - -; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 - -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], -; GCN: s_swappc_b64 +; VARABI: enable_vgpr_workitem_id = 0 +; VARABI-DAG: s_mov_b32 s33, s7 +; VARABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; VARABI: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 +; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 +; VARABI: s_add_u32 s32, s33, 0x400{{$}} + +; VARABI-NOT: s32 +; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 + +; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], +; VARABI: s_swappc_b64 + + +; FIXEDABI: s_mov_b32 s33, s17 +; FIXEDABI-DAG: 
s_add_u32 s32, s33, 0x400 +; FIXEDABI-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33 offset:4{{$}} + +; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 +; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} + +; FIXME: Why this reload? +; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], s33 offset:4{{$}} + +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] + +; FIXEDABI-NOT: s32 +; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4 +; FIXEDABI: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, i32 addrspace(5)* %alloca @@ -518,13 +666,27 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}} -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], -; GCN: s_swappc_b64 +; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; VARABI: buffer_store_dword [[K]], off, s[0:3], s34{{$}} +; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} +; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], +; VARABI: s_swappc_b64 + + +; FIXEDABI-NOT: v31 +; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s34{{$}} + +; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} +; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} 
+; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} + +; FIXEDABI-NOT: v31 +; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; FIXEDABI-NOT: v31 +; FIXEDABI: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 999, i32 addrspace(5)* %alloca @@ -543,25 +705,36 @@ } ; Only one stack load should be emitted for all 3 values. ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} -; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GCN-NOT: buffer_load_dword - -; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32 -; GCN-NOT: buffer_load_dword -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] -; GCN-NOT: buffer_load_dword -; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10 -; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10 -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] -; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] - -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 +; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VARABI: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} +; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} +; VARABI-NOT: buffer_load_dword + +; VARABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32 +; VARABI-NOT: buffer_load_dword +; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] +; VARABI-NOT: buffer_load_dword +; 
VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10 +; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10 +; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] +; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] + +; VARABI: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VARABI-NEXT: s_waitcnt +; VARABI-NEXT: s_setpc_b64 + + +; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 +; FIXEDABI-NOT: buffer_load_dword +; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]] +; FIXEDABI-NOT: buffer_load_dword +; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10 +; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10 +; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] +; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] + define void @too_many_args_use_workitem_id_xyz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -613,19 +786,23 @@ ret void } -; frame[0] = ID { Z, Y, X } - ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 -; GCN-DAG: s_mov_b32 s33, s7 +; VARABI-DAG: s_mov_b32 s33, s7 +; FIXEDABI-DAG: s_mov_b32 s33, s17 ; GCN-DAG: s_mov_b32 s32, s33 -; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-DAG: v_or_b32_e32 v0, v0, v1 -; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-DAG: v_or_b32_e32 v0, v0, v2 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 +; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] +; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]] +; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}} + +; 
FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] +; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 +; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} + ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { call void @too_many_args_use_workitem_id_xyz( @@ -640,7 +817,7 @@ ret void } -; workitem ID X in register, yz on stack +; Var abi: workitem ID X in register, yz on stack ; v31 = workitem ID X ; frame[0] = workitem { Z, Y, X } @@ -708,7 +885,8 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 -; GCN: s_mov_b32 s33, s7 +; VARABI: s_mov_b32 s33, s7 +; FIXEDABI: s_mov_b32 s33, s17 ; GCN-NOT: v0 ; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1