Index: llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -41,6 +42,21 @@ GRID_SIZE_Z = 20 }; +// Field offsets to implicit kernel argument pointer. +enum ImplicitArgOffsets { + BLOCK_COUNT_X = 0, + BLOCK_COUNT_Y = 4, + BLOCK_COUNT_Z = 8, + + GROUP_SIZE_X = 12, + GROUP_SIZE_Y = 14, + GROUP_SIZE_Z = 16, + + REMAINDER_X = 18, + REMAINDER_Y = 20, + REMAINDER_Z = 22, +}; + class AMDGPULowerKernelAttributes : public ModulePass { public: static char ID; @@ -60,6 +76,144 @@ } // end anonymous namespace +static bool processImplicitArgUse(CallInst *CI) { + Function *F = CI->getParent()->getParent(); + const bool HasUniformWorkGroupSize = + F->getFnAttribute("uniform-work-group-size").getValueAsBool(); + + if (!HasUniformWorkGroupSize) + return false; + + Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; + Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; + Value *Remainders[3] = {nullptr, nullptr, nullptr}; + + const DataLayout &DL = F->getParent()->getDataLayout(); + + // We expect to see several GEP users, casted to the appropriate type and + // loaded. 
+ for (User *U : CI->users()) { + if (!U->hasOneUse()) + continue; + + int64_t Offset = 0; + BitCastInst *BCI = dyn_cast(U); + if (!BCI) { + if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) + continue; + BCI = dyn_cast(*U->user_begin()); + } + + if (!BCI || !BCI->hasOneUse()) + continue; + + auto *Load = dyn_cast(*BCI->user_begin()); + if (!Load || !Load->isSimple()) + continue; + + unsigned LoadSize = DL.getTypeStoreSize(Load->getType()); + + // TODO: Handle merged loads. + switch (Offset) { + case BLOCK_COUNT_X: + if (LoadSize == 4) + BlockCounts[0] = Load; + break; + case BLOCK_COUNT_Y: + if (LoadSize == 4) + BlockCounts[1] = Load; + break; + case BLOCK_COUNT_Z: + if (LoadSize == 4) + BlockCounts[2] = Load; + break; + case GROUP_SIZE_X: + if (LoadSize == 2) + GroupSizes[0] = Load; + break; + case GROUP_SIZE_Y: + if (LoadSize == 2) + GroupSizes[1] = Load; + break; + case GROUP_SIZE_Z: + if (LoadSize == 2) + GroupSizes[2] = Load; + break; + case REMAINDER_X: + if (LoadSize == 2) + Remainders[0] = Load; + break; + case REMAINDER_Y: + if (LoadSize == 2) + Remainders[1] = Load; + break; + case REMAINDER_Z: + if (LoadSize == 2) + Remainders[2] = Load; + break; + default: + break; + } + } + + // Under v5 __ockl_get_local_size returns the value computed by the expression: + // + // workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder + // + // For functions with the attribute uniform-work-group-size=true, we can evaluate + // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned + // for __ockl_get_local_size. + bool MadeChange = false; + for (int I = 0; I < 3; ++I) { + Value *BlockCount = BlockCounts[I]; + if (!BlockCount) + continue; + + using namespace llvm::PatternMatch; + auto GroupIDIntrin = + I == 0 ? m_Intrinsic() + : (I == 1 ? 
m_Intrinsic() + : m_Intrinsic()); + + for (User *ICmp : BlockCount->users()) { + ICmpInst::Predicate Pred; + if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) { + if (Pred != ICmpInst::ICMP_ULT) + continue; + ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType())); + MadeChange = true; + } + } + } + + // All remainders should be 0 with uniform work group size. + for (Value *Remainder : Remainders) { + if (!Remainder) + continue; + Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType())); + MadeChange = true; + } + + // If reqd_work_group_size is set, we can replace work group size with it. + auto MD = F->getMetadata("reqd_work_group_size"); + const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; + if (!HasReqdWorkGroupSize) + return MadeChange; + + for (int I = 0; I < 3; I++) { + Value *GroupSize = GroupSizes[I]; + if (!GroupSize) + continue; + + ConstantInt *KnownSize = mdconst::extract(MD->getOperand(I)); + GroupSize->replaceAllUsesWith( + ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false)); + MadeChange = true; + } + + return MadeChange; +} + static bool processUse(CallInst *CI) { Function *F = CI->getParent()->getParent(); @@ -217,27 +371,46 @@ // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get // TargetPassConfig for subtarget. bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { - StringRef DispatchPtrName - = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr); + bool MadeChange = false; + bool IsV5OrLater = AMDGPU::getAmdhsaCodeObjectVersion() >= 5; + if (IsV5OrLater) { + StringRef ImplicitArgPtrName = + Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr); + Function *ImplicitArgPtr = M.getFunction(ImplicitArgPtrName); + + if (!ImplicitArgPtr) // ImplicitArg ptr not used. 
+ return false; + + SmallPtrSet HandledImplicitArgUses; + for (auto *U : ImplicitArgPtr->users()) { + CallInst *CI = cast(U); + if (HandledImplicitArgUses.insert(CI).second) { + if (processImplicitArgUse(CI)) + MadeChange = true; + } + } + } else { // Pre-V5 + StringRef DispatchPtrName = + Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr); + Function *DispatchPtr = M.getFunction(DispatchPtrName); - Function *DispatchPtr = M.getFunction(DispatchPtrName); - if (!DispatchPtr) // Dispatch ptr not used. + if (!DispatchPtr) // Dispatch ptr not used. return false; - bool MadeChange = false; - - SmallPtrSet HandledUses; - for (auto *U : DispatchPtr->users()) { - CallInst *CI = cast(U); - if (HandledUses.insert(CI).second) { - if (processUse(CI)) - MadeChange = true; + SmallPtrSet HandledUses; + for (auto *U : DispatchPtr->users()) { + CallInst *CI = cast(U); + if (HandledUses.insert(CI).second) { + if (processUse(CI)) + MadeChange = true; + } } } return MadeChange; } + INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU Kernel Attributes", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, @@ -253,14 +426,20 @@ AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { StringRef DispatchPtrName = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr); + StringRef ImplicitArgPtrName = + Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr); Function *DispatchPtr = F.getParent()->getFunction(DispatchPtrName); - if (!DispatchPtr) // Dispatch ptr not used. + Function *ImplicitArgPtr = F.getParent()->getFunction(ImplicitArgPtrName); + if (!DispatchPtr && !ImplicitArgPtr) // Dispatch /ImplicitArg ptr not used. 
return PreservedAnalyses::all(); + bool IsV5OrLater = AMDGPU::getAmdhsaCodeObjectVersion() >= 5; for (Instruction &I : instructions(F)) { if (CallInst *CI = dyn_cast(&I)) { - if (CI->getCalledFunction() == DispatchPtr) + if (IsV5OrLater && ImplicitArgPtr && CI->getCalledFunction() == ImplicitArgPtr) + processImplicitArgUse(CI); + if (!IsV5OrLater && DispatchPtr && CI->getCalledFunction() == DispatchPtr) processUse(CI); } } Index: llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_local_size_x(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_local_size_x( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12 +; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)* +; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4 +; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x() + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %bc.block.count.x = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %block.count.x = load i32, i32 addrspace(4)* %bc.block.count.x, align 4 + %cmp.id.count = icmp ult i32 %group.id, %block.count.x + %local.size.offset = select i1 %cmp.id.count, i64 12, i64 18 + %gep.local.size = getelementptr inbounds i8, i8 
addrspace(4)* %implicitarg.ptr, i64 %local.size.offset + %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)* + %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2 + store i16 %local.size, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_local_size_y(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_local_size_y( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14 +; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)* +; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 2 +; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y() + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.block.count.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 4 + %bc.block.count.y = bitcast i8 addrspace(4)* %gep.block.count.y to i32 addrspace(4)* + %block.count.y = load i32, i32 addrspace(4)* %bc.block.count.y, align 4 + %cmp.id.count = icmp ult i32 %group.id, %block.count.y + %local.size.offset = select i1 %cmp.id.count, i64 14, i64 20 + %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset + %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)* + %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2 + store i16 %local.size, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_local_size_z(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: 
@get_local_size_z( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16 +; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)* +; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4 +; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z() + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.block.count.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 8 + %bc.block.count.z = bitcast i8 addrspace(4)* %gep.block.count.z to i32 addrspace(4)* + %block.count.z = load i32, i32 addrspace(4)* %bc.block.count.z, align 4 + %cmp.id.count = icmp ult i32 %group.id, %block.count.z + %local.size.offset = select i1 %cmp.id.count, i64 16, i64 22 + %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset + %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)* + %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2 + store i16 %local.size, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_remainder_x(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_remainder_x( +; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18 + %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)* + %remainder.x = load i16, i16 addrspace(4)* %bc.x, align 2 + store i16 %remainder.x, i16 addrspace(1)* %out + ret void +} + +; 
Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_remainder_y(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_remainder_y( +; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 20 + %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)* + %remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2 + store i16 %remainder.y, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_remainder_z(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_remainder_z( +; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 22 + %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)* + %remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2 + store i16 %remainder.z, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_work_group_size_x(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_work_group_size_x( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12 +; GCN-NEXT: [[BC_X:%.*]] = bitcast i8 addrspace(4)* [[GEP_X]] to i16 addrspace(4)* +; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, i16 addrspace(4)* [[BC_X]], align 4 +; GCN-NEXT: store i16 [[GROUP_SIZE_X]], i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* 
@llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12 + %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)* + %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2 + store i16 %group.size.x, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_work_group_size_y(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_work_group_size_y( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14 +; GCN-NEXT: [[BC_Y:%.*]] = bitcast i8 addrspace(4)* [[GEP_Y]] to i16 addrspace(4)* +; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, i16 addrspace(4)* [[BC_Y]], align 2 +; GCN-NEXT: store i16 [[GROUP_SIZE_Y]], i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14 + %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)* + %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2 + store i16 %group.size.y, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_work_group_size_z(i16 addrspace(1)* %out) #0 { +; GCN-LABEL: @get_work_group_size_z( +; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16 +; GCN-NEXT: [[BC_Z:%.*]] = bitcast i8 addrspace(4)* [[GEP_Z]] to i16 addrspace(4)* +; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, i16 addrspace(4)* [[BC_Z]], align 4 +; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + 
%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16 + %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)* + %group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2 + store i16 %group.size.z, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_work_group_size_x_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 { +; GCN-LABEL: @get_work_group_size_x_reqd( +; GCN-NEXT: store i16 8, i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12 + %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)* + %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2 + store i16 %group.size.x, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_work_group_size_y_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 { +; GCN-LABEL: @get_work_group_size_y_reqd( +; GCN-NEXT: store i16 16, i16 addrspace(1)* [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14 + %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)* + %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2 + store i16 %group.size.y, i16 addrspace(1)* %out + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define amdgpu_kernel void @get_work_group_size_z_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 { +; GCN-LABEL: @get_work_group_size_z_reqd( +; GCN-NEXT: store i16 2, i16 addrspace(1)* [[OUT:%.*]], align 
2 +; GCN-NEXT: ret void +; + %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16 + %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)* + %group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2 + store i16 %group.size.z, i16 addrspace(1)* %out + ret void +} + + +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1 +declare i32 @llvm.amdgcn.workgroup.id.x() #1 +declare i32 @llvm.amdgcn.workgroup.id.y() #1 +declare i32 @llvm.amdgcn.workgroup.id.z() #1 + +!llvm.module.flags = !{!1} + +attributes #0 = { nounwind "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable } +!0 = !{i32 8, i32 16, i32 2} +!1 = !{i32 1, !"amdgpu_code_object_version", i32 500}