Index: llvm/include/llvm/Transforms/IPO/Attributor.h =================================================================== --- llvm/include/llvm/Transforms/IPO/Attributor.h +++ llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1315,6 +1315,10 @@ return TargetTriple.isAMDGPU() || TargetTriple.isNVPTX(); } + const SmallVectorImpl &getIndirectlyCallableFunctions() const { + return IndirectlyCallableFunctions; + } + private: struct FunctionInfo { ~FunctionInfo(); @@ -1347,6 +1351,10 @@ return *FI; } + /// Vector of functions that might be callable indirectly, i.a., via a + /// function pointer. + SmallVector IndirectlyCallableFunctions; + /// Initialize the function information cache \p FI for the function \p F. /// /// This method needs to be called for all function that might be looked at @@ -1412,6 +1420,10 @@ /// Flag to determine if we should skip all liveness checks early on. bool UseLiveness = true; + /// Flag to indicate if the entire world is contained in this module, that + /// is, no outside functions exist. + bool IsClosedWorldModule = false; + /// Callback function to be invoked on internal functions marked live. std::function InitializationCallback = nullptr; @@ -1687,6 +1699,10 @@ /// Return true if this is a module pass, false otherwise. bool isModulePass() const { return Configuration.IsModulePass; } + /// Return true if the module contains the whole world, thus, no outside + /// functions exist. + bool isClosedWorldModule() const { return Configuration.IsClosedWorldModule; } + /// Return true if we derive attributes for \p Fn bool isRunOn(Function &Fn) const { return isRunOn(&Fn); } bool isRunOn(Function *Fn) const { Index: llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -950,6 +950,7 @@ AC.Allowed = &Allowed; AC.IsModulePass = true; AC.DefaultInitializeLiveInternals = false; + AC.IsClosedWorldModule = true; AC.IPOAmendableCB = [](const Function &F) { return F.getCallingConv() == CallingConv::AMDGPU_KERNEL; }; Index: llvm/lib/Transforms/IPO/Attributor.cpp =================================================================== --- llvm/lib/Transforms/IPO/Attributor.cpp +++ llvm/lib/Transforms/IPO/Attributor.cpp @@ -3251,11 +3251,20 @@ // determine if it is part of a must-tail call edge. This will influence what // attributes we can derive. InformationCache::FunctionInfo &FI = InfoCache.getFunctionInfo(F); - if (!isModulePass() && !FI.CalledViaMustTail) { - for (const Use &U : F.uses()) + if (isClosedWorldModule() || (!isModulePass() && !FI.CalledViaMustTail)) { + bool IsIndirectlyCallable = !isClosedWorldModule() || !F.hasLocalLinkage(); + for (const Use &U : F.uses()) { if (const auto *CB = dyn_cast(U.getUser())) - if (CB->isCallee(&U) && CB->isMustTailCall()) - FI.CalledViaMustTail = true; + if (CB->isCallee(&U)) + if (CB->isMustTailCall()) { + FI.CalledViaMustTail = true; + if (IsIndirectlyCallable) + break; + continue; + } + if (isClosedWorldModule() && IsIndirectlyCallable) + InfoCache.IndirectlyCallableFunctions.push_back(&F); + } } IRPosition FPos = IRPosition::function(F); Index: llvm/lib/Transforms/IPO/AttributorAttributes.cpp =================================================================== --- llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -10401,18 +10401,58 @@ struct AACallEdgesCallSite : public AACallEdgesImpl { AACallEdgesCallSite(const IRPosition &IRP, Attributor &A) : AACallEdgesImpl(IRP, A) {} + /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Change = ChangeStatus::UNCHANGED; + CallBase *CB = cast(getCtxI()); + + auto IsValidTypePun = [](Type &T1, Type &T2) { + if (&T1 == &T2) + return true; + if (T1.isIntOrPtrTy() && T2.isIntOrIntVectorTy()) + return true; + if (T1.isFloatTy() && T2.isFloatTy()) + return true; + if (T1.isDoubleTy() && T2.isDoubleTy()) + return true; + return false; + }; + auto VisitValue = [&](Value &V, const Instruction *CtxI) -> bool { if (Function *Fn = dyn_cast(&V)) { addCalledFunction(Fn, Change); - } else { - LLVM_DEBUG(dbgs() << "[AACallEdges] Unrecognized value: " << V << "\n"); + // Explore all values. + return true; + } + if (!A.isClosedWorldModule()) { + LLVM_DEBUG(if (!hasUnknownCallee()) dbgs() + << "[AACallEdges] Assume unknown callee due to: " << V + << "\n"); setHasUnknownCallee(true, Change); + // Explore all values. + return true; } + unsigned NumArgs = CB->arg_size(); + LLVM_DEBUG(dbgs() << "[AACallEdges] Unrecognized value: " << V + << ", checking indirect callable functions:\n"); + for (auto *Fn : A.getInfoCache().getIndirectlyCallableFunctions()) { + if (Fn->arg_size() != NumArgs) + continue; + if (!IsValidTypePun(*Fn->getReturnType(), *CB->getType())) + continue; + bool Valid = true; + for (unsigned ArgNo = 0; Valid && ArgNo < NumArgs; ++ArgNo) + Valid &= IsValidTypePun(*Fn->getArg(ArgNo)->getType(), + *CB->getArgOperand(ArgNo)->getType()); + if (!Valid) + continue; + LLVM_DEBUG(dbgs() << "[AACallEdges] Add compatible callable function: " + << Fn->getName() << "\n"); + addCalledFunction(Fn, Change); + } // Explore all values. return true; }; @@ -10435,7 +10475,6 @@ VisitValue(*VAC.getValue(), VAC.getCtxI()); }; - CallBase *CB = cast(getCtxI()); if (auto *IA = dyn_cast(CB->getCalledOperand())) { if (IA->hasSideEffects() && Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll @@ -4,51 +4,36 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) { ; CHECK-LABEL: name: test_indirect_call_sgpr_ptr ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr8, $vgpr0, $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr8 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset1, align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p4) = COPY [[COPY2]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) - ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY4]], [[C]](s64) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY9]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[DEF]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY3]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) - ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY5]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY6]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[COPY7]](s32) + ; CHECK-NEXT: $vgpr31 = COPY [[COPY8]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -769,7 +769,7 @@ ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR17]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -806,7 +806,7 @@ ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR17]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] Index: llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll +++ llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll @@ -51,7 +51,7 @@ define amdgpu_kernel void @entry() { ; CHECK-LABEL: define {{[^@]+}}@entry -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [[TMP0:%.*]], align 8, addrspace(5) ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr ; CHECK-NEXT: [[ARST:%.*]] = call double @baz(ptr [[CAST]]) @@ -63,5 +63,6 @@ ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. Index: llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -11,7 +11,7 @@ define internal void @direct() { ; CHECK-LABEL: define {{[^@]+}}@direct -; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; CHECK-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 ; CHECK-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_direct_indirect_call() { ; CHECK-LABEL: define {{[^@]+}}@test_direct_indirect_call -; CHECK-SAME: () #[[ATTR1]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: call void @direct() ; CHECK-NEXT: ret void ; @@ -36,5 +36,4 @@ } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. Index: llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -43,5 +43,5 @@ ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. Index: llvm/test/CodeGen/AMDGPU/indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -28,21 +28,21 @@ ; GCN-NEXT: enable_mem_ordered = 0 ; GCN-NEXT: enable_fwd_progress = 0 ; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 -; GCN-NEXT: user_sgpr_count = 14 +; GCN-NEXT: user_sgpr_count = 8 ; GCN-NEXT: enable_trap_handler = 0 ; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 ; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_vgpr_workitem_id = 0 ; GCN-NEXT: enable_exception_msb = 0 ; GCN-NEXT: granulated_lds_size = 0 ; GCN-NEXT: enable_exception = 0 ; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 -; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 +; GCN-NEXT: enable_sgpr_queue_ptr = 0 ; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 0 ; GCN-NEXT: enable_sgpr_flat_scratch_init = 1 ; GCN-NEXT: enable_sgpr_private_segment_size = 0 ; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -58,7 +58,7 @@ ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 64 +; GCN-NEXT: kernarg_segment_byte_size = 4 ; GCN-NEXT: workgroup_fbarrier_count = 0 ; GCN-NEXT: wavefront_sgpr_count = 68 ; GCN-NEXT: workitem_vgpr_count = 42 @@ -77,26 +77,21 @@ ; GCN-NEXT: .end_amd_kernel_code_t ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GCN-NEXT: s_add_i32 s12, s12, s17 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: s_getpc_b64 s[14:15] -; GCN-NEXT: s_add_u32 s14, s14, gv.fptr0@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s15, s15, gv.fptr0@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, 8 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 -; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_mov_b32 s12, s8 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, gv.fptr0@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, gv.fptr0@rel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s8, s4, 8 +; GCN-NEXT: s_addc_u32 s9, s5, 0 +; GCN-NEXT: v_mov_b32_e32 v31, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm ; ; GISEL-LABEL: test_indirect_call_sgpr_ptr: @@ -121,21 +116,21 @@ ; GISEL-NEXT: enable_mem_ordered = 0 ; GISEL-NEXT: enable_fwd_progress = 0 ; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 -; GISEL-NEXT: user_sgpr_count = 14 +; GISEL-NEXT: user_sgpr_count = 8 ; GISEL-NEXT: enable_trap_handler = 0 ; GISEL-NEXT: enable_sgpr_workgroup_id_x = 1 -; GISEL-NEXT: enable_sgpr_workgroup_id_y = 1 -; GISEL-NEXT: enable_sgpr_workgroup_id_z = 1 +; GISEL-NEXT: enable_sgpr_workgroup_id_y = 0 +; GISEL-NEXT: enable_sgpr_workgroup_id_z = 0 ; GISEL-NEXT: enable_sgpr_workgroup_info = 0 -; GISEL-NEXT: enable_vgpr_workitem_id = 2 +; GISEL-NEXT: enable_vgpr_workitem_id = 0 ; GISEL-NEXT: enable_exception_msb = 0 ; GISEL-NEXT: granulated_lds_size = 0 ; GISEL-NEXT: enable_exception = 0 ; GISEL-NEXT: enable_sgpr_private_segment_buffer = 1 -; GISEL-NEXT: enable_sgpr_dispatch_ptr = 1 -; GISEL-NEXT: enable_sgpr_queue_ptr = 1 +; GISEL-NEXT: enable_sgpr_dispatch_ptr = 0 +; GISEL-NEXT: enable_sgpr_queue_ptr = 0 ; GISEL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GISEL-NEXT: enable_sgpr_dispatch_id = 1 +; GISEL-NEXT: enable_sgpr_dispatch_id = 0 ; GISEL-NEXT: enable_sgpr_flat_scratch_init = 1 ; GISEL-NEXT: enable_sgpr_private_segment_size = 0 ; GISEL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -151,7 +146,7 @@ ; GISEL-NEXT: workitem_private_segment_byte_size = 16384 ; GISEL-NEXT: workgroup_group_segment_byte_size = 0 ; GISEL-NEXT: gds_segment_byte_size = 0 -; GISEL-NEXT: kernarg_segment_byte_size = 64 +; GISEL-NEXT: kernarg_segment_byte_size = 4 ; GISEL-NEXT: workgroup_fbarrier_count = 0 ; GISEL-NEXT: wavefront_sgpr_count = 68 ; GISEL-NEXT: workitem_vgpr_count = 42 @@ -170,26 +165,22 @@ ; GISEL-NEXT: .end_amd_kernel_code_t ; GISEL-NEXT: ; %bb.0: ; GISEL-NEXT: s_mov_b32 s32, 0 -; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GISEL-NEXT: s_add_i32 s12, s12, s17 -; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GISEL-NEXT: s_add_i32 s6, s6, s9 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s9 ; GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GISEL-NEXT: s_mov_b32 s13, s15 -; GISEL-NEXT: s_mov_b32 s12, s14 -; GISEL-NEXT: s_getpc_b64 s[14:15] -; GISEL-NEXT: s_add_u32 s14, s14, gv.fptr0@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s15, s15, gv.fptr0@rel32@hi+12 -; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GISEL-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-NEXT: s_add_u32 s8, s8, 8 -; GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; GISEL-NEXT: v_or_b32_e32 v31, v0, v1 -; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_getpc_b64 s[6:7] +; GISEL-NEXT: s_add_u32 s6, s6, gv.fptr0@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s7, s7, gv.fptr0@rel32@hi+12 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 s4, s4, 8 +; GISEL-NEXT: s_addc_u32 s5, s5, 0 +; GISEL-NEXT: s_mov_b32 s12, s8 +; GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v31, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: s_endpgm %fptr = load ptr, ptr addrspace(4) @gv.fptr0 call void %fptr() @@ -219,21 +210,21 @@ ; GCN-NEXT: enable_mem_ordered = 0 ; GCN-NEXT: enable_fwd_progress = 0 ; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 -; GCN-NEXT: user_sgpr_count = 14 +; GCN-NEXT: user_sgpr_count = 8 ; GCN-NEXT: enable_trap_handler = 0 ; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 ; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_vgpr_workitem_id = 0 ; GCN-NEXT: enable_exception_msb = 0 ; GCN-NEXT: granulated_lds_size = 0 ; GCN-NEXT: enable_exception = 0 ; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 -; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 +; GCN-NEXT: enable_sgpr_queue_ptr = 0 ; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 0 ; GCN-NEXT: enable_sgpr_flat_scratch_init = 1 ; GCN-NEXT: enable_sgpr_private_segment_size = 0 ; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -249,7 +240,7 @@ ; GCN-NEXT: workitem_private_segment_byte_size = 16384 ; GCN-NEXT: workgroup_group_segment_byte_size = 0 ; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 64 +; GCN-NEXT: kernarg_segment_byte_size = 4 ; GCN-NEXT: workgroup_fbarrier_count = 0 ; GCN-NEXT: wavefront_sgpr_count = 68 ; GCN-NEXT: workitem_vgpr_count = 42 @@ -268,27 +259,22 @@ ; GCN-NEXT: .end_amd_kernel_code_t ; GCN-NEXT: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GCN-NEXT: s_add_i32 s12, s12, s17 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s13, s15 -; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: s_getpc_b64 s[14:15] -; GCN-NEXT: s_add_u32 s14, s14, gv.fptr1@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s15, s15, gv.fptr1@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, 8 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s12, s8 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, gv.fptr1@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, gv.fptr1@rel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s8, s4, 8 +; GCN-NEXT: s_addc_u32 s9, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b -; GCN-NEXT: s_mov_b32 s14, s16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm ; ; GISEL-LABEL: test_indirect_call_sgpr_ptr_arg: @@ -313,21 +299,21 @@ ; GISEL-NEXT: enable_mem_ordered = 0 ; GISEL-NEXT: enable_fwd_progress = 0 ; GISEL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 -; GISEL-NEXT: user_sgpr_count = 14 +; GISEL-NEXT: user_sgpr_count = 8 ; GISEL-NEXT: enable_trap_handler = 0 ; GISEL-NEXT: enable_sgpr_workgroup_id_x = 1 -; GISEL-NEXT: enable_sgpr_workgroup_id_y = 1 -; GISEL-NEXT: enable_sgpr_workgroup_id_z = 1 +; GISEL-NEXT: enable_sgpr_workgroup_id_y = 0 +; GISEL-NEXT: enable_sgpr_workgroup_id_z = 0 ; GISEL-NEXT: enable_sgpr_workgroup_info = 0 -; GISEL-NEXT: enable_vgpr_workitem_id = 2 +; GISEL-NEXT: enable_vgpr_workitem_id = 0 ; GISEL-NEXT: enable_exception_msb = 0 ; GISEL-NEXT: granulated_lds_size = 0 ; GISEL-NEXT: enable_exception = 0 ; GISEL-NEXT: enable_sgpr_private_segment_buffer = 1 -; GISEL-NEXT: enable_sgpr_dispatch_ptr = 1 -; GISEL-NEXT: enable_sgpr_queue_ptr = 1 +; GISEL-NEXT: enable_sgpr_dispatch_ptr = 0 +; GISEL-NEXT: enable_sgpr_queue_ptr = 0 ; GISEL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GISEL-NEXT: enable_sgpr_dispatch_id = 1 +; GISEL-NEXT: enable_sgpr_dispatch_id = 0 ; GISEL-NEXT: enable_sgpr_flat_scratch_init = 1 ; GISEL-NEXT: enable_sgpr_private_segment_size = 0 ; GISEL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -343,7 +329,7 @@ ; GISEL-NEXT: workitem_private_segment_byte_size = 16384 ; GISEL-NEXT: workgroup_group_segment_byte_size = 0 ; GISEL-NEXT: gds_segment_byte_size = 0 -; GISEL-NEXT: kernarg_segment_byte_size = 64 +; GISEL-NEXT: kernarg_segment_byte_size = 4 ; GISEL-NEXT: workgroup_fbarrier_count = 0 ; GISEL-NEXT: wavefront_sgpr_count = 68 ; GISEL-NEXT: workitem_vgpr_count = 42 @@ -362,27 +348,23 @@ ; GISEL-NEXT: .end_amd_kernel_code_t ; GISEL-NEXT: ; %bb.0: ; GISEL-NEXT: s_mov_b32 s32, 0 -; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GISEL-NEXT: s_add_i32 s12, s12, s17 -; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GISEL-NEXT: s_add_i32 s6, s6, s9 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s9 ; GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GISEL-NEXT: s_mov_b32 s13, s15 -; GISEL-NEXT: s_mov_b32 s12, s14 -; GISEL-NEXT: s_getpc_b64 s[14:15] -; GISEL-NEXT: s_add_u32 s14, s14, gv.fptr1@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s15, s15, gv.fptr1@rel32@hi+12 -; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GISEL-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-NEXT: s_add_u32 s8, s8, 8 -; GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GISEL-NEXT: s_getpc_b64 s[6:7] +; GISEL-NEXT: s_add_u32 s6, s6, gv.fptr1@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s7, s7, gv.fptr1@rel32@hi+12 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 s4, s4, 8 +; GISEL-NEXT: s_addc_u32 s5, s5, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_mov_b32 s12, s8 +; GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: s_endpgm %fptr = load ptr, ptr addrspace(4) @gv.fptr1 call void %fptr(i32 123) Index: llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -43,9 +43,9 @@ ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 @@ -74,5 +74,5 @@ ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;.