Index: lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -89,6 +89,21 @@ } } +/// If \p U is instruction or constant, collect functions which directly or +/// indirectly use it. +static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) { + if (auto *I = dyn_cast<Instruction>(U)) { + auto *F = I->getParent()->getParent(); + Funcs.insert(F); + collectCallers(F, Funcs); + return; + } + if (!isa<Constant>(U)) + return; + for (auto UU : U->users()) + collectFunctionUsers(&*UU, Funcs); +} + bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { DenseSet<Function *> Callers; auto &C = M.getContext(); @@ -101,6 +116,7 @@ M.getDataLayout()); F.setName(Name); } + DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n'); auto RuntimeHandle = (F.getName() + ".runtime_handle").str(); auto *GV = new GlobalVariable( M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), @@ -111,22 +127,15 @@ DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); for (auto U : F.users()) { - if (!isa<ConstantExpr>(&*U)) + auto *UU = &*U; + if (!isa<ConstantExpr>(UU)) continue; - auto *BitCast = cast<ConstantExpr>(&*U); + collectFunctionUsers(UU, Callers); + auto *BitCast = cast<ConstantExpr>(UU); auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType()); BitCast->replaceAllUsesWith(NewPtr); F.addFnAttr("runtime-handle", RuntimeHandle); F.setLinkage(GlobalValue::ExternalLinkage); - - // Collect direct or indirect callers of enqueue_kernel. 
- for (auto U : NewPtr->users()) { - if (auto *I = dyn_cast<Instruction>(&*U)) { - auto *F = I->getParent()->getParent(); - Callers.insert(F); - collectCallers(F, Callers); - } - } Changed = true; } } @@ -136,6 +145,7 @@ if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) continue; F->addFnAttr("calls-enqueue-kernel"); + DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n'); } return Changed; } Index: test/CodeGen/AMDGPU/enqueue-kernel.ll =================================================================== --- test/CodeGen/AMDGPU/enqueue-kernel.ll +++ test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -1,9 +1,9 @@ ; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s -; CHECK: @__test_block_invoke_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* -; CHECK: @__test_block_invoke_2_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* -; CHECK: @__amdgpu_enqueued_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* -; CHECK: @__amdgpu_enqueued_kernel.1.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__test_block_invoke_kernel.runtime_handle = addrspace(1) global i8 addrspace(1)* null +; CHECK: @__test_block_invoke_2_kernel.runtime_handle = addrspace(1) global i8 addrspace(1)* null +; CHECK: @__amdgpu_enqueued_kernel.runtime_handle = addrspace(1) global i8 addrspace(1)* null +; CHECK: @__amdgpu_enqueued_kernel.1.runtime_handle = addrspace(1) global i8 addrspace(1)* null %struct.ndrange_t = type { i32 } %opencl.queue_t = type opaque @@ -80,6 +80,19 @@ ret void } +; __enqueue_kernel* functions may get inlined +; CHECK-LABEL: define amdgpu_kernel void @inlined_caller +; CHECK-SAME: #[[AT_CALLER]] +; CHECK-NOT: @__test_block_invoke_kernel +; CHECK: load i64, i64 addrspace(1)* bitcast (i8 addrspace(1)* addrspace(1)* @__test_block_invoke_kernel.runtime_handle to i64 
addrspace(1)*) +define amdgpu_kernel void @inlined_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr + !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { +entry: + %tmp = load i64, i64 addrspace(1)* addrspacecast (i64* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i64*) to i64 addrspace(1)*) + store i64 %tmp, i64 addrspace(1)* %c + ret void +} + ; CHECK-LABEL: define dso_local amdgpu_kernel void @__test_block_invoke_kernel ; CHECK-SAME: #[[AT1:[0-9]+]] define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(1)*, i8 }> %arg) #0