Index: llvm/trunk/docs/AMDGPUUsage.rst =================================================================== --- llvm/trunk/docs/AMDGPUUsage.rst +++ llvm/trunk/docs/AMDGPUUsage.rst @@ -1039,10 +1039,10 @@ passed in the kernarg. "HiddenCompletionAction" - *TBD* - - .. TODO - Add description. + A global address space pointer + to help link enqueued kernels into + the ancestor tree for determining + when the parent kernel has finished. "ValueType" string Required Kernel argument value type. Only present if "ValueKind" is Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -25,12 +25,20 @@ // linkage does not work since optimization passes will try to replace loads // of the global variable with its initialization value. // +// It also identifies the kernels that directly or indirectly enqueue kernels +// and adds "calls-enqueue-kernel" function attribute to them, which will +// be used to determine whether to emit runtime metadata for the kernel +// enqueue related hidden kernel arguments. +// //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/User.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -66,7 +74,22 @@ return new AMDGPUOpenCLEnqueuedBlockLowering(); } +/// Collect direct or indirect callers of \p F and save them +/// to \p Callers. 
+static void collectCallers(Function *F, DenseSet &Callers) { + for (auto U : F->users()) { + if (auto *CI = dyn_cast(&*U)) { + auto *Caller = CI->getParent()->getParent(); + if (Callers.count(Caller)) + continue; + Callers.insert(Caller); + collectCallers(Caller, Callers); + } + } +} + bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { + DenseSet Callers; auto &C = M.getContext(); auto AS = AMDGPU::getAMDGPUAS(M); bool Changed = false; @@ -91,8 +114,23 @@ AddrCast->replaceAllUsesWith(NewPtr); F.addFnAttr("runtime-handle", RuntimeHandle); F.setLinkage(GlobalValue::ExternalLinkage); + + // Collect direct or indirect callers of enqueue_kernel. + for (auto U : NewPtr->users()) { + if (auto *I = dyn_cast(&*U)) { + auto *F = I->getParent()->getParent(); + Callers.insert(F); + collectCallers(F, Callers); + } + } Changed = true; } } + + for (auto F : Callers) { + if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) + continue; + F->addFnAttr("calls-enqueue-kernel"); + } return Changed; } Index: llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp +++ llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp @@ -266,12 +266,21 @@ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); - if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - return; - auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUASI.GLOBAL_ADDRESS); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts"); + if (CallsPrintf) + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!CallsPrintf) { + // Emit a dummy argument so that the remaining hidden arguments + // have a fixed position 
relative to the first hidden argument. + // This is to facilitate library code to access hidden arguments. + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + } + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); + } } void MetadataStreamer::emitKernelArg(const Argument &Arg) { Index: llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -9,7 +9,21 @@ %struct.ndrange_t = type { i32 } %opencl.queue_t = type opaque -define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr +; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space +define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr + !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { + ret void +} + +; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]] +define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr + !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { + call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) + ret void +} + +; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]] +define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 
!kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { entry: %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8 @@ -77,6 +91,7 @@ ret void } +; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" } ; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle" ; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle" Index: llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll +++ llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll @@ -0,0 +1,96 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK: --- +; CHECK: Version: [ 1, 0 ] +; CHECK-NOT: Printf: +; CHECK: Kernels: + +; CHECK: - Name: test_non_enqueue_kernel_caller +; CHECK-NEXT: SymbolName: 'test_non_enqueue_kernel_caller@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NOT: ValueKind: HiddenNone +; CHECK-NOT: 
ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_enqueue_kernel_caller +; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenDefaultQueue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenCompletionAction +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +attributes #0 = { "calls-enqueue-kernel" } + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"char"} +!4 = !{!""} + +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} + + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS Index: 
llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -51,6 +51,8 @@ ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction define amdgpu_kernel void @test_char(i8 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { @@ -1267,7 +1269,52 @@ ret void } +; CHECK: - Name: test_enqueue_kernel_caller +; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenDefaultQueue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenCompletionAction +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1 + 
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } +attributes #1 = { "calls-enqueue-kernel" } !llvm.printf.fmts = !{!100, !101}