diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1382,6 +1382,11 @@ to the runtime printf buffer is passed in kernarg. + "HiddenHostcallBuffer" + A global address space pointer + to the runtime hostcall buffer + is passed in kernarg. + "HiddenDefaultQueue" A global address space pointer to the OpenCL device enqueue @@ -1876,6 +1881,11 @@ to the runtime printf buffer is passed in kernarg. + "hidden_hostcall_buffer" + A global address space pointer + to the runtime hostcall buffer + is passed in kernarg. + "hidden_default_queue" A global address space pointer to the OpenCL device enqueue diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h --- a/llvm/include/llvm/Support/AMDGPUMetadata.h +++ b/llvm/include/llvm/Support/AMDGPUMetadata.h @@ -75,6 +75,7 @@ HiddenDefaultQueue = 12, HiddenCompletionAction = 13, HiddenMultiGridSyncArg = 14, + HiddenHostcallBuffer = 15, Unknown = 0xff }; diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -119,6 +119,7 @@ .Case("hidden_global_offset_z", true) .Case("hidden_none", true) .Case("hidden_printf_buffer", true) + .Case("hidden_hostcall_buffer", true) .Case("hidden_default_queue", true) .Case("hidden_completion_action", true) .Case("hidden_multigrid_sync_arg", true) diff --git a/llvm/lib/Support/AMDGPUMetadata.cpp b/llvm/lib/Support/AMDGPUMetadata.cpp --- a/llvm/lib/Support/AMDGPUMetadata.cpp +++ b/llvm/lib/Support/AMDGPUMetadata.cpp @@ -62,6 +62,7 @@ YIO.enumCase(EN, "HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ); YIO.enumCase(EN, "HiddenNone", ValueKind::HiddenNone); YIO.enumCase(EN, "HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer); + YIO.enumCase(EN, "HiddenHostcallBuffer", ValueKind::HiddenHostcallBuffer); 
YIO.enumCase(EN, "HiddenDefaultQueue", ValueKind::HiddenDefaultQueue); YIO.enumCase(EN, "HiddenCompletionAction", ValueKind::HiddenCompletionAction); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -421,7 +421,12 @@ if (HiddenArgNumBytes >= 32) { if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); - else + else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { + // The printf runtime binding pass should have ensured that hostcall and + // printf are not used in the same module. + assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenHostcallBuffer); + } else emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); } @@ -854,7 +859,12 @@ if (HiddenArgNumBytes >= 32) { if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args); - else + else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { + // The printf runtime binding pass should have ensured that hostcall and + // printf are not used in the same module. 
+ assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); + emitKernelArg(DL, Int8PtrTy, "hidden_hostcall_buffer", Offset, Args); + } else emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -582,6 +582,15 @@ if (Printfs.empty()) return false; + if (auto HostcallFunction = M.getFunction("__ockl_hostcall_internal")) { + for (auto &U : HostcallFunction->uses()) { + if (auto *CI = dyn_cast<CallInst>(U.getUser())) { + M.getContext().emitError( + CI, "Cannot use both printf and hostcall in the same module"); + } + } + } + TD = &M.getDataLayout(); auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -42,6 +42,7 @@ ; CHECK-NEXT: .size: 8 ; CHECK-NOT: .value_kind: hidden_default_queue ; CHECK-NOT: .value_kind: hidden_completion_action +; CHECK-NOT: .value_kind: hidden_hostcall_buffer ; CHECK-NEXT: .value_kind: hidden_printf_buffer ; CHECK-NEXT: .value_type: i8 ; CHECK: .value_kind: hidden_multigrid_sync_arg diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -49,6 +49,7 @@ ; CHECK-NEXT: ValueType: I64 ; CHECK-NEXT: - Size: 8 ; CHECK-NEXT: Align: 8 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: Global diff --git 
a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll @@ -0,0 +1,55 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK: --- +; CHECK: amdhsa.kernels: +; CHECK: - .args: +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 + +; CHECK-NOT: .value_kind: hidden_hostcall_buffer + +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_kernel +; CHECK: .symbol: test_kernel.kd + +define amdgpu_kernel void @test_kernel(i8 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 + +attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"char"} +!4 = !{!""} + +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll 
b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK: --- +; CHECK: Version: [ 1, 0 ] +; CHECK: Kernels: + +; CHECK: - Name: test_kernel +; CHECK-NEXT: SymbolName: 'test_kernel@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction + +define amdgpu_kernel void @test_kernel(i8 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"char"} +!4 = !{!""} + +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll 
b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK: --- +; CHECK: amdhsa.kernels: +; CHECK: - .args: +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_kernel +; CHECK: .symbol: test_kernel.kd + +declare <2 x i64> @__ockl_hostcall_internal(i8* %0, i32 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) + +define amdgpu_kernel void @test_kernel(i8 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: amdhsa.version: +; CHECK-NEXT: - 1 +; CHECK-NEXT: - 0 + +attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"char"} +!4 = 
!{!""} + +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK: --- +; CHECK: Version: [ 1, 0 ] +; CHECK: Kernels: + +; CHECK: - Name: test_kernel +; CHECK-NEXT: SymbolName: 'test_kernel@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenHostcallBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction + +declare <2 x i64> @__ockl_hostcall_internal(i8* %0, i32 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) + +define amdgpu_kernel void @test_kernel(i8 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 
!kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"char"} +!4 = !{!""} + +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll @@ -0,0 +1,18 @@ +; RUN: not opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-printf-runtime-binding < %s 2>&1 | FileCheck %s + +@.str = private unnamed_addr addrspace(2) constant [6 x i8] c"%s:%d\00", align 1 + +define amdgpu_kernel void @test_kernel(i32 %n) { +entry: + %str = alloca [9 x i8], align 1 + %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %str, i32 0, i32 0 + %call1 = call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %n) + %call2 = call <2 x i64> (i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64) @__ockl_hostcall_internal(i8* undef, i32 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9) + ret void +} + +declare i32 @printf(i8 addrspace(2)*, ...) + +declare <2 x i64> @__ockl_hostcall_internal(i8* %0, i32 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) + +; CHECK: error: Cannot use both printf and hostcall in the same module