diff --git a/openmp/libomptarget/include/dlwrap.h b/openmp/libomptarget/include/dlwrap.h --- a/openmp/libomptarget/include/dlwrap.h +++ b/openmp/libomptarget/include/dlwrap.h @@ -137,9 +137,9 @@ } // namespace dlwrap -#define DLWRAP_INSTANTIATE(SYM_USE, SYM_DEF, ARITY) \ - DLWRAP_INSTANTIATE_##ARITY(SYM_USE, SYM_DEF, \ - dlwrap::trait) +#define DLWRAP_INSTANTIATE(ARITY, SYM_USE, SYM_DEF) \ + DLWRAP_INSTANTIATE_DISPATCH(ARITY, dlwrap::trait, \ + SYM_DEF, dlwrap::SYM_USE##_Trait::get()) #define DLWRAP_FINALIZE_IMPL() \ static size_t dlwrap::size() { return DLWRAP_ID(); } \ @@ -154,14 +154,14 @@ return &dlwrap_pointers.data()[i]; \ } -#define DLWRAP_COMMON(SYMBOL, ARITY) \ +#define DLWRAP_COMMON(ARITY, SYMBOL) \ DLWRAP_INC(); \ DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1); \ namespace dlwrap { \ - struct SYMBOL##_Trait : public dlwrap::trait { \ + struct SYMBOL##_Trait { \ using T = dlwrap::trait; \ static T::FunctionType get() { \ - verboseAssert::nargs>(); \ + verboseAssert(); \ constexpr size_t Index = DLWRAP_ID() - 1; \ void *P = *dlwrap::pointer(Index); \ return reinterpret_cast(P); \ @@ -170,117 +170,84 @@ } #define DLWRAP_IMPL(SYMBOL, ARITY) \ - DLWRAP_COMMON(SYMBOL, ARITY); \ - DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY) + DLWRAP_COMMON(ARITY, SYMBOL); \ + DLWRAP_INSTANTIATE(ARITY, SYMBOL, SYMBOL) #define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) \ - DLWRAP_COMMON(SYMBOL, ARITY); \ - static DLWRAP_INSTANTIATE(SYMBOL, dlwrap_##SYMBOL, ARITY) - -#define DLWRAP_INSTANTIATE_0(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); } -#define DLWRAP_INSTANTIATE_1(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) { \ - return dlwrap::SYM_USE##_Trait::get()(x0); \ - } -#define DLWRAP_INSTANTIATE_2(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1); \ - } -#define DLWRAP_INSTANTIATE_3(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2); \ - } -#define DLWRAP_INSTANTIATE_4(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3); \ - } -#define DLWRAP_INSTANTIATE_5(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3, \ - typename T::template arg<4>::type x4) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4); \ - } -#define DLWRAP_INSTANTIATE_6(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3, \ - typename T::template arg<4>::type x4, \ - typename T::template arg<5>::type x5) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5); \ - } + DLWRAP_COMMON(ARITY, SYMBOL); \ + static DLWRAP_INSTANTIATE(ARITY, SYMBOL, dlwrap_##SYMBOL) + +// Write a function with symbol SYM_DEF as a call to CALL, +// using dlwrap::trait T to query for argument types +#define DLWRAP_INSTANTIATE_DISPATCH(ARITY, T, SYM_DEF, CALL) \ + T::ReturnType SYM_DEF(DLWRAP_INSTANTIATE_PARAMETERS_DISPATCH(ARITY, T)) { \ + return CALL(DLWRAP_INSTANTIATE_ARGUMENTS_DISPATCH(ARITY)); \ + } + +// Instantiate function parameters or argument based on traits object and arity. + +// Emit sequence of x0, x1, ..., x(ARITY-1) +#define DLWRAP_INSTANTIATE_ARGUMENTS_DISPATCH(ARITY) \ + DLWRAP_INSTANTIATE_ARGUMENTS_##ARITY() +// Emit sequence of type0 x0, type1 x1, ..., type(ARITY-1) x(ARITY-1) +#define DLWRAP_INSTANTIATE_PARAMETERS_DISPATCH(ARITY, T) \ + DLWRAP_INSTANTIATE_PARAMETERS_##ARITY(T) + +// Base cases +#define DLWRAP_INSTANTIATE_PARAMETERS_0(T) void +#define DLWRAP_INSTANTIATE_ARGUMENTS_0(T) + +#define DLWRAP_INSTANTIATE_PARAMETERS_1(T) typename T::template arg<0>::type x0 +#define DLWRAP_INSTANTIATE_ARGUMENTS_1(T) x0 + +// The sequential instantiations below were generated to avoid typos: +#if 0 +// clang-format off +echo 'for i in range(2, 12): print(f""" +#define DLWRAP_INSTANTIATE_ARGUMENTS_{i}() DLWRAP_INSTANTIATE_ARGUMENTS_{i-1}(), x{i-1} +#define DLWRAP_INSTANTIATE_PARAMETERS_{i}(T) \\ + DLWRAP_INSTANTIATE_PARAMETERS_{i-1}(T), typename T::template arg<{i-1}>::type x{i-1}""")' | python3 +// clang-format on +#endif -#define DLWRAP_INSTANTIATE_7(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3, \ - typename T::template arg<4>::type x4, \ - typename T::template arg<5>::type x5, \ - typename T::template arg<6>::type x6) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6); \ - } +#define DLWRAP_INSTANTIATE_ARGUMENTS_2() DLWRAP_INSTANTIATE_ARGUMENTS_1(), x1 +#define DLWRAP_INSTANTIATE_PARAMETERS_2(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_1(T), typename T::template arg<1>::type x1 -#define DLWRAP_INSTANTIATE_8(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3, \ - typename T::template arg<4>::type x4, \ - typename T::template arg<5>::type x5, \ - typename T::template arg<6>::type x6, \ - typename T::template arg<7>::type x7) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7); \ - } -#define DLWRAP_INSTANTIATE_9(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3, \ - typename T::template arg<4>::type x4, \ - typename T::template arg<5>::type x5, \ - typename T::template arg<6>::type x6, \ - typename T::template arg<7>::type x7, \ - typename T::template arg<8>::type x8) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \ - } -#define DLWRAP_INSTANTIATE_10(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3, \ - typename T::template arg<4>::type x4, \ - typename T::template arg<5>::type x5, \ - typename T::template arg<6>::type x6, \ - typename T::template arg<7>::type x7, \ - typename T::template arg<8>::type x8, \ - typename T::template arg<9>::type x9) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ - x9); \ - } -#define DLWRAP_INSTANTIATE_11(SYM_USE, SYM_DEF, T) \ - T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ - typename T::template arg<1>::type x1, \ - typename T::template arg<2>::type x2, \ - typename T::template arg<3>::type x3, \ - typename T::template arg<4>::type x4, \ - typename T::template arg<5>::type x5, \ - typename T::template arg<6>::type x6, \ - typename T::template arg<7>::type x7, \ - typename T::template arg<8>::type x8, \ - typename T::template arg<9>::type x9, \ - typename T::template arg<10>::type x10) { \ - return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ - x9, x10); \ - } +#define DLWRAP_INSTANTIATE_ARGUMENTS_3() DLWRAP_INSTANTIATE_ARGUMENTS_2(), x2 +#define DLWRAP_INSTANTIATE_PARAMETERS_3(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_2(T), typename T::template arg<2>::type x2 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_4() DLWRAP_INSTANTIATE_ARGUMENTS_3(), x3 +#define DLWRAP_INSTANTIATE_PARAMETERS_4(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_3(T), typename T::template arg<3>::type x3 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_5() DLWRAP_INSTANTIATE_ARGUMENTS_4(), x4 +#define DLWRAP_INSTANTIATE_PARAMETERS_5(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_4(T), typename T::template arg<4>::type x4 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_6() DLWRAP_INSTANTIATE_ARGUMENTS_5(), x5 +#define DLWRAP_INSTANTIATE_PARAMETERS_6(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_5(T), typename T::template arg<5>::type x5 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_7() DLWRAP_INSTANTIATE_ARGUMENTS_6(), x6 +#define DLWRAP_INSTANTIATE_PARAMETERS_7(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_6(T), typename T::template arg<6>::type x6 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_8() DLWRAP_INSTANTIATE_ARGUMENTS_7(), x7 +#define DLWRAP_INSTANTIATE_PARAMETERS_8(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_7(T), typename T::template arg<7>::type x7 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_9() DLWRAP_INSTANTIATE_ARGUMENTS_8(), x8 +#define DLWRAP_INSTANTIATE_PARAMETERS_9(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_8(T), typename T::template arg<8>::type x8 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_10() DLWRAP_INSTANTIATE_ARGUMENTS_9(), x9 +#define DLWRAP_INSTANTIATE_PARAMETERS_10(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_9(T), typename T::template arg<9>::type x9 + +#define DLWRAP_INSTANTIATE_ARGUMENTS_11() DLWRAP_INSTANTIATE_ARGUMENTS_10(), x10 +#define DLWRAP_INSTANTIATE_PARAMETERS_11(T) \ + DLWRAP_INSTANTIATE_PARAMETERS_10(T), typename T::template arg<10>::type x10 #endif diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -31,6 +31,7 @@ #include "rt.h" #include "DeviceEnvironment.h" +#include "dlwrap.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "print_tracing.h" @@ -38,6 +39,53 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" +// Define the exported interface for this plugin with compile time error +// checking. +#define EXPORT(SYMBOL, ARITY) \ + EXPORT_IMPL(ARITY, SYMBOL, __tgt_rtl_##SYMBOL, \ + dlwrap::trait) + +// Implements __tgt_rtl_func in terms of an anonymous namespace exported::func +#define EXPORT_IMPL(ARITY, SYMBOL, API_SYMBOL, T) \ + namespace { \ + namespace exported { \ + T::ReturnType SYMBOL(DLWRAP_INSTANTIATE_PARAMETERS_DISPATCH(ARITY, T)); \ + } \ + } \ + extern "C" T::ReturnType API_SYMBOL( \ + DLWRAP_INSTANTIATE_PARAMETERS_DISPATCH(ARITY, T)) { \ + dlwrap::verboseAssert(); \ + return exported::SYMBOL(DLWRAP_INSTANTIATE_ARGUMENTS_DISPATCH(ARITY)); \ + } + +// The EXPORTS will fail to compile if __tgt_rtl_SYMBOL is not declared, e.g. +// if spelled incorrectly relative to the declaration in omptargetplugin +// +// If the function exported::SYMBOL is not defined, clang warns and link fails +// The symbols in exported:: are name mangled, so if there are too many/few +// parameters, or a type doesn't match, that's the same as the symbol not being +// defined. +// +// Provided the function that is intended to be exported by this plugin is named +// in the following list, the compiler thus checks the name and type. + +EXPORT(is_valid_binary, 1); +EXPORT(number_of_devices, 0); +EXPORT(init_requires, 1); +EXPORT(init_device, 1); +EXPORT(load_binary, 2); +EXPORT(data_alloc, 4); +EXPORT(data_submit, 4); +EXPORT(data_submit_async, 5); +EXPORT(data_retrieve, 4); +EXPORT(data_retrieve_async, 5); +EXPORT(data_delete, 2); +EXPORT(run_target_team_region, 8); +EXPORT(run_target_region, 5); +EXPORT(run_target_team_region_async, 9); +EXPORT(run_target_region_async, 6); +EXPORT(synchronize, 2); + // hostrpc interface, FIXME: consider moving to its own include these are // statically linked into amdgpu/plugin if present from hostrpc_services.a, // linked as --whole-archive to override the weak symbols that are used to @@ -1634,12 +1682,13 @@ } } // namespace core -extern "C" { -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { +namespace { +namespace exported { +int32_t is_valid_binary(__tgt_device_image *image) { return elf_machine_id_is_amdgcn(image); } -int __tgt_rtl_number_of_devices() { +int number_of_devices() { // If the construction failed, no methods are safe to call if (DeviceInfo.ConstructionSucceeded) { return DeviceInfo.NumberOfDevices; @@ -1649,13 +1698,13 @@ } } -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { +int64_t init_requires(int64_t RequiresFlags) { DP("Init requires flags to %ld\n", RequiresFlags); DeviceInfo.RequiresFlags = RequiresFlags; return RequiresFlags; } -int32_t __tgt_rtl_init_device(int device_id) { +int32_t init_device(int device_id) { hsa_status_t err; // this is per device id init @@ -1813,8 +1862,7 @@ static __tgt_target_table * __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { +__tgt_target_table *load_binary(int32_t device_id, __tgt_device_image *image) { DeviceInfo.load_run_lock.lock(); __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); DeviceInfo.load_run_lock.unlock(); @@ -2188,7 +2236,7 @@ return DeviceInfo.getOffloadEntriesTable(device_id); } -void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { +void *data_alloc(int device_id, int64_t size, void *, int32_t kind) { void *ptr = NULL; assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); @@ -2206,49 +2254,47 @@ return ptr; } -int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { +int32_t data_submit(int device_id, void *tgt_ptr, void *hst_ptr, int64_t size) { assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); __tgt_async_info AsyncInfo; int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - return __tgt_rtl_synchronize(device_id, &AsyncInfo); + return exported::synchronize(device_id, &AsyncInfo); } -int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size, __tgt_async_info *AsyncInfo) { +int32_t data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size, __tgt_async_info *AsyncInfo) { assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); if (AsyncInfo) { initAsyncInfo(AsyncInfo); return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); } else { - return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); + return exported::data_submit(device_id, tgt_ptr, hst_ptr, size); } } -int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { +int32_t data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); __tgt_async_info AsyncInfo; int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; - return __tgt_rtl_synchronize(device_id, &AsyncInfo); + return exported::synchronize(device_id, &AsyncInfo); } -int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, - void *tgt_ptr, int64_t size, - __tgt_async_info *AsyncInfo) { +int32_t data_retrieve_async(int device_id, void *hst_ptr, void *tgt_ptr, + int64_t size, __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); initAsyncInfo(AsyncInfo); return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); } -int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { +int32_t data_delete(int device_id, void *tgt_ptr) { assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); hsa_status_t err; DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); @@ -2260,12 +2306,10 @@ return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, int32_t num_teams, - int32_t thread_limit, - uint64_t loop_tripcount) { +int32_t run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount) { DeviceInfo.load_run_lock.lock_shared(); int32_t res = @@ -2276,23 +2320,24 @@ return res; } -int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, - int32_t arg_num) { +int32_t run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { // use one team and one thread // fix thread num int32_t team_num = 1; int32_t thread_limit = 0; // use default - return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + return exported::run_target_team_region(device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, thread_limit, 0); } -int32_t __tgt_rtl_run_target_team_region_async( - int32_t device_id, void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, - int32_t thread_limit, uint64_t loop_tripcount, - __tgt_async_info *AsyncInfo) { +int32_t run_target_team_region_async(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, + uint64_t loop_tripcount, + __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); initAsyncInfo(AsyncInfo); @@ -2305,21 +2350,19 @@ return res; } -int32_t __tgt_rtl_run_target_region_async(int32_t device_id, - void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, - __tgt_async_info *AsyncInfo) { +int32_t run_target_region_async(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, __tgt_async_info *AsyncInfo) { // use one team and one thread // fix thread num int32_t team_num = 1; int32_t thread_limit = 0; // use default - return __tgt_rtl_run_target_team_region_async( + return exported::run_target_team_region_async( device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, thread_limit, 0, AsyncInfo); } -int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) { +int32_t synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant @@ -2330,4 +2373,5 @@ } return OFFLOAD_SUCCESS; } -} // extern "C" +} // namespace exported +} // namespace