Diff 318676

openmp/libomptarget/include/dlwrap.h

This file was added.

				//===------- dlwrap.h - Convenience wrapper around dlopen/dlsym -- C++ -*-===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// The openmp plugins depend on extern libraries. These can be used via:
				// - bitcode file statically linked
				// - (relocatable) object file statically linked
				// - static library
				// - dynamic library, linked at build time
				// - dynamic library, loaded at application run time by dlopen
				//
				// This file factors out most boilerplate for using a dlopened library.
				// - Function symbols are generated that are statically linked against
				// - The dlopen can be done implicitly when initializing the library
				// - dlsym lookups are done once and cached
				// - The abstraction is very thin to permit varied uses of the library
				//
				// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to:
				// int foo(char x0, double x1, void* x2) {
				// constexpr size_t index = id();
				// void * dlsymResult = pointer(index);
				// return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
				// }
				//
				// Multiple calls to DLWRAP(symbol_name, arity) with bespoke
				// initialization code that can use the thin abstraction:
				// namespace dlwrap {
				// static size_t size();
				// static const char *symbol(size_t);
				// static void **pointer(size_t);
				// }
				// will compile to an object file that only exposes the symbols that the
				// dynamic library would do, with the right function types.
				//
				//===----------------------------------------------------------------------===//

				#ifndef DLWRAP_H_INCLUDED
				#define DLWRAP_H_INCLUDED

				#include <array>
				#include <cstddef>
				#include <tuple>
				#include <type_traits>

				// Where symbol is a function, these expand to some book keeping and an
				// implementation of that function
				// Public entry points. They only forward to the *_IMPL macros; because the
				// arguments are not operands of # or ## at this level, SYMBOL and ARITY are
				// fully macro-expanded before the stringification / token pasting done further
				// down sees them.
				#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
				#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)

				// For example, given a prototype:
				// int foo(char, double);
				//
				// DLWRAP(foo, 2) expands to:
				//
				// namespace dlwrap {
				// struct foo_Trait : public dlwrap::trait<decltype(&foo)> {
				// using T = dlwrap::trait<decltype(&foo)>;
				// static T::FunctionType get() {
				// constexpr size_t Index = getIndex();
				// void *P = *dlwrap::pointer(Index);
				// return reinterpret_cast<T::FunctionType>(P);
				// }
				// };
				// }
				// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
				//
				// DLWRAP_INTERNAL is similar, except the function it expands to is:
				// static int dlwrap_foo(char x0, double x1) { ... }
				// so that the function pointer call can be wrapped in library-specific code

				// DLWRAP_FINALIZE() expands to definitions of:
				#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
				namespace dlwrap {
				static size_t size();
				static const char *symbol(size_t); // get symbol name in [0, size())
				static void **pointer(size_t); // get pointer to function pointer in [0, size())
				} // namespace dlwrap

				// Implementation details follow.

				namespace dlwrap {

				// Extract return / argument types from address of function symbol
				// Only the function-pointer specialisation below is defined; instantiating
				// trait with any other kind of type is a compile error.
				template <typename F> struct trait;

				// Decomposes the function pointer type R (*)(Ts...) into:
				//   nargs        - number of parameters
				//   ReturnType   - R
				//   FunctionType - the function pointer type itself
				//   arg<i>::type - type of the i-th parameter (zero based)
				template <typename R, typename... Ts> struct trait<R (*)(Ts...)> {
				  static constexpr size_t nargs = sizeof...(Ts);

				  using ReturnType = R;
				  using FunctionType = R (*)(Ts...);

				  template <size_t i> struct arg {
				    using type = typename std::tuple_element<i, std::tuple<Ts...>>::type;
				  };
				};

				namespace type {
				// Book keeping is by type specialization
				//
				// count<S>::N is the number of DLWRAP_INC() expansions on source lines <= S:
				// the generic template copies the value of the previous line, count<0> is the
				// base case, and DLWRAP_INC() specialises the current line as "previous + 1".

				// Generic case: this line registered nothing, so inherit the running total.
				// NOTE(review): this recurses one instantiation per source line back to the
				// nearest specialisation; presumably fine for short files - confirm against
				// the compiler's template instantiation depth limit for long ones.
				template <size_t S> struct count {
				static constexpr size_t N = count<S - 1>::N;
				};

				// Base case terminating the recursion.
				template <> struct count<0> { static constexpr size_t N = 0; };

				// Get a constexpr size_t ID, starts at zero
				// (the value is the running total as of the line DLWRAP_ID() is expanded on)
				#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N)

				// Increment value returned by DLWRAP_ID
				// Specialises count for the current line, so it must be expanded at namespace
				// scope and at most once per source line (a second expansion on the same line
				// would redefine the same specialisation).
				#define DLWRAP_INC() \
				template <> struct dlwrap::type::count<__LINE__> { \
				static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N; \
				}

				// symbol<ID>::call() returns the name registered under index ID. Only the
				// specialisations generated by DLWRAP_SYMBOL are defined.
				template <size_t N> struct symbol;
				// Register the spelling of SYMBOL (stringified with #) under index ID.
				#define DLWRAP_SYMBOL(SYMBOL, ID) \
				template <> struct dlwrap::type::symbol<ID> { \
				static constexpr const char *call() { return #SYMBOL; } \
				}
				} // namespace type

				template <size_t N, size_t... Is>
				constexpr std::array<const char *, N> static getSymbolArray(
				std::index_sequence<Is...>) {
				return {{dlwrap::type::symbol<Is>::call()...}};
				}

				} // namespace dlwrap

				// Select the DLWRAP_INSTANTIATE_<ARITY> macro by token pasting and hand it
				// the trait of SYM_USE's function pointer type. SYM_USE is the declared
				// function whose signature is copied; SYM_DEF is the name of the function
				// that gets defined.
				#define DLWRAP_INSTANTIATE(SYM_USE, SYM_DEF, ARITY) \
				DLWRAP_INSTANTIATE_##ARITY(SYM_USE, SYM_DEF, \
				dlwrap::trait<decltype(&SYM_USE)>)

				// Defines the three accessors declared in namespace dlwrap:
				// - size() returns DLWRAP_ID() as of the line this macro is expanded on, so
				//   the expansion has to come after every DLWRAP / DLWRAP_INTERNAL use
				// - symbol(i) indexes a function-local constexpr table of the registered names
				// - pointer(i) returns the address of slot i in a function-local static array
				//   of void *, which is where the dlsym results are stored
				// Neither symbol(i) nor pointer(i) range-checks i.
				#define DLWRAP_FINALIZE_IMPL() \
				static size_t dlwrap::size() { return DLWRAP_ID(); } \
				static const char *dlwrap::symbol(size_t i) { \
				static constexpr const std::array<const char *, DLWRAP_ID()> \
				dlwrap_symbols = getSymbolArray<DLWRAP_ID()>( \
				std::make_index_sequence<DLWRAP_ID()>()); \
				return dlwrap_symbols[i]; \
				} \
				static void **dlwrap::pointer(size_t i) { \
				static std::array<void *, DLWRAP_ID()> dlwrap_pointers; \
				return &dlwrap_pointers.data()[i]; \
				}

				// Bookkeeping shared by DLWRAP and DLWRAP_INTERNAL for one wrapped function:
				// - DLWRAP_INC() bumps the counter, so DLWRAP_ID() - 1 is SYMBOL's zero-based
				//   index (both expand on the same source line)
				// - DLWRAP_SYMBOL records SYMBOL's name under that index
				// - dlwrap::SYMBOL_Trait::get() reads the cached address for that index and
				//   casts it to SYMBOL's function pointer type
				// dlwrap::pointer() returns the address of the slot (void **), so it is
				// dereferenced once to obtain the stored function address.
				// ARITY is accepted for symmetry with the callers and is not used here.
				#define DLWRAP_COMMON(SYMBOL, ARITY) \
				  DLWRAP_INC(); \
				  DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1); \
				  namespace dlwrap { \
				  struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> { \
				    using T = dlwrap::trait<decltype(&SYMBOL)>; \
				    static T::FunctionType get() { \
				      constexpr size_t Index = DLWRAP_ID() - 1; \
				      void *P = *dlwrap::pointer(Index); \
				      return reinterpret_cast<T::FunctionType>(P); \
				    } \
				  }; \
				  }

				// Register SYMBOL and define a function named SYMBOL itself that forwards to
				// the looked-up pointer.
				#define DLWRAP_IMPL(SYMBOL, ARITY) \
				DLWRAP_COMMON(SYMBOL, ARITY); \
				DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY)

				// Register SYMBOL but define the forwarding function as
				// `static ... dlwrap_<SYMBOL>(...)`, leaving the name SYMBOL free for a
				// hand-written definition that can call dlwrap_<SYMBOL>.
				#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) \
				DLWRAP_COMMON(SYMBOL, ARITY); \
				static DLWRAP_INSTANTIATE(SYMBOL, dlwrap_##SYMBOL, ARITY)

				// DLWRAP_INSTANTIATE_<N>(SYM_USE, SYM_DEF, T) defines a function SYM_DEF that
				// returns T::ReturnType, takes N parameters x0..x<N-1> typed T::arg<i>::type,
				// and passes them unchanged to the function pointer obtained from
				// dlwrap::SYM_USE##_Trait::get(). One macro exists per supported arity;
				// this group covers N = 0 to 6.
				#define DLWRAP_INSTANTIATE_0(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); }
				#define DLWRAP_INSTANTIATE_1(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) { \
				return dlwrap::SYM_USE##_Trait::get()(x0); \
				}
				#define DLWRAP_INSTANTIATE_2(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1); \
				}
				#define DLWRAP_INSTANTIATE_3(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2); \
				}
				#define DLWRAP_INSTANTIATE_4(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3); \
				}
				#define DLWRAP_INSTANTIATE_5(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3, \
				typename T::template arg<4>::type x4) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4); \
				}
				#define DLWRAP_INSTANTIATE_6(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3, \
				typename T::template arg<4>::type x4, \
				typename T::template arg<5>::type x5) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5); \
				}

				// Seven-parameter forwarding definition: SYM_DEF(x0..x6) calls the pointer
				// returned by dlwrap::SYM_USE##_Trait::get() with the same arguments.
				#define DLWRAP_INSTANTIATE_7(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3, \
				typename T::template arg<4>::type x4, \
				typename T::template arg<5>::type x5, \
				typename T::template arg<6>::type x6) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6); \
				}

				// Forwarding definitions for 8 to 11 parameters. Each defines SYM_DEF with
				// parameter types taken from T::arg<i>::type and calls the pointer returned
				// by dlwrap::SYM_USE##_Trait::get(). 11 is the largest arity defined in this
				// header; wrapping a function with more parameters needs a further macro.
				#define DLWRAP_INSTANTIATE_8(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3, \
				typename T::template arg<4>::type x4, \
				typename T::template arg<5>::type x5, \
				typename T::template arg<6>::type x6, \
				typename T::template arg<7>::type x7) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7); \
				}
				#define DLWRAP_INSTANTIATE_9(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3, \
				typename T::template arg<4>::type x4, \
				typename T::template arg<5>::type x5, \
				typename T::template arg<6>::type x6, \
				typename T::template arg<7>::type x7, \
				typename T::template arg<8>::type x8) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \
				}
				#define DLWRAP_INSTANTIATE_10(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3, \
				typename T::template arg<4>::type x4, \
				typename T::template arg<5>::type x5, \
				typename T::template arg<6>::type x6, \
				typename T::template arg<7>::type x7, \
				typename T::template arg<8>::type x8, \
				typename T::template arg<9>::type x9) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
				x9); \
				}
				#define DLWRAP_INSTANTIATE_11(SYM_USE, SYM_DEF, T) \
				T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
				typename T::template arg<1>::type x1, \
				typename T::template arg<2>::type x2, \
				typename T::template arg<3>::type x3, \
				typename T::template arg<4>::type x4, \
				typename T::template arg<5>::type x5, \
				typename T::template arg<6>::type x6, \
				typename T::template arg<7>::type x7, \
				typename T::template arg<8>::type x8, \
				typename T::template arg<9>::type x9, \
				typename T::template arg<10>::type x10) { \
				return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
				x9, x10); \
				}

				#endif

openmp/libomptarget/plugins/cuda/CMakeLists.txt

	Show All 9 Lines
	#			#
	##===----------------------------------------------------------------------===##			##===----------------------------------------------------------------------===##
	if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)\|(ppc64le)\|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))			if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)\|(ppc64le)\|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
	libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")			libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
	return()			return()
	elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)			elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
	libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")			libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
	return()			return()
	elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
	libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
	return()
	elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
	libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
	return()
	endif()			endif()

	libomptarget_say("Building CUDA offloading plugin.")			libomptarget_say("Building CUDA offloading plugin.")

	# Define the suffix for the runtime messaging dumps.			# Define the suffix for the runtime messaging dumps.
	add_definitions(-DTARGET_NAME=CUDA)			add_definitions(-DTARGET_NAME=CUDA)

	include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
	include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})			include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions This probably warrants a variable to force one choice or the other, even if cuda is available on the system. Also, not sure what the default should be when either option is available. JonChesterfield: This probably warrants a variable to force one choice or the other, even if cuda is available…
				jdoerfertUnsubmitted Done Reply Inline Actions +1 for variable, default to use installed CUDA for now. jdoerfert: +1 for variable, default to use installed CUDA for now.

				option(LIBOMPTARGET_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)

				if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND
				AND NOT LIBOMPTARGET_DLOPEN_LIBCUDA)
				libomptarget_say("Building CUDA plugin linked against libcuda")
				include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
	add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)			add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
				set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
				else()
				libomptarget_say("Building CUDA plugin for dlopened libcuda")
				include_directories(dynamic_cuda)
				add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
				set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
				endif()

	# Install plugin under the lib destination folder.			# Install plugin under the lib destination folder.
	install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")			install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")

	target_link_libraries(omptarget.rtl.cuda			target_link_libraries(omptarget.rtl.cuda
	elf_common			elf_common
	MemoryManager			MemoryManager
	${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}			${LIBOMPTARGET_DEP_LIBRARIES}
	${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}			${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
	"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"			"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
	"-Wl,-z,defs")			"-Wl,-z,defs")

	# Report to the parent scope that we are building a plugin for CUDA.			# Report to the parent scope that we are building a plugin for CUDA.
	set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)			set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)

openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h

This file was added.

				//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// The parts of the cuda api that are presently in use by the openmp cuda plugin
				//
				//===----------------------------------------------------------------------===//

				#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
				#define DYNAMIC_CUDA_CUDA_H_INCLUDED
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Contents from D95104, with function pointer replaced with prototype. JonChesterfield: Contents from D95104, with function pointer replaced with prototype.

				#include <cstddef>
				#include <cstdint>

				// Stand-in declarations for the handful of CUDA driver API types used by the
				// plugin, so that it can be compiled without NVIDIA's cuda.h. The calls are
				// forwarded to the real library, so these must stay compatible with the
				// definitions in the driver's own header.
				typedef int CUdevice;
				typedef uintptr_t CUdeviceptr;
				// Opaque handles: only pointers to these structs are ever passed around.
				typedef struct CUmod_st *CUmodule;
				typedef struct CUctx_st *CUcontext;
				typedef struct CUfunc_st *CUfunction;
				typedef struct CUstream_st *CUstream;

				// Only the enumerators referenced by the plugin are listed in each enum below;
				// the numeric values are fixed explicitly rather than left to enum ordering.
				typedef enum cudaError_enum {
				  CUDA_SUCCESS = 0,
				  CUDA_ERROR_INVALID_VALUE = 1,
				} CUresult;

				typedef enum CUstream_flags_enum {
				  CU_STREAM_DEFAULT = 0x0,
				  CU_STREAM_NON_BLOCKING = 0x1,
				} CUstream_flags;

				typedef enum CUdevice_attribute_enum {
				  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
				  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
				  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
				} CUdevice_attribute;

				typedef enum CUfunction_attribute_enum {
				  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
				} CUfunction_attribute;

				typedef enum CUctx_flags_enum {
				  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
				  CU_CTX_SCHED_MASK = 0x07,
				} CUctx_flags;

				// Redirect the unversioned API names to their _v2 spellings. Every use of
				// one of these names after this point, including the prototypes below,
				// refers to the _v2 name.
				#define cuMemFree cuMemFree_v2
				#define cuMemAlloc cuMemAlloc_v2
				#define cuMemcpyDtoH cuMemcpyDtoH_v2
				#define cuMemcpyHtoD cuMemcpyHtoD_v2
				#define cuStreamDestroy cuStreamDestroy_v2
				#define cuModuleGetGlobal cuModuleGetGlobal_v2
				#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
				#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
				#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
				#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
				#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2

				// Function prototypes. Results are written through pointer parameters, so the
				// pointer levels below have to match the driver API exactly.
				CUresult cuCtxGetDevice(CUdevice *);
				CUresult cuDeviceGet(CUdevice *, int);
				CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
				CUresult cuDeviceGetCount(int *);
				CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);

				CUresult cuGetErrorString(CUresult, const char **);
				CUresult cuInit(unsigned);
				CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
				                        unsigned, unsigned, unsigned, CUstream, void **,
				                        void **);

				CUresult cuMemAlloc(CUdeviceptr *, size_t);
				CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);

				CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
				CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
				CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
				CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);

				CUresult cuMemFree(CUdeviceptr);
				CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
				CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);

				CUresult cuModuleUnload(CUmodule);
				CUresult cuStreamCreate(CUstream *, unsigned);
				CUresult cuStreamDestroy(CUstream);
				CUresult cuStreamSynchronize(CUstream);
				CUresult cuCtxSetCurrent(CUcontext);
				CUresult cuDevicePrimaryCtxRelease(CUdevice);
				CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
				CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
				CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
				CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
				                            void **);

				CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
				CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
				CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
				                           size_t, CUstream);

				jdoerfertUnsubmitted Not Done Reply Inline Actions @tra suggested to name the declarations as they are actually named, e.g., _v2. I don't have strong feelings either way, this is kinda messy and weird no matter how we name them. jdoerfert: @tra suggested to name the declarations as they are actually named, e.g., _v2. I don't have…
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Don't mind either way, will rebase this if RTL.cpp is changed to use the v2 functions. Change seems orthogonal. JonChesterfield: Don't mind either way, will rebase this if RTL.cpp is changed to use the v2 functions. Change…
				jdoerfertUnsubmitted Not Done Reply Inline Actions I think the idea was to define it with the right name, thus _v2, but use it with the fake one, so not _v2. jdoerfert: I think the idea was to define it with the right name, thus _v2, but use it with the fake one…
				#endif

openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp

This file was added.

				//===--- cuda/dynamic_cuda/cuda.cpp ------------------------------ C++ -*-===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// Implement subset of cuda api by calling into cuda library via dlopen
				// Does the dlopen/dlsym calls as part of the call to cuInit
				//
				//===----------------------------------------------------------------------===//

				#include "cuda.h"
				#include "Debug.h"
				#include "dlwrap.h"

				#include <dlfcn.h>

				// One entry per function declared in cuda.h. The second argument selects the
				// DLWRAP_INSTANTIATE_<N> macro and therefore has to equal the number of
				// parameters in the corresponding prototype.
				//
				// cuInit uses DLWRAP_INTERNAL, which names the forwarding stub dlwrap_cuInit;
				// the cuInit defined at the end of this file calls it after loading the
				// library.
				DLWRAP_INTERNAL(cuInit, 1);

				DLWRAP(cuCtxGetDevice, 1);
				DLWRAP(cuDeviceGet, 2);
				DLWRAP(cuDeviceGetAttribute, 3);
				DLWRAP(cuDeviceGetCount, 1);
				DLWRAP(cuFuncGetAttribute, 3);

				DLWRAP(cuGetErrorString, 2);
				DLWRAP(cuLaunchKernel, 11);

				DLWRAP(cuMemAlloc, 2);
				DLWRAP(cuMemcpyDtoDAsync, 4);

				DLWRAP(cuMemcpyDtoH, 3);
				DLWRAP(cuMemcpyDtoHAsync, 4);
				DLWRAP(cuMemcpyHtoD, 3);
				DLWRAP(cuMemcpyHtoDAsync, 4);

				DLWRAP(cuMemFree, 1);
				DLWRAP(cuModuleGetFunction, 3);
				DLWRAP(cuModuleGetGlobal, 4);

				DLWRAP(cuModuleUnload, 1);
				DLWRAP(cuStreamCreate, 2);
				DLWRAP(cuStreamDestroy, 1);
				DLWRAP(cuStreamSynchronize, 1);
				DLWRAP(cuCtxSetCurrent, 1);
				DLWRAP(cuDevicePrimaryCtxRelease, 1);
				DLWRAP(cuDevicePrimaryCtxGetState, 3);
				DLWRAP(cuDevicePrimaryCtxSetFlags, 2);
				DLWRAP(cuDevicePrimaryCtxRetain, 2);
				DLWRAP(cuModuleLoadDataEx, 5);

				DLWRAP(cuDeviceCanAccessPeer, 3);
				DLWRAP(cuCtxEnablePeerAccess, 2);
				DLWRAP(cuMemcpyPeerAsync, 6);

				// Has to follow the last DLWRAP above: the table sizes are taken from the
				// symbol count at this point.
				DLWRAP_FINALIZE();

				#ifndef DYNAMIC_CUDA_PATH
				#define DYNAMIC_CUDA_PATH "libcuda.so"
				#endif

				#define TARGET_NAME CUDA
				jdoerfertUnsubmitted Done Reply Inline Actions Should we make this a global, or a CMAKE variable? jdoerfert: Should we make this a global, or a CMAKE variable?
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Some user control over this is probably good, e.g. people might want to load a specific libcuda with several installed, and passing a path would allow that. Simple approach is probably a macro that can be passed in from cmake to override the default JonChesterfield: Some user control over this is probably good, e.g. people might want to load a specific libcuda…
				#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

				// Load the library named by DYNAMIC_CUDA_PATH and resolve every symbol
				// registered with dlwrap, storing each address in its dlwrap::pointer slot.
				// Failures are reported through DP.
				// NOTE(review): DynlibHandle is neither stored nor passed to dlclose on any
				// path, including the early return when a symbol is missing - confirm that
				// keeping the library loaded in that case is intended.
				static bool checkForCUDA() {
				// return true if dlopen succeeded and all functions found

				const char *CudaLib = DYNAMIC_CUDA_PATH;
				void *DynlibHandle = dlopen(CudaLib, RTLD_NOW);
				if (!DynlibHandle) {
				DP("Unable to load library '%s': %s!\n", CudaLib, dlerror());
				return false;
				}

				// Resolve symbols in registration order; slots for symbols before the first
				// missing one have already been written when the function returns false.
				for (size_t I = 0; I < dlwrap::size(); I++) {
				const char *Sym = dlwrap::symbol(I);

				void *P = dlsym(DynlibHandle, Sym);
				if (P == nullptr) {
				DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib);
				return false;
				}

				*dlwrap::pointer(I) = P;
				}
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions cuInit is called by deviceRTL's global constructor, so might be fine as-is. I'm not sure we're destroying the cuda dynamic library at present JonChesterfield: cuInit is called by deviceRTL's global constructor, so might be fine as-is. I'm not sure we're…

				return true;
				}

				jdoerfertUnsubmitted Not Done Reply Inline Actions I would feel better if we 1) use some mechanism to avoid racing on the `checkForCUDA` call, e.g., call once and a boolean global as in D95104. 2) Don't overload cuInit but instead call it from the plugin constructor, or make it a constructor. If 2) has inherent benefits we cannot get any other way, I guess we can keep it. I'm worried it might clash with a cuda otherwise linked into the application or confuse people, or something else bad. jdoerfert: I would feel better if we 1) use some mechanism to avoid racing on the `checkForCUDA` call, e.
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions This is called once, from a global constructor, so is correct as-is. I'll add the locking now though so that can't cause trouble later. The cuInit and other symbols here aren't visible outside of the plugin (there's an exports list applied when linking it). Otherwise yeah, that would be bad. Doing the extra setup in this cuInit means the RTL.cpp doesn't know whether it's linked against cuda or against this shim. This way it is an implementation of cuda, with the implementation detail that it forwards everything to a .so with the right name. I think that's much better than passing in a C macro or adding a weak function. It might be a surprising solution to the problem. JonChesterfield: This is called once, from a global constructor, so is correct as-is. I'll add the locking now…
				// Plugin-facing cuInit: load libcuda and resolve all wrapped symbols, then
				// hand over to the library's own cuInit through the dlwrap_cuInit stub.
				// Reports CUDA_ERROR_INVALID_VALUE when the library or a symbol is missing.
				CUresult cuInit(unsigned X) {
				  // Note: the cuda rtl.cpp calls this exactly once, from a global
				  // constructor, so repeated or concurrent calls are not handled here.
				  const bool Loaded = checkForCUDA();
				  return Loaded ? dlwrap_cuInit(X) : CUDA_ERROR_INVALID_VALUE;
				}

This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget] Build cuda plugin without cuda installed locally
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 318676

openmp/libomptarget/include/dlwrap.h

openmp/libomptarget/plugins/cuda/CMakeLists.txt

openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h

openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget] Build cuda plugin without cuda installed locallyClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 318676

openmp/libomptarget/include/dlwrap.h

openmp/libomptarget/plugins/cuda/CMakeLists.txt

openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h

openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp

[libomptarget] Build cuda plugin without cuda installed locally
ClosedPublic