Diff 256787

openmp/libomptarget/include/omptarget.h

	Show All 12 Lines

	#ifndef _OMPTARGET_H_			#ifndef _OMPTARGET_H_
	#define _OMPTARGET_H_			#define _OMPTARGET_H_

	#include <stdint.h>			#include <stdint.h>
	#include <stddef.h>			#include <stddef.h>

	#define OFFLOAD_SUCCESS (0)			#define OFFLOAD_SUCCESS (0)
	#define OFFLOAD_FAIL (~0)			#define OFFLOAD_FAIL (~0U)
				jdoerfertUnsubmitted Done Reply Inline Actions Split this off, LGTM on this part jdoerfert: Split this off, LGTM on this part

	#define OFFLOAD_DEVICE_DEFAULT -1			#define OFFLOAD_DEVICE_DEFAULT -1
	#define HOST_DEVICE -10			#define HOST_DEVICE -10

	/// Data attributes for each data reference used in an OpenMP target region.			/// Data attributes for each data reference used in an OpenMP target region.
	enum tgt_map_type {			enum tgt_map_type {
	// No flags			// No flags
	OMP_TGT_MAPTYPE_NONE = 0x000,			OMP_TGT_MAPTYPE_NONE = 0x000,
	▲ Show 20 Lines • Show All 232 Lines • Show Last 20 Lines

openmp/libomptarget/plugins/cuda/src/rtl.cpp

Show All 25 Lines
#define TARGET_NAME CUDA		#define TARGET_NAME CUDA
#endif		#endif

#ifdef OMPTARGET_DEBUG		#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;		static int DebugLevel = 0;

#define GETNAME2(name) #name		#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)		#define GETNAME(name) GETNAME2(name)
#define DP(...) \		#define DP(...) \
do { \		do { \
if (DebugLevel > 0) { \		if (DebugLevel > 0) { \
DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \		DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
} \		} \
} while (false)		} while (false)

// Utility for retrieving and printing CUDA error string.		// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err) \		#define CUDA_ERR_STRING(err) \
do { \		do { \
if (DebugLevel > 0) { \		if (DebugLevel > 0) { \
const char *errStr; \		const char *errStr; \
cuGetErrorString(err, &errStr); \		cuGetErrorString(err, &errStr); \
DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \		DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", \
		errStr); \
} \		} \
} while (false)		} while (false)
#else // OMPTARGET_DEBUG		#else // OMPTARGET_DEBUG
#define DP(...) {}		#define DP(...) \
#define CUDA_ERR_STRING(err) {}		{}
		#define CUDA_ERR_STRING(err) \
		{}
#endif // OMPTARGET_DEBUG		#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"		#include "../../common/elf_common.c"

		namespace {
/// Keep entries table per device.		/// Keep entries table per device.
struct FuncOrGblEntryTy {		struct FuncOrGblEntryTy {
__tgt_target_table Table;		__tgt_target_table Table;
std::vector<__tgt_offload_entry> Entries;		std::vector<__tgt_offload_entry> Entries;
};		};

enum ExecutionModeType {		enum ExecutionModeType {
SPMD, // constructors, destructors,		SPMD, // constructors, destructors, combined constructs (`teams distribute
// combined constructs (`teams distribute parallel for [simd]`)		// parallel for [simd]`)
GENERIC, // everything else		GENERIC, // everything else
NONE		NONE
};		};
		jdoerfertUnsubmitted Done Reply Inline Actions I imagine the above is all clang-formatted; if so, split it off, LGTM on that part. (Keep the namespace as part of the other patch.) jdoerfert: I imagine the above is all clang-formatted; if so, split it off, LGTM on that part. (Keep the…
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions Okay, got your point. tianshilei1992: Okay, got your point.

/// Use a single entity to encode a kernel and a set of flags.		/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {		struct KernelTy {
CUfunction Func;		CUfunction Func;

// execution mode of kernel		// execution mode of kernel
// 0 - SPMD mode (without master warp)		// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)		// 1 - Generic mode (with master warp)
int8_t ExecutionMode;		int8_t ExecutionMode;

KernelTy(CUfunction _Func, int8_t _ExecutionMode)		KernelTy(CUfunction _Func, int8_t _ExecutionMode)
: Func(_Func), ExecutionMode(_ExecutionMode) {}		: Func(_Func), ExecutionMode(_ExecutionMode) {}
};		};

		/// List that contains all the kernels.
		/// FIXME: we may need this to be per device and per library.
		std::list<KernelTy> KernelsList;

		jdoerfertUnsubmitted Done Reply Inline Actions I don't know where this should go right now but a global list sounds wrong. Follow up. jdoerfert: I don't know where this should go right now but a global list sounds wrong. Follow up.
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions This is what it is originally. Basically I didn't change any logic in this patch. From its usage, looks like it is only for storage. Items will only be `emplace_back` to it and then get the pointer. That's it. tianshilei1992: This is what it is originally. Basically I didn't change any logic in this patch. From its…
		jdoerfertUnsubmitted Done Reply Inline Actions I figured it was not you, just that it seems weird. I should have made that clearer. jdoerfert: I figured it was not you, just that it seems weird. I should have made that clearer.
/// Device environment data		/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header		/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.		/// file later.
struct omptarget_device_environmentTy {		struct OMPTargetDeviceEnvironmentTy {
int32_t debug_level;		int DebugLevel = 0;
};		};
		jdoerfertUnsubmitted Not Done Reply Inline Actions The above is defined in a different header. We should not redefine it here but include the appropriate header. Follow up. jdoerfert: The above is defined in a different header. We should not redefine it here but include the…

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {		bool checkResult(CUresult Err, const char *ErrMsg) {
if (Err == CUDA_SUCCESS)		if (Err == CUDA_SUCCESS)
return true;		return true;

DP(ErrMsg);		DP(ErrMsg);
CUDA_ERR_STRING(Err);		CUDA_ERR_STRING(Err);
return false;		return false;
}		}
} // namespace
		// Structure contains per-device data
		struct DeviceDataTy {
		std::list<FuncOrGblEntryTy> FuncGblEntries;
		CUcontext Context = nullptr;
		// Device properties
		int ThreadsPerBlock = 0;
		int BlocksPerGrid = 0;
		int WarpSize = 0;
		// OpenMP properties
		int NumTeams = 0;
		int NumThreads = 0;
		};
		jdoerfertUnsubmitted Not Done Reply Inline Actions 1) Can we replace the list with vectors please. We do not add or delete elements once created. 2) Can we make it a template class parametric in the Context type, e.g., `struct NVIDIADeviceDataTy : public DeviceDataTy<CUcontext> {}`. 3) These should all be unsigned types, right? We also should add explicit documentation for each. I mean `NumThreads` is some maximum I guess. 4) These types need to go into a common header (see also 2)). Follow up though. jdoerfert: 1) Can we replace the list with vectors please. We do not add or delete elements once created.
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions Can we replace the list with vectors please. We do not add or delete elements once created. Yes, we can do that. Can we make it a template class parametric in the Context type. e.g., `struct NVIDIADeviceDataTy : public DeviceDataTy<CUcontext> {}`. I don't think so. Now we only know that CUDA has this context thing. We have no idea whether other platforms do. Other things like warp as well. These should all be unsigned types, right? We also should add explicit documentation for each. I mean NumThreads is some maximum I guess. Yeah, they should be. Will change them correspondingly. These types need to go into a common header (see also 2)). Follow up though. We should only put the most common part into a common header, but it seems that we only have three known common members: `FuncGblEntries`, `NumTeams`, and `NumThreads`. tianshilei1992: 1. Can we replace the list with vectors please. We do not add or delete elements once created.
		jdoerfertUnsubmitted Not Done Reply Inline Actions If we need a "common/gpu" header we can do that as well. AMD will at least look similar enough to reuse the same code. They have some "context" even if it is a class we define in the AMD headers. You can even go as far and make `common/Device.h` with the entries you mentioned and then `common/GPUDevice.h` with the template class `template<typename CtxTy> GPUDeviceDataTy : public DeviceDataTy`. jdoerfert: If we need a "common/gpu" header we can do that as well. AMD will at least look similar enough…
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions This part is worth a new patch. Let's do it in next step. tianshilei1992: This part is worth a new patch. Let's do it in next step.

class StreamManagerTy {		class StreamManagerTy {
int NumberOfDevices;		int NumberOfDevices;
		// The initial size of stream pool
		int EnvNumInitialStreams;
// Per-device stream mutex		// Per-device stream mutex
std::vector<std::unique_ptr<std::mutex>> StreamMtx;		std::vector<std::unique_ptr<std::mutex>> StreamMtx;
// Per-device stream Id indicates the next available stream in the pool		// Per-device stream Id indicates the next available stream in the pool
std::vector<int> NextStreamId;		std::vector<int> NextStreamId;
// Per-device stream pool		// Per-device stream pool
std::vector<std::vector<CUstream>> StreamPool;		std::vector<std::vector<CUstream>> StreamPool;
// Pointer to per-device context		// Reference to per-device data
std::vector<CUcontext> &ContextsPtr;		std::vector<DeviceDataTy> &DeviceData;

// If there is no CUstream left in the pool, we will resize the pool to		// If there is no CUstream left in the pool, we will resize the pool to
// allocate more CUstream. This function should be called with device mutex,		// allocate more CUstream. This function should be called with device mutex,
// and we do not resize to smaller one.		// and we do not resize to smaller one.
void resizeStreamPool(const int DeviceId, const size_t NewSize) {		void resizeStreamPool(const int DeviceId, const size_t NewSize) {
std::vector<CUstream> &Pool = StreamPool[DeviceId];		std::vector<CUstream> &Pool = StreamPool[DeviceId];
const size_t CurrentSize = Pool.size();		const size_t CurrentSize = Pool.size();
assert(NewSize > CurrentSize && "new size is not larger than current size");		assert(NewSize > CurrentSize && "new size is not larger than current size");

Pool.resize(NewSize, nullptr);		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
CUresult err = cuCtxSetCurrent(ContextsPtr[DeviceId]);
if (!checkResult(err, "Error when setting current CUDA context\n")) {
// We will return if cannot switch to the right context in case of		// We will return if cannot switch to the right context in case of
// creating bunch of streams that are not corresponding to the right		// creating bunch of streams that are not corresponding to the right
// device. The offloading will fail later because selected CUstream is		// device. The offloading will fail later because selected CUstream is
// nullptr.		// nullptr.
return;		return;
}		}

		Pool.resize(NewSize, nullptr);

for (size_t I = CurrentSize; I < NewSize; ++I) {		for (size_t I = CurrentSize; I < NewSize; ++I) {
err = cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING);		checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
checkResult(err,		"Error returned from cuStreamCreate\n");
"Error when creating CUDA stream to resize stream pool\n");
}		}
}		}

public:		public:
StreamManagerTy(const int NumberOfDevices, std::vector<CUcontext> &CtxPtr)		StreamManagerTy(const int NumberOfDevices,
: NumberOfDevices(NumberOfDevices), ContextsPtr(CtxPtr) {		std::vector<DeviceDataTy> &DeviceData)
		: NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
		DeviceData(DeviceData) {
StreamPool.resize(NumberOfDevices);		StreamPool.resize(NumberOfDevices);
NextStreamId.resize(NumberOfDevices);		NextStreamId.resize(NumberOfDevices);
StreamMtx.resize(NumberOfDevices);		StreamMtx.resize(NumberOfDevices);

// Initially let's create 32 streams for each device		char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS");
int EnvNumInitialStreams = 32;		if (EnvStr)
char *envStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS");		EnvNumInitialStreams = std::stoi(EnvStr);
if (envStr)
EnvNumInitialStreams = std::stoi(envStr);

// Initialize the stream pool for each device		// Reserve the size for stream pool of each device
for (std::vector<CUstream> &S : StreamPool)		for (std::vector<CUstream> &S : StreamPool)
S.resize(EnvNumInitialStreams);		S.reserve(EnvNumInitialStreams);
		jdoerfertUnsubmitted Done Reply Inline Actions These three lines can be removed if you move the `getenv` stuff into init and call resizeStreamPool there. jdoerfert: These three lines can be removed if you move the `getenv` stuff into init and call…
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions Correct. Actually I would leave the getenv stuff here because the env has already been determined when the library is initialized. I mean, when the global variable is constructed. tianshilei1992: Correct. Actually I would leave the getenv stuff here because the env has already been…
		jdoerfertUnsubmitted Not Done Reply Inline Actions OK jdoerfert: OK

// Initialize the next stream id		// Initialize the next stream id
std::fill(NextStreamId.begin(), NextStreamId.end(), 0);		std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

// Initialize stream mutex		// Initialize stream mutex
for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)		for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
Ptr = std::make_unique<std::mutex>();		Ptr = std::make_unique<std::mutex>();
}		}

~StreamManagerTy() {		~StreamManagerTy() {
// Destroy streams		// Destroy streams
for (int I = 0; I < NumberOfDevices; ++I) {		for (int I = 0; I < NumberOfDevices; ++I) {
CUresult err = cuCtxSetCurrent(ContextsPtr[I]);		checkResult(cuCtxSetCurrent(DeviceData[I].Context),
checkResult(err, "Error when setting current CUDA context\n");		"Error returned from cuCtxSetCurrent\n");

for (CUstream &S : StreamPool[I]) {		for (CUstream &S : StreamPool[I]) {
if (!S)		if (S)
continue;		checkResult(cuStreamDestroy(S),
err = cuStreamDestroy(S);		"Error returned from cuStreamDestroy\n");
checkResult(err, "Error when destroying CUDA stream\n");
}		}
}		}
}		}

// Get a CUstream from pool. Per-device next stream id always points to the		// Get a CUstream from pool. Per-device next stream id always points to the
// next available CUstream. That means, CUstreams [0, id-1] have been		// next available CUstream. That means, CUstreams [0, id-1] have been
// assigned, and [id,] are still available. If there is no CUstream left, we		// assigned, and [id,] are still available. If there is no CUstream left, we
// will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,		// will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
// the id will increase one.		// the id will increase one.
// xxxxxs+++++++++		// xxxxxs+++++++++
// ^		// ^
// id		// id
// After assignment, the pool becomes the following and s is assigned.		// After assignment, the pool becomes the following and s is assigned.
// xxxxxs+++++++++		// xxxxxs+++++++++
// ^		// ^
// id		// id
CUstream getStream(const int DeviceId) {		CUstream getStream(const int DeviceId) {
assert(DeviceId >= 0 &&
static_cast<size_t>(DeviceId) < NextStreamId.size() &&
"Unexpected device id");

const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);		const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
int &Id = NextStreamId[DeviceId];		int &Id = NextStreamId[DeviceId];
// No CUstream left in the pool, we need to request from CUDA RT		// No CUstream left in the pool, we need to request from CUDA RT
if (Id == StreamPool[DeviceId].size()) {		if (Id == StreamPool[DeviceId].size()) {
// By default we double the stream pool every time		// By default we double the stream pool every time
resizeStreamPool(DeviceId, Id * 2);		resizeStreamPool(DeviceId, Id * 2);
}		}
return StreamPool[DeviceId][Id++];		return StreamPool[DeviceId][Id++];
Show All 9 Lines	public:
// xxax+a+++++		// xxax+a+++++
// ^		// ^
// id		// id
// However, it doesn't matter, because they're always on the two sides of		// However, it doesn't matter, because they're always on the two sides of
// id. The left one will in the end be overwritten by another CUstream.		// id. The left one will in the end be overwritten by another CUstream.
// Therefore, after several execution, the order of pool might be different		// Therefore, after several execution, the order of pool might be different
// from its initial state.		// from its initial state.
void returnStream(const int DeviceId, CUstream Stream) {		void returnStream(const int DeviceId, CUstream Stream) {
assert(DeviceId >= 0 &&
static_cast<size_t>(DeviceId) < NextStreamId.size() &&
"Unexpected device id");

const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);		const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
int &Id = NextStreamId[DeviceId];		int &Id = NextStreamId[DeviceId];
assert(Id > 0 && "Wrong stream ID");		assert(Id > 0 && "Wrong stream ID");
StreamPool[DeviceId][--Id] = Stream;		StreamPool[DeviceId][--Id] = Stream;
}		}

void initializeDevice(int DeviceId) {		bool initializeDeviceStreamPool(const int DeviceId) {
// This function should be called after setting right context		assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
for (CUstream &Stream : StreamPool[DeviceId]) {
CUresult Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
checkResult(Err, "Error when creating CUDA stream\n");
}
}
};

/// Class containing all the device information.		resizeStreamPool(DeviceId, EnvNumInitialStreams);
class RTLDeviceInfoTy {
std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
std::shared_ptr<StreamManagerTy> StreamManager;

public:		// Check the size of stream pool
int NumberOfDevices;		if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
std::vector<CUmodule> Modules;		return false;
std::vector<CUcontext> Contexts;

// Device properties		// Check whether each stream is valid
std::vector<int> ThreadsPerBlock;		for (CUstream &S : StreamPool[DeviceId])
std::vector<int> BlocksPerGrid;		if (!S)
std::vector<int> WarpSize;		return false;

// OpenMP properties		return true;
std::vector<int> NumTeams;		}
		jdoerfertUnsubmitted Not Done Reply Inline Actions We check already in `resizeStreamPool`. Just return if it worked or not there. jdoerfert: We check already in `resizeStreamPool`. Just return if it worked or not there.
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions Actually the failure in `resizeStreamPool` will not impact its return. Those checks only print out some messages; they do not abort the program. Actually, I'm really not sure whether we need to abort the whole program whenever one CUDA operation returns an error. From my point of view, I think we should. WDYT? tianshilei1992: Actually the failure in `resizeStreamPool` will not impact its return. Those checks only…
std::vector<int> NumThreads;		};
		jdoerfertUnsubmitted Not Done Reply Inline Actions Once we have the common header, the StreamManager interface has to go there as well. We need generic functions for stream create/destroy, and setting the context can be done via a DeviceDataTy method. jdoerfert: Once we have the common header, the StreamManager interface has to go there as well. We need…

// OpenMP Environment properties		class DeviceRTLTy {
		int NumberOfDevices;
		// OpenMP environment properties
int EnvNumTeams;		int EnvNumTeams;
int EnvTeamLimit;		int EnvTeamLimit;
		// OpenMP requires flags
// OpenMP Requires Flags
int64_t RequiresFlags;		int64_t RequiresFlags;

// static int EnvNumThreads;		static constexpr const int HardTeamLimit = 1U << 16U; // 64k
static const int HardTeamLimit = 1 << 16; // 64k		static constexpr const int HardThreadLimit = 1024;
static const int HardThreadLimit = 1024;		static constexpr const int DefaultNumTeams = 128;
static const int DefaultNumTeams = 128;		static constexpr const int DefaultNumThreads = 128;
static const int DefaultNumThreads = 128;

std::shared_ptr<StreamManagerTy> getStreamManager() { return StreamManager; }

CUstream getStream(const int DeviceId) {
return StreamManager->getStream(DeviceId);
}

void returnStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) {		std::unique_ptr<StreamManagerTy> StreamManager;
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");		std::vector<DeviceDataTy> DeviceData;
assert(AsyncInfoPtr->Queue && "AsyncInfoPtr->Queue is nullptr");		std::vector<CUmodule> Modules;

StreamManager->returnStream(
DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
AsyncInfoPtr->Queue = nullptr;
}

// Record entry point associated with device		// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {		void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
assert(device_id < (int32_t)FuncGblEntries.size() &&		FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

E.Entries.push_back(entry);		E.Entries.push_back(entry);
}		}

// Return true if the entry is associated with device		// Return true if the entry is associated with device
bool findOffloadEntry(int32_t device_id, void *addr) {		bool findOffloadEntry(const int DeviceId, const void *Addr) const {
assert(device_id < (int32_t)FuncGblEntries.size() &&		const FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

for (auto &it : E.Entries) {		for (const __tgt_offload_entry &it : E.Entries)
if (it.addr == addr)		if (it.addr == Addr)
return true;		return true;
}

		jdoerfertUnsubmitted Done Reply Inline Actions Nit: `It` or better `Entry` jdoerfert: Nit: `It` or better `Entry`
return false;		return false;
}		}

// Return the pointer to the target entries table		// Return the pointer to the target entries table
__tgt_target_table *getOffloadEntriesTable(int32_t device_id) {		__tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
assert(device_id < (int32_t)FuncGblEntries.size() &&		FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

int32_t size = E.Entries.size();

// Table is empty
if (!size)
return 0;

__tgt_offload_entry *begin = &E.Entries[0];		if (E.Entries.empty())
__tgt_offload_entry *end = &E.Entries[size - 1];		return nullptr;

// Update table info according to the entries and return the pointer		// Update table info according to the entries and return the pointer
E.Table.EntriesBegin = begin;		E.Table.EntriesBegin = E.Entries.data();
E.Table.EntriesEnd = ++end;		E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
		jdoerfertUnsubmitted Done Reply Inline Actions (Off topic: Why do we have a table member that needs to be updated when we use it, that seems wrong.) jdoerfert: (Off topic: Why do we have a table member that needs to be updated when we use it, that seems…
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions Well, that is what it is. We could probably think how to improve it later. tianshilei1992: Well, that is what it is. We could probably think how to improve it later.

return &E.Table;		return &E.Table;
}		}

// Clear entries table for a device		// Clear entries table for a device
void clearOffloadEntriesTable(int32_t device_id) {		void clearOffloadEntriesTable(const int DeviceId) {
assert(device_id < (int32_t)FuncGblEntries.size() &&		DeviceData[DeviceId].FuncGblEntries.emplace_back();
"Unexpected device id!");		FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
FuncGblEntries[device_id].emplace_back();
FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
E.Entries.clear();		E.Entries.clear();
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;		E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
}		}

RTLDeviceInfoTy() {		CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
#ifdef OMPTARGET_DEBUG		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
DebugLevel = std::stoi(envStr);		if (!AsyncInfoPtr->Queue)
		AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

		return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
}		}

		public:
		// This class should not be copied
		DeviceRTLTy(const DeviceRTLTy &) = delete;
		DeviceRTLTy(DeviceRTLTy &&) = delete;

		DeviceRTLTy()
		: NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
		RequiresFlags(OMP_REQ_UNDEFINED) {
		#ifdef OMPTARGET_DEBUG
		if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
		DebugLevel = std::stoi(EnvStr);
#endif // OMPTARGET_DEBUG		#endif // OMPTARGET_DEBUG

DP("Start initializing CUDA\n");		DP("Start initializing CUDA\n");

CUresult err = cuInit(0);		CUresult Err = cuInit(0);
if (err != CUDA_SUCCESS) {		if (!checkResult(Err, "Error returned from cuInit\n")) {
DP("Error when initializing CUDA\n");
CUDA_ERR_STRING(err);
return;		return;
}		}

NumberOfDevices = 0;		Err = cuDeviceGetCount(&NumberOfDevices);
		if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) {
err = cuDeviceGetCount(&NumberOfDevices);
if (err != CUDA_SUCCESS) {
DP("Error when getting CUDA device count\n");
CUDA_ERR_STRING(err);
return;		return;
}		}
		jdoerfertUnsubmitted Done Reply Inline Actions No braces around return also above. jdoerfert: No braces around return also above.

if (NumberOfDevices == 0) {		if (NumberOfDevices == 0) {
DP("There are no devices supporting CUDA.\n");		DP("There are no devices supporting CUDA.\n");
return;		return;
}		}

FuncGblEntries.resize(NumberOfDevices);		DeviceData.resize(NumberOfDevices);
		jdoerfertUnsubmitted Done Reply Inline Actions Nice jdoerfert: Nice
Contexts.resize(NumberOfDevices);
ThreadsPerBlock.resize(NumberOfDevices);
BlocksPerGrid.resize(NumberOfDevices);
WarpSize.resize(NumberOfDevices);
NumTeams.resize(NumberOfDevices);
NumThreads.resize(NumberOfDevices);

// Get environment variables regarding teams		// Get environment variables regarding teams
char *envStr = getenv("OMP_TEAM_LIMIT");		const char *EnvStr = getenv("OMP_TEAM_LIMIT");
if (envStr) {		if (EnvStr) {
// OMP_TEAM_LIMIT has been set		// OMP_TEAM_LIMIT has been set
EnvTeamLimit = std::stoi(envStr);		EnvTeamLimit = std::stoi(EnvStr);
DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);		DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
} else {
EnvTeamLimit = -1;
}		}
envStr = getenv("OMP_NUM_TEAMS");		EnvStr = getenv("OMP_NUM_TEAMS");
if (envStr) {		if (EnvStr) {
// OMP_NUM_TEAMS has been set		// OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(envStr);		EnvNumTeams = std::stoi(EnvStr);
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);		DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
} else {
EnvNumTeams = -1;
}		}
		jdoerfertUnsubmitted Done Reply Inline Actions Style: I'd move the getenv calls into the condition `if (const ... = getenv)` jdoerfert: Style: I'd move the getenv calls into the condition `if (const ... = getenv) `
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions But here `EnvStr` is used twice... Would you like to have two local variables? :-) tianshilei1992: But here `EnvStr` is used twice... Would you like to have two local variables? :-)
		jdoerfertUnsubmitted Done Reply Inline Actions If you do what I described the lifetime is limited to the conditional. There will be two variables, distinct but with the same name. That is for different reasons preferable, e.g., the variable is not available outside the conditional and it is not reused. Overall it is less complex. jdoerfert: If you do what I described the lifetime is limited to the conditional. There will be two…

StreamManager =		StreamManager =
std::make_shared<StreamManagerTy>(NumberOfDevices, Contexts);		std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

// Default state.
RequiresFlags = OMP_REQ_UNDEFINED;
}		}

~RTLDeviceInfoTy() {		~DeviceRTLTy() {
// First destruct stream manager in case of Contexts is destructed before it		// First destruct stream manager in case of Contexts is destructed before it
StreamManager = nullptr;		StreamManager = nullptr;

// Close modules		for (CUmodule &M : Modules)
for (auto &module : Modules)		// Close module
if (module) {		if (M)
CUresult err = cuModuleUnload(module);		checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
if (err != CUDA_SUCCESS) {
DP("Error when unloading CUDA module\n");
CUDA_ERR_STRING(err);
}
}

// Destroy contexts		for (DeviceDataTy &D : DeviceData) {
for (auto &ctx : Contexts)		// Destroy context
if (ctx) {		if (D.Context)
CUresult err = cuCtxDestroy(ctx);		checkResult(cuCtxDestroy(D.Context),
if (err != CUDA_SUCCESS) {		"Error returned from cuCtxDestroy\n");
DP("Error when destroying CUDA context\n");
CUDA_ERR_STRING(err);
}
}		}
}		}
};

static RTLDeviceInfoTy DeviceInfo;

namespace {		// Check whether a given DeviceId is valid
CUstream getStream(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr) {		bool isValidDeviceId(const int DeviceId) const {
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");		return DeviceId >= 0 && DeviceId < NumberOfDevices;

if (!AsyncInfoPtr->Queue)
AsyncInfoPtr->Queue = DeviceInfo.getStream(DeviceId);

return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
}		}

int32_t dataRetrieve(int32_t DeviceId, void HstPtr, void TgtPtr, int64_t Size,		bool getNumOfDevices() const { return NumberOfDevices; }
__tgt_async_info *AsyncInfoPtr) {
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}

CUstream Stream = getStream(DeviceId, AsyncInfoPtr);		void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);		int initDevice(const int DeviceId) {
if (err != CUDA_SUCCESS) {		CUdevice Device;
DP("Error when copying data from device to host. Pointers: host = " DPxMOD
", device = " DPxMOD ", size = %" PRId64 "\n",
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}

return OFFLOAD_SUCCESS;		DP("Getting device %d\n", DeviceId);
}		CUresult Err = cuDeviceGet(&Device, DeviceId);
		if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
int32_t dataSubmit(int32_t DeviceId, void TgtPtr, void HstPtr, int64_t Size,
__tgt_async_info *AsyncInfoPtr) {
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}

CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
if (err != CUDA_SUCCESS) {
DP("Error when copying data from host to device. Pointers: host = " DPxMOD
", device = " DPxMOD ", size = %" PRId64 "\n",
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}

return OFFLOAD_SUCCESS;
}
} // namespace

#ifdef __cplusplus
extern "C" {
#endif

/// Report whether \p image is an ELF image for the CUDA machine type by
/// forwarding to elf_check_machine() with the CUDA machine id.
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  // ELF machine id used for CUDA images (EM_CUDA = 190).
  enum { ElfMachineCuda = 190 };
  const int32_t IsValid = elf_check_machine(image, ElfMachineCuda);
  return IsValid;
}

/// Return the device count recorded in DeviceInfo.NumberOfDevices.
int32_t __tgt_rtl_number_of_devices() {
  const int32_t DeviceCount = DeviceInfo.NumberOfDevices;
  return DeviceCount;
}

/// Record the `requires` flags in DeviceInfo.RequiresFlags.
///
/// \param RequiresFlags Flag bits to store; the value is stored as given.
/// \returns The same \p RequiresFlags value that was passed in.
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  // int64_t is not `long` on every platform, so print it with PRId64 rather
  // than %ld to keep the format string and the argument type in agreement.
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {

CUdevice cuDevice;
DP("Getting device %d\n", device_id);
CUresult err = cuDeviceGet(&cuDevice, device_id);
if (err != CUDA_SUCCESS) {
DP("Error when getting CUDA device with id = %d\n", device_id);
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}

// Create the context and save it to use whenever this device is selected.		// Create the context and save it to use whenever this device is selected.
err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,		Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC,
cuDevice);		Device);
if (err != CUDA_SUCCESS) {		if (!checkResult(Err, "Error returned from cuCtxCreate\n"))
DP("Error when creating a CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}

err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);		Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (err != CUDA_SUCCESS) {		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
DP("Error when setting current CUDA context\n");		return OFFLOAD_FAIL;
CUDA_ERR_STRING(err);
}

// Initialize stream pool		// Initialize stream pool
DeviceInfo.getStreamManager()->initializeDevice(device_id);		if (!StreamManager->initializeDeviceStreamPool(DeviceId))
		return OFFLOAD_FAIL;

// Query attributes to determine number of threads/block and blocks/grid.		// Query attributes to determine number of threads/block and blocks/grid.
int maxGridDimX;		int MaxGridDimX;
err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,		Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
cuDevice);		Device);
if (err != CUDA_SUCCESS) {		if (Err != CUDA_SUCCESS) {
DP("Error getting max grid dimension, use default\n");		DP("Error getting max grid dimension, use default value %d\n",
DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;		DeviceRTLTy::DefaultNumTeams);
} else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {		DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;		} else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
DP("Using %d CUDA blocks per grid\n", maxGridDimX);		DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
		DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
} else {		} else {
DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "		DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
"at the hard limit\n",		"at the hard limit\n",
maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);		MaxGridDimX, DeviceRTLTy::HardTeamLimit);
		DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
}		}

// We are only exploiting threads along the x axis.		// We are only exploiting threads along the x axis.
int maxBlockDimX;		int MaxBlockDimX;
err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,		Err = cuDeviceGetAttribute(&MaxBlockDimX,
cuDevice);		CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
if (err != CUDA_SUCCESS) {		if (Err != CUDA_SUCCESS) {
DP("Error getting max block dimension, use default\n");		DP("Error getting max block dimension, use default value %d\n",
DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;		DeviceRTLTy::DefaultNumThreads);
} else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {		DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;		} else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
DP("Using %d CUDA threads per block\n", maxBlockDimX);		DP("Using %d CUDA threads per block\n", MaxBlockDimX);
		DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
} else {		} else {
DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;		DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"		"capping at the hard limit\n",
"at the hard limit\n",		MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);		DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
}		}

int warpSize;		// Get and set warp size
err =		int WarpSize;
cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);		Err =
if (err != CUDA_SUCCESS) {		cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
DP("Error getting warp size, assume default\n");		if (Err != CUDA_SUCCESS) {
DeviceInfo.WarpSize[device_id] = 32;		DP("Error getting warp size, assume default value 32\n");
		DeviceData[DeviceId].WarpSize = 32;
} else {		} else {
DeviceInfo.WarpSize[device_id] = warpSize;		DP("Using warp size %d\n", WarpSize);
		DeviceData[DeviceId].WarpSize = WarpSize;
}		}

// Adjust teams to the env variables		// Adjust teams to the env variables
if (DeviceInfo.EnvTeamLimit > 0 &&		if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",		DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
DeviceInfo.EnvTeamLimit);		EnvTeamLimit);
		DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
}		}

DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",		DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],		DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
DeviceInfo.WarpSize[device_id]);		DeviceData[DeviceId].WarpSize);

// Set default number of teams		// Set default number of teams
if (DeviceInfo.EnvNumTeams > 0) {		if (EnvNumTeams > 0) {
DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
DP("Default number of teams set according to environment %d\n",		DP("Default number of teams set according to environment %d\n",
DeviceInfo.EnvNumTeams);		EnvNumTeams);
		DeviceData[DeviceId].NumTeams = EnvNumTeams;
} else {		} else {
DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;		DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
DP("Default number of teams set according to library's default %d\n",		DP("Default number of teams set according to library's default %d\n",
RTLDeviceInfoTy::DefaultNumTeams);		DeviceRTLTy::DefaultNumTeams);
}		}
if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];		if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
DP("Default number of teams exceeds device limit, capping at %d\n",		DP("Default number of teams exceeds device limit, capping at %d\n",
DeviceInfo.BlocksPerGrid[device_id]);		DeviceData[DeviceId].BlocksPerGrid);
		DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
}		}

// Set default number of threads		// Set default number of threads
DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;		DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
DP("Default number of threads set according to library's default %d\n",		DP("Default number of threads set according to library's default %d\n",
RTLDeviceInfoTy::DefaultNumThreads);		DeviceRTLTy::DefaultNumThreads);
if (DeviceInfo.NumThreads[device_id] >		if (DeviceData[DeviceId].NumThreads >
DeviceInfo.ThreadsPerBlock[device_id]) {		DeviceData[DeviceId].ThreadsPerBlock) {
DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
DP("Default number of threads exceeds device limit, capping at %d\n",		DP("Default number of threads exceeds device limit, capping at %d\n",
DeviceInfo.ThreadsPerBlock[device_id]);		DeviceData[DeviceId].ThreadsPerBlock);
		DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock;
}		}

return OFFLOAD_SUCCESS;		return OFFLOAD_SUCCESS;
}		}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,		__tgt_target_table *loadBinary(const int DeviceId,
__tgt_device_image *image) {		const __tgt_device_image *Image) {
		// Set the context we are using
// Set the context we are using.		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
if (err != CUDA_SUCCESS) {		return nullptr;
DP("Error when setting a CUDA context for device %d\n", device_id);
CUDA_ERR_STRING(err);
return NULL;
}

// Clear the offload table as we are going to create a new one.		// Clear the offload table as we are going to create a new one.
DeviceInfo.clearOffloadEntriesTable(device_id);		clearOffloadEntriesTable(DeviceId);

// Create the module and extract the function pointers.		// Create the module and extract the function pointers.
		CUmodule Module;
CUmodule cumod;		DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));		Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);		if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
if (err != CUDA_SUCCESS) {		return nullptr;
DP("Error when loading CUDA module\n");
CUDA_ERR_STRING(err);
return NULL;
}

DP("CUDA module successfully loaded!\n");		DP("CUDA module successfully loaded!\n");
DeviceInfo.Modules.push_back(cumod);

// Find the symbols in the module by name.
__tgt_offload_entry *HostBegin = image->EntriesBegin;
__tgt_offload_entry *HostEnd = image->EntriesEnd;

for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

if (!e->addr) {
// We return NULL when something like this happens, the host should have
// always something in the address to uniquely identify the target region.
DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

return NULL;
}

if (e->size) {		Modules.push_back(Module);
__tgt_offload_entry entry = *e;

CUdeviceptr cuptr;		// Find the symbols in the module by name.
size_t cusize;		const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);		const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

if (err != CUDA_SUCCESS) {		for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
DP("Loading global '%s' (Failed)\n", e->name);		if (!E->addr) {
CUDA_ERR_STRING(err);		// We return nullptr when something like this happens, the host should
return NULL;		// have always something in the address to uniquely identify the target
		// region.
		DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
		return nullptr;
		}

		if (E->size) {
		__tgt_offload_entry Entry = *E;
		CUdeviceptr CUPtr;
		size_t CUSize;
		Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
		// We keep this style here because we need the name
		if (Err != CUDA_SUCCESS) {
		DP("Loading global '%s' (Failed)\n", E->name);
		CUDA_ERR_STRING(Err);
		return nullptr;
}		}

if (cusize != e->size) {		if (CUSize != E->size) {
DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,		DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
cusize, e->size);		CUSize, E->size);
CUDA_ERR_STRING(err);		return nullptr;
return NULL;
}		}

DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",		DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));		DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
entry.addr = (void *)cuptr;
		Entry.addr = (void *)(CUPtr);

// Note: In the current implementation declare target variables		// Note: In the current implementation declare target variables
// can either be link or to. This means that once unified		// can either be link or to. This means that once unified
// memory is activated via the requires directive, the variable		// memory is activated via the requires directive, the variable
// can be used directly from the host in both cases.		// can be used directly from the host in both cases.
// TODO: when variables types other than to or link are added,		// TODO: when variables types other than to or link are added,
// the below condition should be changed to explicitly		// the below condition should be changed to explicitly
// check for to and link variables types:		// check for to and link variables types:
// (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&		// (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
// (e->flags & OMP_DECLARE_TARGET_LINK \|\|		// OMP_DECLARE_TARGET_LINK \|\| e->flags == OMP_DECLARE_TARGET_TO))
// e->flags == OMP_DECLARE_TARGET_TO))		if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
// If unified memory is present any target link or to variables		// If unified memory is present any target link or to variables
// can access host addresses directly. There is no longer a		// can access host addresses directly. There is no longer a
// need for device copies.		// need for device copies.
cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));		cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
DP("Copy linked variable host address (" DPxMOD ")"		DP("Copy linked variable host address (" DPxMOD
"to device address (" DPxMOD ")\n",		") to device address (" DPxMOD ")\n",
DPxPTR(((void*)e->addr)), DPxPTR(cuptr));		DPxPTR(((void *)E->addr)), DPxPTR(CUPtr));
}		}

DeviceInfo.addOffloadEntry(device_id, entry);		addOffloadEntry(DeviceId, Entry);

continue;		continue;
}		}

CUfunction fun;		CUfunction Func;
err = cuModuleGetFunction(&fun, cumod, e->name);		Err = cuModuleGetFunction(&Func, Module, E->name);
		// TODO: Improve checkResult function using E->name
if (err != CUDA_SUCCESS) {		if (!checkResult(Err, "Error returned from cuModuleGetFunction\n"))
DP("Loading '%s' (Failed)\n", e->name);		return nullptr;
CUDA_ERR_STRING(err);
return NULL;
}

DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",		DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
DPxPTR(e - HostBegin), e->name, DPxPTR(fun));		DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

// default value GENERIC (in case symbol is missing from cubin file)		// default value GENERIC (in case symbol is missing from cubin file)
int8_t ExecModeVal = ExecutionModeType::GENERIC;		int8_t ExecModeVal = ExecutionModeType::GENERIC;
std::string ExecModeNameStr (e->name);		std::string ExecModeNameStr(E->name);
ExecModeNameStr += "_exec_mode";		ExecModeNameStr += "_exec_mode";
const char *ExecModeName = ExecModeNameStr.c_str();		const char *ExecModeName = ExecModeNameStr.c_str();

CUdeviceptr ExecModePtr;		CUdeviceptr ExecModePtr;
size_t cusize;		size_t CUSize;
err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);		Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
if (err == CUDA_SUCCESS) {		if (Err == CUDA_SUCCESS) {
if ((size_t)cusize != sizeof(int8_t)) {		if (CUSize != sizeof(int8_t)) {
DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",		DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
ExecModeName, cusize, sizeof(int8_t));		ExecModeName, CUSize, sizeof(int8_t));
CUDA_ERR_STRING(err);		return nullptr;
return NULL;
}		}

err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);		Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
if (err != CUDA_SUCCESS) {		// TODO: Improve checkResult function using E->name
		if (Err != CUDA_SUCCESS) {
DP("Error when copying data from device to host. Pointers: "		DP("Error when copying data from device to host. Pointers: "
"host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",		"host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);		DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(Err);
return NULL;		return nullptr;
}		}

if (ExecModeVal < 0 \|\| ExecModeVal > 1) {		if (ExecModeVal < 0 \|\| ExecModeVal > 1) {
DP("Error wrong exec_mode value specified in cubin file: %d\n",		DP("Error wrong exec_mode value specified in cubin file: %d\n",
ExecModeVal);		ExecModeVal);
return NULL;		return nullptr;
}		}
} else {		} else {
DP("Loading global exec_mode '%s' - symbol missing, using default value "		DP("Loading global exec_mode '%s' - symbol missing, using default "
"GENERIC (1)\n", ExecModeName);		"value GENERIC (1)\n",
CUDA_ERR_STRING(err);		ExecModeName);
		CUDA_ERR_STRING(Err);
}		}

KernelsList.push_back(KernelTy(fun, ExecModeVal));		KernelsList.emplace_back(Func, ExecModeVal);

__tgt_offload_entry entry = *e;		__tgt_offload_entry Entry = *E;
entry.addr = (void *)&KernelsList.back();		Entry.addr = &KernelsList.back();
DeviceInfo.addOffloadEntry(device_id, entry);		addOffloadEntry(DeviceId, Entry);
}		}

// send device environment data to the device		// send device environment data to the device
{		{
omptarget_device_environmentTy device_env;		OMPTargetDeviceEnvironmentTy DeviceEnv;

device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG		#ifdef OMPTARGET_DEBUG
if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {		if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
device_env.debug_level = std::stoi(envStr);		DeviceEnv.DebugLevel = std::stoi(EnvStr);
}
#endif		#endif

const char * device_env_Name="omptarget_device_environment";		const char *DeviceEnvName = "omptarget_device_environment";
CUdeviceptr device_env_Ptr;		CUdeviceptr DeviceEnvPtr;
size_t cusize;		size_t CUSize;

err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);		Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
		if (Err == CUDA_SUCCESS) {
if (err == CUDA_SUCCESS) {		if (CUSize != sizeof(DeviceEnv)) {
if ((size_t)cusize != sizeof(device_env)) {
DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",		DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
device_env_Name, cusize, sizeof(int32_t));		DeviceEnvName, CUSize, sizeof(int32_t));
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(Err);
return NULL;		return nullptr;
}		}

err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);		Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
if (err != CUDA_SUCCESS) {		// TODO: Improve checkResult function using E->name
		if (Err != CUDA_SUCCESS) {
DP("Error when copying data from host to device. Pointers: "		DP("Error when copying data from host to device. Pointers: "
"host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",		"host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);		DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(Err);
return NULL;		return nullptr;
}		}

DP("Sending global device environment data %zu bytes\n", (size_t)cusize);		DP("Sending global device environment data %zu bytes\n", CUSize);
} else {		} else {
DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);		DP("Finding global device environment '%s' - symbol missing.\n",
DP("Continue, considering this is a device RTL which does not accept environment setting.\n");		DeviceEnvName);
		DP("Continue, considering this is a device RTL which does not accept "
		"environment setting.\n");
}		}
}		}

return DeviceInfo.getOffloadEntriesTable(device_id);		return getOffloadEntriesTable(DeviceId);
}		}

void __tgt_rtl_data_alloc(int32_t device_id, int64_t size, void hst_ptr) {		void *dataAlloc(const int DeviceId, const int64_t Size) const {
if (size == 0) {		if (Size == 0)
return NULL;		return nullptr;
}

// Set the context we are using.		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
if (err != CUDA_SUCCESS) {		return nullptr;
DP("Error while trying to set CUDA current context\n");
CUDA_ERR_STRING(err);
return NULL;
}

CUdeviceptr ptr;		CUdeviceptr DevicePtr;
err = cuMemAlloc(&ptr, size);		Err = cuMemAlloc(&DevicePtr, Size);
if (err != CUDA_SUCCESS) {		if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
DP("Error while trying to allocate %d\n", err);		return nullptr;
CUDA_ERR_STRING(err);
return NULL;
}

void vptr = (void )ptr;		return (void *)DevicePtr;
return vptr;
}		}

int32_t __tgt_rtl_data_submit(int32_t device_id, void tgt_ptr, void hst_ptr,		int dataSubmit(const int DeviceId, const void TgtPtr, const void HstPtr,
int64_t size) {		const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
__tgt_async_info async_info;		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, size,
&async_info);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;

return __tgt_rtl_synchronize(device_id, &async_info);		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
}		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
		return OFFLOAD_FAIL;

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,		CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
void *hst_ptr, int64_t size,
__tgt_async_info *async_info_ptr) {
assert(async_info_ptr && "async_info_ptr is nullptr");
return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void hst_ptr, void tgt_ptr,		Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
int64_t size) {		if (Err != CUDA_SUCCESS) {
__tgt_async_info async_info;		DP("Error when copying data from host to device. Pointers: host = " DPxMOD
int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, size,		", device = " DPxMOD ", size = %" PRId64 "\n",
&async_info);		DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
if (rc != OFFLOAD_SUCCESS)		CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;

return __tgt_rtl_synchronize(device_id, &async_info);
}		}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,		return OFFLOAD_SUCCESS;
void *tgt_ptr, int64_t size,
__tgt_async_info *async_info_ptr) {
assert(async_info_ptr && "async_info_ptr is nullptr");
return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
}		}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {		int dataRetrieve(int32_t DeviceId, void HstPtr, void TgtPtr, int64_t Size,
// Set the context we are using.		__tgt_async_info *AsyncInfoPtr) const {
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
CUDA_ERR_STRING(err);		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}

err = cuMemFree((CUdeviceptr)tgt_ptr);		CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
if (err != CUDA_SUCCESS) {
DP("Error when freeing CUDA memory\n");		Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
CUDA_ERR_STRING(err);		if (Err != CUDA_SUCCESS) {
		DP("Error when copying data from device to host. Pointers: host = " DPxMOD
		", device = " DPxMOD ", size = %" PRId64 "\n",
		DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
		CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}

return OFFLOAD_SUCCESS;		return OFFLOAD_SUCCESS;
}		}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,		int dataDelete(const int DeviceId, void *TgtPtr) const {
void **tgt_args,		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
ptrdiff_t *tgt_offsets,		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
int32_t arg_num, int32_t team_num,
int32_t thread_limit,
uint64_t loop_tripcount) {
__tgt_async_info async_info;
int32_t rc = __tgt_rtl_run_target_team_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
thread_limit, loop_tripcount, &async_info);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;

return __tgt_rtl_synchronize(device_id, &async_info);		Err = cuMemFree((CUdeviceptr)TgtPtr);
		if (!checkResult(Err, "Error returned from cuMemFree\n"))
		return OFFLOAD_FAIL;

		return OFFLOAD_SUCCESS;
}		}

int32_t __tgt_rtl_run_target_team_region_async(		int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
int32_t device_id, void tgt_entry_ptr, void *tgt_args,		void *TgtArgs, ptrdiff_t TgtOffsets,
ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,		const int ArgNum, const int TeamNum,
int32_t thread_limit, uint64_t loop_tripcount,		const int ThreadLimit,
__tgt_async_info *async_info) {		const unsigned int LoopTripCount,
// Set the context we are using.		__tgt_async_info *AsyncInfo) const {
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (err != CUDA_SUCCESS) {		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}

// All args are references.		// All args are references.
std::vector<void *> args(arg_num);		std::vector<void *> Args(ArgNum);
std::vector<void *> ptrs(arg_num);		std::vector<void *> Ptrs(ArgNum);

for (int32_t i = 0; i < arg_num; ++i) {		for (int I = 0; I < ArgNum; ++I) {
ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);		Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
args[i] = &ptrs[i];		Args[I] = &Ptrs[I];
}		}

KernelTy KernelInfo = (KernelTy )tgt_entry_ptr;		const KernelTy *KernelInfo =
		reinterpret_cast<const KernelTy *>(TgtEntryPtr);
int cudaThreadsPerBlock;

if (thread_limit > 0) {		unsigned int CudaThreadsPerBlock;
cudaThreadsPerBlock = thread_limit;		if (ThreadLimit > 0) {
DP("Setting CUDA threads per block to requested %d\n", thread_limit);		DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
		CudaThreadsPerBlock = ThreadLimit;
// Add master warp if necessary		// Add master warp if necessary
if (KernelInfo->ExecutionMode == GENERIC) {		if (KernelInfo->ExecutionMode == GENERIC) {
cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];		DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);		CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
}		}
} else {		} else {
cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
DP("Setting CUDA threads per block to default %d\n",		DP("Setting CUDA threads per block to default %d\n",
DeviceInfo.NumThreads[device_id]);		DeviceData[DeviceId].NumThreads);
		CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
}		}

if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {		if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
DP("Threads per block capped at device limit %d\n",		DP("Threads per block capped at device limit %d\n",
DeviceInfo.ThreadsPerBlock[device_id]);		DeviceData[DeviceId].ThreadsPerBlock);
		CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
}		}

int kernel_limit;		int KernelLimit;
err = cuFuncGetAttribute(&kernel_limit,		Err = cuFuncGetAttribute(&KernelLimit,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);		CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
if (err == CUDA_SUCCESS) {		KernelInfo->Func);
if (kernel_limit < cudaThreadsPerBlock) {		if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
cudaThreadsPerBlock = kernel_limit;		DP("Threads per block capped at kernel limit %d\n", KernelLimit);
DP("Threads per block capped at kernel limit %d\n", kernel_limit);		CudaThreadsPerBlock = KernelLimit;
}		}
}
		unsigned int CudaBlocksPerGrid;
int cudaBlocksPerGrid;		if (TeamNum <= 0) {
if (team_num <= 0) {		if (LoopTripCount > 0 && EnvNumTeams < 0) {
if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
if (KernelInfo->ExecutionMode == SPMD) {		if (KernelInfo->ExecutionMode == SPMD) {
// We have a combined construct, i.e. `target teams distribute parallel		// We have a combined construct, i.e. `target teams distribute
// for [simd]`. We launch so many teams so that each thread will		// parallel for [simd]`. We launch so many teams so that each thread
// execute one iteration of the loop.		// will execute one iteration of the loop. round up to the nearest
// round up to the nearest integer		// integer
cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;		CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
} else {		} else {
// If we reach this point, then we have a non-combined construct, i.e.		// If we reach this point, then we have a non-combined construct, i.e.
// `teams distribute` with a nested `parallel for` and each team is		// `teams distribute` with a nested `parallel for` and each team is
// assigned one iteration of the `distribute` loop. E.g.:		// assigned one iteration of the `distribute` loop. E.g.:
//		//
// #pragma omp target teams distribute		// #pragma omp target teams distribute
// for(...loop_tripcount...) {		// for(...loop_tripcount...) {
// #pragma omp parallel for		// #pragma omp parallel for
// for(...) {}		// for(...) {}
// }		// }
//		//
// Threads within a team will execute the iterations of the `parallel`		// Threads within a team will execute the iterations of the `parallel`
// loop.		// loop.
cudaBlocksPerGrid = loop_tripcount;		CudaBlocksPerGrid = LoopTripCount;
}		}
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "		DP("Using %d teams due to loop trip count %" PRIu64
"threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,		" and number of threads per block %d\n",
cudaThreadsPerBlock);		CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
} else {		} else {
cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];		DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);		CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
}		}
} else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {		} else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
DP("Capping number of teams to team limit %d\n",		DP("Capping number of teams to team limit %d\n",
DeviceInfo.BlocksPerGrid[device_id]);		DeviceData[DeviceId].BlocksPerGrid);
		CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
} else {		} else {
cudaBlocksPerGrid = team_num;		DP("Using requested number of teams %d\n", TeamNum);
DP("Using requested number of teams %d\n", team_num);		CudaBlocksPerGrid = TeamNum;
}		}

// Run on the device.		// Run on the device.
DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,		DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
cudaThreadsPerBlock);		CudaThreadsPerBlock);

CUstream Stream = getStream(device_id, async_info);		CUstream Stream = getStream(DeviceId, AsyncInfo);
err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,		Err =
cudaThreadsPerBlock, 1, 1, 0 /bytes of shared memory/,		cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, 1, 1,
Stream, &args[0], 0);		CudaThreadsPerBlock, 1, 1, 0, Stream, &Args[0], nullptr);
if (err != CUDA_SUCCESS) {		if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
DP("Device kernel launch failed!\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}

DP("Launch of entry point at " DPxMOD " successful!\n",		DP("Launch of entry point at " DPxMOD " successful!\n",
DPxPTR(tgt_entry_ptr));		DPxPTR(TgtEntryPtr));

		return OFFLOAD_SUCCESS;
		}

		int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
		CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
		CUresult Err = cuStreamSynchronize(Stream);
		if (Err != CUDA_SUCCESS) {
		DP("Error when synchronizing stream. stream = " DPxMOD
		", async info ptr = " DPxMOD "\n",
		DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
		CUDA_ERR_STRING(Err);
		return OFFLOAD_FAIL;
		}

		jdoerfertUnsubmitted Done Reply Inline Actions You can make `checkResult` variadic in a separate commit to reuse it here. It could even be a macro if that makes it easier. jdoerfert: You can make `checkResult` variadic in a separate commit to reuse it here. It could even be a…
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions Actually I'm thinking to improve it. Will do in another patch. tianshilei1992: Actually I'm thinking to improve it. Will do in another patch.
		// Once the stream is synchronized, return it to stream pool and reset
		// async_info. This is to make sure the synchronization only works for its
		// own tasks.
		StreamManager->returnStream(
		DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
		AsyncInfoPtr->Queue = nullptr;

return OFFLOAD_SUCCESS;		return OFFLOAD_SUCCESS;
}		}
		};

		DeviceRTLTy DeviceRTL;
		} // namespace

		// Exposed library API function
		#ifdef __cplusplus
		extern "C" {
		#endif

		int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
		return elf_check_machine(image, 190); // EM_CUDA = 190.
		jdoerfertUnsubmitted Done Reply Inline Actions Nit: LLVM style "generally" avoids trailing comments. Maybe inline it: `, /* EM_CUDA */ 190` jdoerfert: Nit: LLVM style "generally" avoids trailing comments. Maybe inline it: `, /* EM_CUDA */ 190`
		}

		/// Exposed plugin entry point: reports the device count by forwarding to
		/// DeviceRTL.getNumOfDevices().
		int32_t __tgt_rtl_number_of_devices() {
		  return DeviceRTL.getNumOfDevices();
		}

		/// Exposed plugin entry point: records the `requires` flags.
		///
		/// Logs the incoming value through DP, hands it to
		/// DeviceRTL.setRequiresFlag(), and returns the same value to the caller.
		///
		/// \param RequiresFlags flag value to store.
		/// \returns RequiresFlags, unchanged.
		int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
		  // int64_t is not `long` on every target, so print it with PRId64 rather
		  // than %ld (the file already relies on the <cinttypes> macros, e.g. PRIu64).
		  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
		  DeviceRTL.setRequiresFlag(RequiresFlags);
		  return RequiresFlags;
		}

		/// Exposed plugin entry point: initializes one device.
		///
		/// Asserts that DeviceRTL accepts \p device_id, then forwards to
		/// DeviceRTL.initDevice() and returns its result.
		int32_t __tgt_rtl_init_device(int32_t device_id) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  const int32_t InitStatus = DeviceRTL.initDevice(device_id);
		  return InitStatus;
		}

		/// Exposed plugin entry point: loads a device image.
		///
		/// Asserts that DeviceRTL accepts \p device_id, then forwards \p image to
		/// DeviceRTL.loadBinary() and returns the table it produces.
		__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
		                                          __tgt_device_image *image) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  __tgt_target_table *Table = DeviceRTL.loadBinary(device_id, image);
		  return Table;
		}

		/// Exposed plugin entry point: allocates device memory.
		///
		/// Asserts that DeviceRTL accepts \p device_id, then returns whatever
		/// DeviceRTL.dataAlloc() yields for \p size. The third (unnamed) pointer
		/// parameter is not used by this implementation.
		///
		/// The pointer declarators are restored here: the function returns the
		/// result of dataAlloc(), so it cannot be declared `void`, and a bare
		/// trailing `void` parameter is ill-formed.
		void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  return DeviceRTL.dataAlloc(device_id, size);
		}

		/// Exposed plugin entry point: blocking data submit.
		///
		/// Issues __tgt_rtl_data_submit_async() with a function-local
		/// __tgt_async_info and, if that succeeds, waits on it through
		/// __tgt_rtl_synchronize().
		///
		/// \p tgt_ptr and \p hst_ptr are pointers (`void *`), matching the
		/// signature of __tgt_rtl_data_submit_async() that they are forwarded to;
		/// a parameter of plain type `void` is ill-formed.
		///
		/// \returns OFFLOAD_FAIL if the asynchronous submit does not return
		/// OFFLOAD_SUCCESS; otherwise the result of __tgt_rtl_synchronize().
		int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
		                              int64_t size) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  __tgt_async_info async_info;
		  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
		                                                 size, &async_info);
		  if (rc != OFFLOAD_SUCCESS)
		    return OFFLOAD_FAIL;

		  return __tgt_rtl_synchronize(device_id, &async_info);
		}

		/// Exposed plugin entry point: asynchronous data submit.
		///
		/// Asserts that DeviceRTL accepts \p device_id and that \p async_info_ptr is
		/// non-null, then forwards every argument to DeviceRTL.dataSubmit() and
		/// returns its result.
		int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
		                                    void *hst_ptr, int64_t size,
		                                    __tgt_async_info *async_info_ptr) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
		  assert(async_info_ptr && "async_info_ptr is nullptr");

		  const int32_t SubmitStatus =
		      DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
		  return SubmitStatus;
		}

		/// Exposed plugin entry point: blocking data retrieve.
		///
		/// Issues __tgt_rtl_data_retrieve_async() with a function-local
		/// __tgt_async_info and, if that succeeds, waits on it through
		/// __tgt_rtl_synchronize().
		///
		/// \p hst_ptr and \p tgt_ptr are pointers (`void *`), matching the
		/// signature of __tgt_rtl_data_retrieve_async() that they are forwarded to;
		/// a parameter of plain type `void` is ill-formed.
		///
		/// \returns OFFLOAD_FAIL if the asynchronous retrieve does not return
		/// OFFLOAD_SUCCESS; otherwise the result of __tgt_rtl_synchronize().
		int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
		                                int64_t size) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  __tgt_async_info async_info;
		  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
		                                                   size, &async_info);
		  if (rc != OFFLOAD_SUCCESS)
		    return OFFLOAD_FAIL;

		  return __tgt_rtl_synchronize(device_id, &async_info);
		}

		/// Exposed plugin entry point: asynchronous data retrieve.
		///
		/// Asserts that DeviceRTL accepts \p device_id and that \p async_info_ptr is
		/// non-null, then forwards every argument to DeviceRTL.dataRetrieve() and
		/// returns its result.
		int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
		                                      void *tgt_ptr, int64_t size,
		                                      __tgt_async_info *async_info_ptr) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
		  assert(async_info_ptr && "async_info_ptr is nullptr");

		  const int32_t RetrieveStatus = DeviceRTL.dataRetrieve(
		      device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
		  return RetrieveStatus;
		}

		/// Exposed plugin entry point: releases a device allocation.
		///
		/// Asserts that DeviceRTL accepts \p device_id, then forwards \p tgt_ptr to
		/// DeviceRTL.dataDelete() and returns its result.
		int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  const int32_t DeleteStatus = DeviceRTL.dataDelete(device_id, tgt_ptr);
		  return DeleteStatus;
		}

		/// Exposed plugin entry point: blocking launch of a target teams region.
		///
		/// Starts the region through __tgt_rtl_run_target_team_region_async() using
		/// a function-local __tgt_async_info. When the launch reports
		/// OFFLOAD_SUCCESS the call then waits via __tgt_rtl_synchronize() and
		/// returns that result; any other launch status yields OFFLOAD_FAIL.
		int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
		                                         void **tgt_args,
		                                         ptrdiff_t *tgt_offsets,
		                                         int32_t arg_num, int32_t team_num,
		                                         int32_t thread_limit,
		                                         uint64_t loop_tripcount) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  __tgt_async_info async_info;
		  const int32_t LaunchStatus = __tgt_rtl_run_target_team_region_async(
		      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
		      thread_limit, loop_tripcount, &async_info);

		  // Only a successfully issued launch is waited on.
		  if (LaunchStatus == OFFLOAD_SUCCESS)
		    return __tgt_rtl_synchronize(device_id, &async_info);

		  return OFFLOAD_FAIL;
		}

		/// Exposed plugin entry point: asynchronous launch of a target teams region.
		///
		/// Asserts that DeviceRTL accepts \p device_id, then forwards every argument
		/// to DeviceRTL.runTargetTeamRegion() and returns its result.
		///
		/// \p tgt_entry_ptr is `void *` and \p tgt_args is `void **`, the same types
		/// the blocking wrapper __tgt_rtl_run_target_team_region() passes in; a
		/// parameter of plain type `void` is ill-formed.
		int32_t __tgt_rtl_run_target_team_region_async(
		    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
		    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
		    int32_t thread_limit, uint64_t loop_tripcount,
		    __tgt_async_info *async_info_ptr) {
		  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

		  return DeviceRTL.runTargetTeamRegion(
		      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
		      thread_limit, loop_tripcount, async_info_ptr);
		}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,		int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
void *tgt_args, ptrdiff_t tgt_offsets,		void *tgt_args, ptrdiff_t tgt_offsets,
int32_t arg_num) {		int32_t arg_num) {
		assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

__tgt_async_info async_info;		__tgt_async_info async_info;
int32_t rc = __tgt_rtl_run_target_region_async(		const int32_t rc = __tgt_rtl_run_target_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);		device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
if (rc != OFFLOAD_SUCCESS)		if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;

return __tgt_rtl_synchronize(device_id, &async_info);		return __tgt_rtl_synchronize(device_id, &async_info);
}		}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,		int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
void tgt_entry_ptr, void *tgt_args,		void tgt_entry_ptr, void *tgt_args,
ptrdiff_t *tgt_offsets,		ptrdiff_t *tgt_offsets,
int32_t arg_num,		int32_t arg_num,
__tgt_async_info *async_info) {		__tgt_async_info *async_info_ptr) {
// use one team and the default number of threads.		assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
const int32_t team_num = 1;
const int32_t thread_limit = 0;
return __tgt_rtl_run_target_team_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
thread_limit, 0, async_info);
}

int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
assert(async_info && "async_info is nullptr");
assert(async_info->Queue && "async_info->Queue is nullptr");

CUstream Stream = reinterpret_cast<CUstream>(async_info->Queue);		return __tgt_rtl_run_target_team_region_async(device_id, tgt_entry_ptr,
CUresult Err = cuStreamSynchronize(Stream);		tgt_args, tgt_offsets, arg_num,
if (Err != CUDA_SUCCESS) {		1, 1, 0, async_info_ptr);
		jdoerfertUnsubmitted Done Reply Inline Actions Add comments for the constants or define them as before. /* ThreadNum */ or something is fine. jdoerfert: Add comments for the constants or define them as before. /* ThreadNum */ or something is fine.
DP("Error when synchronizing stream. stream = " DPxMOD
", async info ptr = " DPxMOD "\n",
DPxPTR(Stream), DPxPTR(async_info));
CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;
}		}

// Once the stream is synchronized, return it to stream pool and reset		int32_t __tgt_rtl_synchronize(int32_t device_id,
// async_info. This is to make sure the synchronization only works for its own		__tgt_async_info *async_info_ptr) {
// tasks.		assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
DeviceInfo.returnStream(device_id, async_info);		assert(async_info_ptr && "async_info_ptr is nullptr");
		assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
		jdoerfertUnsubmitted Done Reply Inline Actions I like the validation in the user facing functions. Makes the assertion location better too. jdoerfert: I like the validation in the user facing functions. Makes the assertion location better too.
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions You mean take the validation from `isValidDeviceId` to this function? tianshilei1992: You mean take the validation from `isValidDeviceId` to this function?
		jdoerfertUnsubmitted Done Reply Inline Actions I mean it is good to have the asserts here as that is where user provide input which we can validate. I'd leave it as it is in this version. jdoerfert: I mean it is good to have the asserts here as that is where user provide input which we can…

return OFFLOAD_SUCCESS;		return DeviceRTL.synchronize(device_id, async_info_ptr);
}		}

#ifdef __cplusplus		#ifdef __cplusplus
}		}
#endif		#endif

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Refined CUDA plugin to put all CUDA operations into class
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 256787

openmp/libomptarget/include/omptarget.h

openmp/libomptarget/plugins/cuda/src/rtl.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Refined CUDA plugin to put all CUDA operations into class ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 256787

openmp/libomptarget/include/omptarget.h

openmp/libomptarget/plugins/cuda/src/rtl.cpp

[OpenMP] Refined CUDA plugin to put all CUDA operations into class
ClosedPublic