Diff 255779

openmp/libomptarget/plugins/cuda/src/rtl.cpp

Show All 10 Lines
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include <atomic>		#include <atomic>
#include <cassert>		#include <cassert>
#include <cstddef>		#include <cstddef>
#include <cuda.h>		#include <cuda.h>
#include <list>		#include <list>
#include <memory>		#include <memory>
		#include <mutex>
#include <string>		#include <string>
#include <vector>		#include <vector>

#include "omptargetplugin.h"		#include "omptargetplugin.h"

#ifndef TARGET_NAME		#ifndef TARGET_NAME
#define TARGET_NAME CUDA		#define TARGET_NAME CUDA
#endif		#endif
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines
};		};

/// List that contains all the kernels.		/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.		/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;		std::list<KernelTy> KernelsList;

/// Class containing all the device information.		/// Class containing all the device information.
class RTLDeviceInfoTy {		class RTLDeviceInfoTy {
		class StreamManagerTy {
		int NumberOfDevices;
		// Per-device stream mutex
		std::vector<std::unique_ptr<std::mutex>> StreamMtx;
		// Per-device stream Id indicates the next available stream in the pool
		std::vector<int> NextStreamId;
		// Per-device stream pool
		std::vector<std::vector<CUstream>> StreamPool;
		jdoerfertUnsubmitted Not Done Reply Inline Actions Later we can think about making this a vector of structs instead as we tend to access them per device anyway. We can then align the structs such that different processes interacting with different devices don't really interfere with each other. jdoerfert: Later we can think about making this a vector of structs instead as we tend to access them per…
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions That would be ideal. tianshilei1992: That would be ideal.
		// Pointer to per-device context
		std::vector<CUcontext> *ContextsPtr;

		// If there is no CUstream left in the pool, we will resize the pool to
		// allocate more CUstream. This function should be called with device mutex,
		// and we do not resize to smaller one.
		void resizeStreamPool(const int DeviceId, const size_t NewSize) {
		std::vector<CUstream> &Pool = StreamPool[DeviceId];
		const size_t CurrentSize = Pool.size();
		assert(NewSize > CurrentSize &&
		"new size is not larger than current size");

		Pool.resize(NewSize, nullptr);

		CUresult err = cuCtxSetCurrent((*ContextsPtr)[DeviceId]);
		jdoerfertUnsubmitted Not Done Reply Inline Actions Outline this into a helper. Also the error checking should be a helper. So something like: /// .... bool checkResult(CUresult err, const char *ErrMsg) { if (err == CUDA_SUCCESS) return true; DP(ErrMsg); CUDA_ERR_STRING(err); return false; } /// ... bool switchToDevice(DeviceId) { CUresult err = cuCtxSetCurrent((*ContextsPtr)[DeviceId]); return checkResult(err, "Error when creating CUDA stream to resize stream pool\n"); } and then use it everywhere. Should cut down the size and duplication. jdoerfert: Outline this into a helper. Also the error checking should be a helper. So something like: ```…
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions We can do the first one. As for the second function, it must be a member function. Let's do that in the next patch, which moves all CUDA-related things into the class. tianshilei1992: We can do the first one. As for the second function, it must be a member function. Let's do…
		jdoerfertUnsubmitted Not Done Reply Inline Actions ok jdoerfert: ok
		if (err != CUDA_SUCCESS) {
		DP("Error when setting current CUDA context\n");
		CUDA_ERR_STRING(err);
		// We will return if cannot switch to the right context in case of
		// creating bunch of streams that are not corresponding to the right
		// device. The offloading will fail later because selected CUstream is
		// nullptr.
		return;
		}

		for (size_t I = CurrentSize; I < NewSize; ++I) {
		err = cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING);
		if (err != CUDA_SUCCESS) {
		DP("Error when creating CUDA stream to resize stream pool\n");
		CUDA_ERR_STRING(err);
		}
		}
		}

		jdoerfertUnsubmitted Not Done Reply Inline Actions If there is no reason ever to provide a nullptr as `CtxPtr` make it a reference everywhere instead. jdoerfert: If there is no reason ever to provide a nullptr as `CtxPtr` make it a reference everywhere…
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions My bad. At the very beginning I thought by some chances it will be `nullptr` but it turns out that will never happen. :-) tianshilei1992: My bad. At the very beginning I thought by some chances it will be `nullptr` but it turns out…
		public:
		// Shilei: I don't like to put this friend function here but it seems like
		// the best way not to refine the whole file.
		friend int32_t __tgt_rtl_init_device(int32_t device_id);

		StreamManagerTy(const int NumberOfDevices, std::vector<CUcontext> *CtxPtr)
		: NumberOfDevices(NumberOfDevices), ContextsPtr(CtxPtr) {
		StreamPool.resize(NumberOfDevices);
		NextStreamId.resize(NumberOfDevices);
		StreamMtx.resize(NumberOfDevices);

		// Initially let's create 32 streams for each device
		int EnvNumInitialStreams = 32;
		jdoerfertUnsubmitted Not Done Reply Inline Actions Don't you need to create the streams, e.g., call resizeStreamPool instead? jdoerfert: Don't you need to create the streams, e.g., call resizeStreamPool instead?
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions Here I follow the initial design, which is late initialization. In the constructor, it only allocates memory but does not initialize. tianshilei1992: Here I follow the initial design, which is late initialization. In the constructor, it only…
		char *envStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS");
		if (envStr) {
		EnvNumInitialStreams = std::stoi(envStr);
		}

		// Initialize the stream pool for each device
		for (std::vector<CUstream> &S : StreamPool) {
		S.resize(EnvNumInitialStreams);
		}

		// Initialize the next stream id
		std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

		// Initialize stream mutex
		for (std::unique_ptr<std::mutex> &Ptr : StreamMtx) {
		Ptr = std::make_unique<std::mutex>();
		}
		}

		// Destroys every stream held in the per-device pools. Errors are logged
		// and otherwise ignored so that the remaining streams are still visited.
		~StreamManagerTy() {
		  for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
		    // Make the device's context current before destroying its streams. A
		    // failure here is only reported; destruction is attempted regardless.
		    CUresult Status = cuCtxSetCurrent((*ContextsPtr)[DeviceId]);
		    if (Status != CUDA_SUCCESS) {
		      DP("Error when setting current CUDA context\n");
		      CUDA_ERR_STRING(Status);
		    }

		    for (CUstream &Stream : StreamPool[DeviceId]) {
		      // Null slots were never handed a stream by cuStreamCreate.
		      if (Stream) {
		        Status = cuStreamDestroy(Stream);
		        if (Status != CUDA_SUCCESS) {
		          DP("Error when destroying CUDA stream\n");
		          CUDA_ERR_STRING(Status);
		        }
		      }
		    }
		  }
		}

		// Get a CUstream from the pool. The per-device next stream id always points
		// to the next available CUstream. That means CUstreams [0, id-1] have been
		// assigned, and [id, ...) are still available. If there is no CUstream
		// left, more CUstreams are requested from the CUDA RT via resizeStreamPool.
		// Each time a CUstream is assigned, the id increases by one.
		// xxxxxs+++++++++
		//      ^
		//      id
		// After assignment, the pool becomes the following and s is assigned.
		// xxxxxs+++++++++
		//       ^
		//       id
		//
		// \param DeviceId index of the device whose pool is used; must be a valid
		//        index into the per-device vectors (checked by assert).
		// \returns the CUstream stored at the next free slot. This may be nullptr
		//          if resizeStreamPool could not create the stream.
		CUstream getStream(const int DeviceId) {
		  assert(DeviceId >= 0 &&
		         static_cast<size_t>(DeviceId) < NextStreamId.size() &&
		         "Unexpected device id");

		  // The lock covers both the id update and a possible pool resize.
		  const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
		  int &Id = NextStreamId[DeviceId];
		  std::vector<CUstream> &Pool = StreamPool[DeviceId];
		  const size_t NextIdx = static_cast<size_t>(Id);
		  // No CUstream left in the pool, we need to request from CUDA RT
		  if (NextIdx >= Pool.size()) {
		    // By default we double the stream pool every time. An empty pool
		    // (e.g. LIBOMPTARGET_NUM_INITIAL_STREAMS=0) cannot grow by doubling,
		    // so it is grown to a single stream instead of indexing out of bounds.
		    const size_t NewSize = NextIdx == 0 ? 1 : NextIdx * 2;
		    resizeStreamPool(DeviceId, NewSize);
		  }
		  return Pool[Id++];
		}

		// Hand a CUstream back to the pool of the given device. The per-device
		// next stream id always points to the next available CUstream, so the id
		// is decremented first and the returned CUstream is then stored in that
		// slot.
		// Streams may be returned in a different order than they were assigned.
		// As a result the same CUstream can temporarily appear twice in the pool:
		// once in the already-assigned part below the id and once in the
		// available part. This is harmless, because the stale copy in the
		// assigned part is eventually overwritten when another CUstream is
		// returned. After several rounds the order of the pool may therefore
		// differ from its initial state.
		void returnStream(const int DeviceId, CUstream Stream) {
		  assert(DeviceId >= 0 &&
		         static_cast<size_t>(DeviceId) < NextStreamId.size() &&
		         "Unexpected device id");

		  const std::lock_guard<std::mutex> Guard(*StreamMtx[DeviceId]);
		  int &NextFree = NextStreamId[DeviceId];
		  assert(NextFree > 0 && "Wrong stream ID");
		  NextFree -= 1;
		  StreamPool[DeviceId][NextFree] = Stream;
		}
		};

std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;		std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
std::vector<std::unique_ptr<std::atomic_uint>> NextStreamId;		std::shared_ptr<StreamManagerTy> StreamManager;

public:		public:
		// Shilei: I don't like to put this friend function here but it seems like
		// the best way not to refine the whole file.
		friend int32_t __tgt_rtl_init_device(int32_t device_id);
		jdoerfertUnsubmitted Done Reply Inline Actions No need to put your name here, `git blame` would know anyway. We should move all the CUDA stuff into the stream manager and the stream manager out of the `RTLDeviceInfoTy` but both can be done later. For now just replace the `friends` with a getter. jdoerfert: No need to put your name here, `git blame` would know anyway. We should move all the CUDA…

int NumberOfDevices;		int NumberOfDevices;
std::vector<CUmodule> Modules;		std::vector<CUmodule> Modules;
std::vector<CUcontext> Contexts;		std::vector<CUcontext> Contexts;
std::vector<std::vector<CUstream>> Streams;

// Device properties		// Device properties
std::vector<int> ThreadsPerBlock;		std::vector<int> ThreadsPerBlock;
std::vector<int> BlocksPerGrid;		std::vector<int> BlocksPerGrid;
std::vector<int> WarpSize;		std::vector<int> WarpSize;

// OpenMP properties		// OpenMP properties
std::vector<int> NumTeams;		std::vector<int> NumTeams;
std::vector<int> NumThreads;		std::vector<int> NumThreads;

// OpenMP Environment properties		// OpenMP Environment properties
int EnvNumTeams;		int EnvNumTeams;
int EnvTeamLimit;		int EnvTeamLimit;
int EnvNumStreams;

// OpenMP Requires Flags		// OpenMP Requires Flags
int64_t RequiresFlags;		int64_t RequiresFlags;

//static int EnvNumThreads;		// static int EnvNumThreads;
static const int HardTeamLimit = 1<<16; // 64k		static const int HardTeamLimit = 1 << 16; // 64k
static const int HardThreadLimit = 1024;		static const int HardThreadLimit = 1024;
static const int DefaultNumTeams = 128;		static const int DefaultNumTeams = 128;
static const int DefaultNumThreads = 128;		static const int DefaultNumThreads = 128;

		// Forwards to the stream manager and returns the CUstream it hands out
		// for the given device.
		CUstream getStream(const int DeviceId) {
		  StreamManagerTy &Manager = *StreamManager;
		  return Manager.getStream(DeviceId);
		}

		void returnStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) {
		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
		assert(AsyncInfoPtr->Queue && "AsyncInfoPtr->Queue is nullptr");

		StreamManager->returnStream(
		DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
		AsyncInfoPtr->Queue = nullptr;
		}

// Record entry point associated with device		// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {		void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
assert(device_id < (int32_t)FuncGblEntries.size() &&		assert(device_id < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");		"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();		FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

E.Entries.push_back(entry);		E.Entries.push_back(entry);
}		}
Show All 39 Lines	void clearOffloadEntriesTable(int32_t device_id) {
assert(device_id < (int32_t)FuncGblEntries.size() &&		assert(device_id < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");		"Unexpected device id!");
FuncGblEntries[device_id].emplace_back();		FuncGblEntries[device_id].emplace_back();
FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();		FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
E.Entries.clear();		E.Entries.clear();
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;		E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
}		}

// Pick a stream of the given device in round-robin order: the per-device
// counter is atomically incremented and its previous value, taken modulo
// EnvNumStreams, selects the slot.
CUstream &getNextStream(const int DeviceId) {
  assert(DeviceId >= 0 &&
         static_cast<size_t>(DeviceId) < NextStreamId.size() &&
         "Unexpected device id!");
  const unsigned int Ticket = NextStreamId[DeviceId]->fetch_add(1);
  std::vector<CUstream> &DeviceStreams = Streams[DeviceId];
  return DeviceStreams[Ticket % EnvNumStreams];
}

RTLDeviceInfoTy() {		RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG		#ifdef OMPTARGET_DEBUG
if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {		if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
DebugLevel = std::stoi(envStr);		DebugLevel = std::stoi(envStr);
}		}
#endif // OMPTARGET_DEBUG		#endif // OMPTARGET_DEBUG

DP("Start initializing CUDA\n");		DP("Start initializing CUDA\n");

CUresult err = cuInit(0);		CUresult err = cuInit(0);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when initializing CUDA\n");		DP("Error when initializing CUDA\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
return;		return;
}		}

NumberOfDevices = 0;		NumberOfDevices = 0;

err = cuDeviceGetCount(&NumberOfDevices);		err = cuDeviceGetCount(&NumberOfDevices);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when getting CUDA device count\n");		DP("Error when getting CUDA device count\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
		jdoerfertUnsubmitted Done Reply Inline Actions This is clever. So clever in fact that we need a comment above `StreamPool` (or similar) describing how the different values interact and how the `StreamPool` evolves. Basically how it works. jdoerfert: This is clever. So clever in fact that we need a comment above `StreamPool` (or similar)…
return;		return;
}		}

if (NumberOfDevices == 0) {		if (NumberOfDevices == 0) {
DP("There are no devices supporting CUDA.\n");		DP("There are no devices supporting CUDA.\n");
return;		return;
}		}

FuncGblEntries.resize(NumberOfDevices);		FuncGblEntries.resize(NumberOfDevices);
Contexts.resize(NumberOfDevices);		Contexts.resize(NumberOfDevices);
Streams.resize(NumberOfDevices);
NextStreamId.resize(NumberOfDevices);
ThreadsPerBlock.resize(NumberOfDevices);		ThreadsPerBlock.resize(NumberOfDevices);
BlocksPerGrid.resize(NumberOfDevices);		BlocksPerGrid.resize(NumberOfDevices);
WarpSize.resize(NumberOfDevices);		WarpSize.resize(NumberOfDevices);
NumTeams.resize(NumberOfDevices);		NumTeams.resize(NumberOfDevices);
NumThreads.resize(NumberOfDevices);		NumThreads.resize(NumberOfDevices);

// Get environment variables regarding teams		// Get environment variables regarding teams
char *envStr = getenv("OMP_TEAM_LIMIT");		char *envStr = getenv("OMP_TEAM_LIMIT");
if (envStr) {		if (envStr) {
// OMP_TEAM_LIMIT has been set		// OMP_TEAM_LIMIT has been set
EnvTeamLimit = std::stoi(envStr);		EnvTeamLimit = std::stoi(envStr);
DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);		DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
} else {		} else {
EnvTeamLimit = -1;		EnvTeamLimit = -1;
}		}
envStr = getenv("OMP_NUM_TEAMS");		envStr = getenv("OMP_NUM_TEAMS");
if (envStr) {		if (envStr) {
// OMP_NUM_TEAMS has been set		// OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(envStr);		EnvNumTeams = std::stoi(envStr);
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);		DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
} else {		} else {
EnvNumTeams = -1;		EnvNumTeams = -1;
}		}

// By default let's create 256 streams per device		StreamManager =
EnvNumStreams = 256;		std::make_shared<StreamManagerTy>(NumberOfDevices, &Contexts);
envStr = getenv("LIBOMPTARGET_NUM_STREAMS");
if (envStr) {
EnvNumStreams = std::stoi(envStr);
}

// Initialize streams for each device
for (std::vector<CUstream> &S : Streams) {
S.resize(EnvNumStreams);
}

// Initialize the next stream id
for (std::unique_ptr<std::atomic_uint> &Ptr : NextStreamId) {
Ptr = std::make_unique<std::atomic_uint>(0);
}

// Default state.		// Default state.
RequiresFlags = OMP_REQ_UNDEFINED;		RequiresFlags = OMP_REQ_UNDEFINED;
}		}

~RTLDeviceInfoTy() {		~RTLDeviceInfoTy() {
		// First destruct stream manager in case of Contexts is destructed before it
		StreamManager = nullptr;

// Close modules		// Close modules
for (auto &module : Modules)		for (auto &module : Modules)
if (module) {		if (module) {
CUresult err = cuModuleUnload(module);		CUresult err = cuModuleUnload(module);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when unloading CUDA module\n");		DP("Error when unloading CUDA module\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
}		}
}		}

// Destroy streams before contexts
for (int I = 0; I < NumberOfDevices; ++I) {
CUresult err = cuCtxSetCurrent(Contexts[I]);
if (err != CUDA_SUCCESS) {
DP("Error when setting current CUDA context\n");
CUDA_ERR_STRING(err);
}

for (auto &S : Streams[I])
if (S) {
err = cuStreamDestroy(S);
if (err != CUDA_SUCCESS) {
DP("Error when destroying CUDA stream\n");
CUDA_ERR_STRING(err);
}
}
}

// Destroy contexts		// Destroy contexts
for (auto &ctx : Contexts)		for (auto &ctx : Contexts)
if (ctx) {		if (ctx) {
CUresult err = cuCtxDestroy(ctx);		CUresult err = cuCtxDestroy(ctx);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when destroying CUDA context\n");		DP("Error when destroying CUDA context\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
}		}
}		}
}		}
};		};

static RTLDeviceInfoTy DeviceInfo;		static RTLDeviceInfoTy DeviceInfo;

namespace {		namespace {
CUstream selectStream(int32_t Id, __tgt_async_info *AsyncInfo) {		CUstream getStream(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr) {
if (!AsyncInfo)		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
return DeviceInfo.getNextStream(Id);

if (!AsyncInfo->Queue)		if (!AsyncInfoPtr->Queue)
AsyncInfo->Queue = DeviceInfo.getNextStream(Id);		AsyncInfoPtr->Queue = DeviceInfo.getStream(DeviceId);

return reinterpret_cast<CUstream>(AsyncInfo->Queue);		return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
}		}

int32_t dataRetrieve(int32_t DeviceId, void HstPtr, void TgtPtr, int64_t Size,		int32_t dataRetrieve(int32_t DeviceId, void HstPtr, void TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfoPtr) {		__tgt_async_info *AsyncInfoPtr) {
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
// Set the context we are using.		// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);		CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");		DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}

CUstream Stream = selectStream(DeviceId, AsyncInfoPtr);		CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);		err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when copying data from device to host. Pointers: host = " DPxMOD		DP("Error when copying data from device to host. Pointers: host = " DPxMOD
", device = " DPxMOD ", size = %" PRId64 "\n",		", device = " DPxMOD ", size = %" PRId64 "\n",
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);		DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}

return OFFLOAD_SUCCESS;		return OFFLOAD_SUCCESS;
}		}

int32_t dataSubmit(int32_t DeviceId, void TgtPtr, void HstPtr, int64_t Size,		int32_t dataSubmit(int32_t DeviceId, void TgtPtr, void HstPtr, int64_t Size,
__tgt_async_info *AsyncInfoPtr) {		__tgt_async_info *AsyncInfoPtr) {
assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
// Set the context we are using.		// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);		CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");		DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}

CUstream Stream = selectStream(DeviceId, AsyncInfoPtr);		CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);		err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when copying data from host to device. Pointers: host = " DPxMOD		DP("Error when copying data from host to device. Pointers: host = " DPxMOD
", device = " DPxMOD ", size = %" PRId64 "\n",		", device = " DPxMOD ", size = %" PRId64 "\n",
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);		DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
Show All 40 Lines	int32_t __tgt_rtl_init_device(int32_t device_id) {
}		}

err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);		err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when setting current CUDA context\n");		DP("Error when setting current CUDA context\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
}		}

for (CUstream &Stream : DeviceInfo.Streams[device_id]) {		for (CUstream &Stream : DeviceInfo.StreamManager->StreamPool[device_id]) {
err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);		err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error when creating CUDA stream\n");		DP("Error when creating CUDA stream\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
}		}
		jdoerfertUnsubmitted Not Done Reply Inline Actions I was expecting this to be part of the StreamManager. If it can only happen at this point, maybe we want a `StreamManager::initializeDevice` method or similar. We can then also avoid exposing the stream pool to the outside. jdoerfert: I was expecting this to be part of the StreamManager. If it can only happen at this point…
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions Yes, we could do that. But what I'm thinking is, do we assume `cuCtxSetCurrent` is called before we call `StreamManager::initializeDevice`, or we will set the context again in `StreamManager::initializeDevice`? tianshilei1992: Yes, we could do that. But what I'm thinking is, do we assume `cuCtxSetCurrent` is called…
		jdoerfertUnsubmitted Not Done Reply Inline Actions One could move both to the `StreamManager::initializeDevice`, set ctx and create streams. One could then either reset the context or just document that the context is set by `initializeDevice`. This is a one time cost so setting the context twice is not my main concern. jdoerfert: One could move both to the `StreamManager::initializeDevice`, set ctx and create streams. One…
		tianshilei1992AuthorUnsubmitted Not Done Reply Inline Actions Yes, in the next patch, all these issues no longer exist. Here let's assume that when calling `StreamManager::initializeDevice`, the right context has been set. tianshilei1992: Yes, in the next patch, all these issues no longer exist. Here let's assume that when calling…
}		}

// Query attributes to determine number of threads/block and blocks/grid.		// Query attributes to determine number of threads/block and blocks/grid.
int maxGridDimX;		int maxGridDimX;
err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,		err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
cuDevice);		cuDevice);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Error getting max grid dimension, use default\n");		DP("Error getting max grid dimension, use default\n");
▲ Show 20 Lines • Show All 444 Lines • ▼ Show 20 Lines	if (team_num <= 0) {
cudaBlocksPerGrid = team_num;		cudaBlocksPerGrid = team_num;
DP("Using requested number of teams %d\n", team_num);		DP("Using requested number of teams %d\n", team_num);
}		}

// Run on the device.		// Run on the device.
DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,		DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
cudaThreadsPerBlock);		cudaThreadsPerBlock);

CUstream Stream = selectStream(device_id, async_info);		CUstream Stream = getStream(device_id, async_info);
err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,		err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
cudaThreadsPerBlock, 1, 1, 0 /bytes of shared memory/,		cudaThreadsPerBlock, 1, 1, 0 /bytes of shared memory/,
Stream, &args[0], 0);		Stream, &args[0], 0);
if (err != CUDA_SUCCESS) {		if (err != CUDA_SUCCESS) {
DP("Device kernel launch failed!\n");		DP("Device kernel launch failed!\n");
CUDA_ERR_STRING(err);		CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}
Show All 24 Lines	int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
CUresult Err = cuStreamSynchronize(Stream);		CUresult Err = cuStreamSynchronize(Stream);
if (Err != CUDA_SUCCESS) {		if (Err != CUDA_SUCCESS) {
DP("Error when synchronizing stream. stream = " DPxMOD		DP("Error when synchronizing stream. stream = " DPxMOD
", async info ptr = " DPxMOD "\n",		", async info ptr = " DPxMOD "\n",
DPxPTR(Stream), DPxPTR(async_info));		DPxPTR(Stream), DPxPTR(async_info));
CUDA_ERR_STRING(Err);		CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}

		// Once the stream is synchronized, return it to stream pool and reset
		// async_info. This is to make sure the synchronization only works for its own
		// tasks.
		DeviceInfo.returnStream(device_id, async_info);

return OFFLOAD_SUCCESS;		return OFFLOAD_SUCCESS;
}		}

#ifdef __cplusplus		#ifdef __cplusplus
}		}
#endif		#endif

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Introduce stream pool to make sure the correctness of device synchronization
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 255779

openmp/libomptarget/plugins/cuda/src/rtl.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Introduce stream pool to make sure the correctness of device synchronizationClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 255779

openmp/libomptarget/plugins/cuda/src/rtl.cpp

[OpenMP] Introduce stream pool to make sure the correctness of device synchronization
ClosedPublic