Diff 266707

openmp/libomptarget/include/omptargetplugin.h

	Show All 25 Lines

	// Return an integer different from zero if the provided device image can be			// Return an integer different from zero if the provided device image can be
	// supported by the runtime. The functionality is similar to comparing the			// supported by the runtime. The functionality is similar to comparing the
	// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a			// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a
	// lightweight query to determine if the RTL is suitable for an image without			// lightweight query to determine if the RTL is suitable for an image without
	// having to load the library, which can be expensive.			// having to load the library, which can be expensive.
	int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);			int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);

				// Return an integer other than zero if the data can be exchanged from SrcDevId
				// to DstDevId. If the data is exchangeable, the device plugin should provide
				// a function to move data from the source device to the destination device directly.
				int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId);

	// Initialize the requires flags for the device.			// Initialize the requires flags for the device.
	int64_t __tgt_rtl_init_requires(int64_t RequiresFlags);			int64_t __tgt_rtl_init_requires(int64_t RequiresFlags);

	// Initialize the specified device. In case of success return 0; otherwise			// Initialize the specified device. In case of success return 0; otherwise
	// return an error code.			// return an error code.
	int32_t __tgt_rtl_init_device(int32_t ID);			int32_t __tgt_rtl_init_device(int32_t ID);

	// Pass an executable image section described by image to the specified			// Pass an executable image section described by image to the specified
	Show All 30 Lines
	int32_t __tgt_rtl_data_retrieve(int32_t ID, void HostPtr, void TargetPtr,			int32_t __tgt_rtl_data_retrieve(int32_t ID, void HostPtr, void TargetPtr,
	int64_t Size);			int64_t Size);

	// Asynchronous version of __tgt_rtl_data_retrieve			// Asynchronous version of __tgt_rtl_data_retrieve
	int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr,			int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr,
	void *TargetPtr, int64_t Size,			void *TargetPtr, int64_t Size,
	__tgt_async_info *AsyncInfoPtr);			__tgt_async_info *AsyncInfoPtr);

				// Copy the data content from one target device to another target device using
				// its address. This operation does not need to copy data back to host and then
				// from host to another device. In case of success, return zero. Otherwise,
				// return an error code.
				//   SrcID/SrcPtr - source device and the address to read from
				//   DstID/DstPtr - destination device and the address to write to
				//   Size         - number of bytes to copy
				int32_t __tgt_rtl_data_exchange(int32_t SrcID, void *SrcPtr, int32_t DstID,
				                                void *DstPtr, int64_t Size);

				// Asynchronous version of __tgt_rtl_data_exchange. Takes the same arguments
				// plus AsyncInfoPtr, the async info object the transfer is associated with.
				int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
				                                      int32_t DstID, void *DstPtr, int64_t Size,
				                                      __tgt_async_info *AsyncInfoPtr);

	// De-allocate the data referenced by target ptr on the device. In case of			// De-allocate the data referenced by target ptr on the device. In case of
	// success, return zero. Otherwise, return an error code.			// success, return zero. Otherwise, return an error code.
	int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);			int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);

	// Transfer control to the offloaded entry Entry on the target device.			// Transfer control to the offloaded entry Entry on the target device.
	// Args and Offsets are arrays of NumArgs size of target addresses and			// Args and Offsets are arrays of NumArgs size of target addresses and
	// offsets. An offset should be added to the target address before passing it			// offsets. An offset should be added to the target address before passing it
	// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is			// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is
	Show All 36 Lines

openmp/libomptarget/plugins/cuda/src/rtl.cpp

Show First 20 Lines • Show All 730 Lines • ▼ Show 20 Lines	if (Err != CUDA_SUCCESS) {
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);		DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
CUDA_ERR_STRING(Err);		CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
}		}

return OFFLOAD_SUCCESS;		return OFFLOAD_SUCCESS;
}		}

		int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId,
		const void *DstPtr, int64_t Size,
		__tgt_async_info *AsyncInfoPtr) const {
		assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

		CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
		return OFFLOAD_FAIL;

		CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);

		// If they are two devices, we try peer to peer copy first
		if (SrcDevId != DstDevId) {
		int CanAccessPeer = 0;
		Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
		if (Err != CUDA_SUCCESS) {
		DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
		", dst = %" PRId32 "\n",
		SrcDevId, DstDevId);
		CUDA_ERR_STRING(Err);
		goto MemcpyDtoD;
		}

		if (!CanAccessPeer) {
		DP("P2P memcpy not supported so fall back to D2D memcpy");
		goto MemcpyDtoD;
		}

		Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
		if (Err != CUDA_SUCCESS) {
		DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
		", dst = %" PRId32 "\n",
		SrcDevId, DstDevId);
		CUDA_ERR_STRING(Err);
		goto MemcpyDtoD;
		}

		Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
		(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
		Size, Stream);
		if (Err == CUDA_SUCCESS)
		return OFFLOAD_SUCCESS;

		DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
		", src_id =%" PRId32 ", dst_ptr = %" DPxMOD ", dst_id =%" PRId32 "\n",
		SrcPtr, SrcDevId, DstPtr, DstDevId);
		CUDA_ERR_STRING(Err);
		}

		MemcpyDtoD:
		jdoerfertUnsubmitted Done Reply Inline Actions Any reason not to make this a function and call it? jdoerfert: Any reason not to make this a function and call it?
		Err = cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size,
		Stream);
		if (Err != CUDA_SUCCESS) {
		DP("Error when copying data from device to device. Pointers: src "
		"= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
		DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
		CUDA_ERR_STRING(Err);
		return OFFLOAD_FAIL;
		}

		return OFFLOAD_SUCCESS;
		}

int dataDelete(const int DeviceId, void *TgtPtr) const {		int dataDelete(const int DeviceId, void *TgtPtr) const {
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);		CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))		if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;

Err = cuMemFree((CUdeviceptr)TgtPtr);		Err = cuMemFree((CUdeviceptr)TgtPtr);
if (!checkResult(Err, "Error returned from cuMemFree\n"))		if (!checkResult(Err, "Error returned from cuMemFree\n"))
return OFFLOAD_FAIL;		return OFFLOAD_FAIL;
▲ Show 20 Lines • Show All 148 Lines • ▼ Show 20 Lines
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }		int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {		int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
DP("Init requires flags to %ld\n", RequiresFlags);		DP("Init requires flags to %ld\n", RequiresFlags);
DeviceRTL.setRequiresFlag(RequiresFlags);		DeviceRTL.setRequiresFlag(RequiresFlags);
return RequiresFlags;		return RequiresFlags;
}		}

		// Query whether data can be exchanged between the two given devices of this
		// plugin. Returns OFFLOAD_SUCCESS when both ids are valid device ids and
		// OFFLOAD_FAIL otherwise.
		// NOTE(review): the plugin header describes this query as returning an
		// integer other than zero when exchange is possible, while this returns
		// OFFLOAD_SUCCESS and DeviceTy::isDataExchangable compares the result against
		// OFFLOAD_SUCCESS. Confirm which convention is intended before changing
		// either side.
		int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
		  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
		      DeviceRTL.isValidDeviceId(dst_dev_id))
		    return OFFLOAD_SUCCESS;

		  return OFFLOAD_FAIL;
		}

int32_t __tgt_rtl_init_device(int32_t device_id) {		int32_t __tgt_rtl_init_device(int32_t device_id) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");		assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

return DeviceRTL.initDevice(device_id);		return DeviceRTL.initDevice(device_id);
}		}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,		__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
__tgt_device_image *image) {		__tgt_device_image *image) {
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines	int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
__tgt_async_info *async_info_ptr) {		__tgt_async_info *async_info_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");		assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
assert(async_info_ptr && "async_info_ptr is nullptr");		assert(async_info_ptr && "async_info_ptr is nullptr");

return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,		return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
async_info_ptr);		async_info_ptr);
}		}

		// Plugin entry point for an asynchronous device-to-device copy of size bytes
		// from src_ptr on src_dev_id to dst_ptr on dst_dev_id. Both device ids and
		// async_info_ptr are checked with assertions only. The work is forwarded to
		// DeviceRTL.dataExchange and its result is returned unchanged.
		int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
		                                      int32_t dst_dev_id, void *dst_ptr,
		                                      int64_t size,
		                                      __tgt_async_info *async_info_ptr) {
		  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
		  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
		  assert(async_info_ptr && "async_info_ptr is nullptr");

		  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
		                                async_info_ptr);
		}

		int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
		int32_t dst_dev_id, void *dst_ptr,
		int64_t size) {
		assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
		assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

		__tgt_async_info async_info;
		const int32_t rc = __tgt_rtl_data_exchange_async(
		src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
		if (rc != OFFLOAD_SUCCESS)
		return OFFLOAD_FAIL;

		return __tgt_rtl_synchronize(src_dev_id, &async_info);
		}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {		int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");		assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

return DeviceRTL.dataDelete(device_id, tgt_ptr);		return DeviceRTL.dataDelete(device_id, tgt_ptr);
}		}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,		int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args,		void **tgt_args,
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

openmp/libomptarget/plugins/exports

	VERS1.0 {			VERS1.0 {
	global:			global:
	__tgt_rtl_is_valid_binary;			__tgt_rtl_is_valid_binary;
				__tgt_rtl_is_data_exchangable;
	__tgt_rtl_number_of_devices;			__tgt_rtl_number_of_devices;
	__tgt_rtl_init_requires;			__tgt_rtl_init_requires;
	__tgt_rtl_init_device;			__tgt_rtl_init_device;
	__tgt_rtl_load_binary;			__tgt_rtl_load_binary;
	__tgt_rtl_data_alloc;			__tgt_rtl_data_alloc;
	__tgt_rtl_data_submit;			__tgt_rtl_data_submit;
	__tgt_rtl_data_submit_async;			__tgt_rtl_data_submit_async;
	__tgt_rtl_data_retrieve;			__tgt_rtl_data_retrieve;
	__tgt_rtl_data_retrieve_async;			__tgt_rtl_data_retrieve_async;
				__tgt_rtl_data_exchange;
				__tgt_rtl_data_exchange_async;
	__tgt_rtl_data_delete;			__tgt_rtl_data_delete;
	__tgt_rtl_run_target_team_region;			__tgt_rtl_run_target_team_region;
	__tgt_rtl_run_target_team_region_async;			__tgt_rtl_run_target_team_region_async;
	__tgt_rtl_run_target_region;			__tgt_rtl_run_target_region;
	__tgt_rtl_run_target_region_async;			__tgt_rtl_run_target_region_async;
	__tgt_rtl_synchronize;			__tgt_rtl_synchronize;
	local:			local:
	*;			*;
	};			};

openmp/libomptarget/src/api.cpp

Show First 20 Lines • Show All 162 Lines • ▼ Show 20 Lines	if (src_device == omp_get_initial_device() &&
DeviceTy& DstDev = Devices[dst_device];		DeviceTy& DstDev = Devices[dst_device];
rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr);		rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr);
} else if (dst_device == omp_get_initial_device()) {		} else if (dst_device == omp_get_initial_device()) {
DP("copy from device to host\n");		DP("copy from device to host\n");
DeviceTy& SrcDev = Devices[src_device];		DeviceTy& SrcDev = Devices[src_device];
rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr);		rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr);
} else {		} else {
DP("copy from device to device\n");		DP("copy from device to device\n");
void *buffer = malloc(length);
DeviceTy& SrcDev = Devices[src_device];		DeviceTy &SrcDev = Devices[src_device];
DeviceTy& DstDev = Devices[dst_device];		DeviceTy &DstDev = Devices[dst_device];
		// First try to use D2D memcpy which is more efficient. If it fails, fall back
		// to the inefficient way (staging the data through a host buffer).
		if (SrcDev.isDataExchangable(DstDev)) {
		rc = SrcDev.data_exchange(srcAddr, DstDev, dstAddr, length, nullptr);
		if (rc == OFFLOAD_SUCCESS)
		return OFFLOAD_SUCCESS;
		jdoerfertUnsubmitted Done Reply Inline Actions Please move this into the conditional. jdoerfert: Please move this into the conditional.
		}

		void *buffer = malloc(length);
rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr);		rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr);
if (rc == OFFLOAD_SUCCESS)		if (rc == OFFLOAD_SUCCESS)
rc = DstDev.data_submit(dstAddr, buffer, length, nullptr);		rc = DstDev.data_submit(dstAddr, buffer, length, nullptr);
free(buffer);		free(buffer);
}		}

DP("omp_target_memcpy returns %d\n", rc);		DP("omp_target_memcpy returns %d\n", rc);
return rc;		return rc;
▲ Show 20 Lines • Show All 110 Lines • Show Last 20 Lines

openmp/libomptarget/src/device.h

Show First 20 Lines • Show All 151 Lines • ▼ Show 20 Lines	DeviceTy& operator=(const DeviceTy &d) {
HostDataToTargetMap = d.HostDataToTargetMap;		HostDataToTargetMap = d.HostDataToTargetMap;
PendingCtorsDtors = d.PendingCtorsDtors;		PendingCtorsDtors = d.PendingCtorsDtors;
ShadowPtrMap = d.ShadowPtrMap;		ShadowPtrMap = d.ShadowPtrMap;
LoopTripCnt = d.LoopTripCnt;		LoopTripCnt = d.LoopTripCnt;

return *this;		return *this;
}		}

		// Return true if data can be copied to DstDevice directly
		jdoerfertUnsubmitted Done Reply Inline Actions Documentation please. jdoerfert: Documentation please.
		bool isDataExchangable(const DeviceTy& DstDevice);

uint64_t getMapEntryRefCnt(void *HstPtrBegin);		uint64_t getMapEntryRefCnt(void *HstPtrBegin);
LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);		LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
void getOrAllocTgtPtr(void HstPtrBegin, void *HstPtrBase, int64_t Size,		void getOrAllocTgtPtr(void HstPtrBegin, void *HstPtrBase, int64_t Size,
bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true,		bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true,
bool HasCloseModifier = false);		bool HasCloseModifier = false);
void getTgtPtrBegin(void HstPtrBegin, int64_t Size);		void getTgtPtrBegin(void HstPtrBegin, int64_t Size);
void getTgtPtrBegin(void HstPtrBegin, int64_t Size, bool &IsLast,		void getTgtPtrBegin(void HstPtrBegin, int64_t Size, bool &IsLast,
bool UpdateRefCount, bool &IsHostPtr);		bool UpdateRefCount, bool &IsHostPtr);
int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete,		int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete,
bool HasCloseModifier = false);		bool HasCloseModifier = false);
int associatePtr(void HstPtrBegin, void TgtPtrBegin, int64_t Size);		int associatePtr(void HstPtrBegin, void TgtPtrBegin, int64_t Size);
int disassociatePtr(void *HstPtrBegin);		int disassociatePtr(void *HstPtrBegin);

// calls to RTL		// calls to RTL
int32_t initOnce();		int32_t initOnce();
__tgt_target_table load_binary(void Img);		__tgt_target_table load_binary(void Img);

// Data transfer. When AsyncInfoPtr is nullptr, the transfer will be		// Data transfer. When AsyncInfoPtr is nullptr, the transfer will be
// synchronous.		// synchronous.
int32_t data_submit(void TgtPtrBegin, void HstPtrBegin, int64_t Size,		int32_t data_submit(void TgtPtrBegin, void HstPtrBegin, int64_t Size,
__tgt_async_info *AsyncInfoPtr);		__tgt_async_info *AsyncInfoPtr);
int32_t data_retrieve(void HstPtrBegin, void TgtPtrBegin, int64_t Size,		int32_t data_retrieve(void HstPtrBegin, void TgtPtrBegin, int64_t Size,
__tgt_async_info *AsyncInfoPtr);		__tgt_async_info *AsyncInfoPtr);
		// Copy Size bytes from SrcPtr on this device directly to DstPtr on DstDev,
		// using the RTL's data exchange entry points. Returns the RTL's result code.
		// NOTE(review): DstDev is taken by value, which copies the DeviceTy object;
		// a reference may be what is intended - confirm.
		int32_t data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr,
		                      int64_t Size, __tgt_async_info *AsyncInfoPtr);
		jdoerfertUnsubmitted Done Reply Inline Actions Documentation please. jdoerfert: Documentation please.
		jdoerfertUnsubmitted Done Reply Inline Actions ^ jdoerfert: ^

int32_t run_region(void TgtEntryPtr, void *TgtVarsPtr,		int32_t run_region(void TgtEntryPtr, void *TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,		ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
__tgt_async_info *AsyncInfoPtr);		__tgt_async_info *AsyncInfoPtr);
int32_t run_team_region(void TgtEntryPtr, void *TgtVarsPtr,		int32_t run_team_region(void TgtEntryPtr, void *TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,		ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
int32_t NumTeams, int32_t ThreadLimit,		int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount,		uint64_t LoopTripCount,
Show All 14 Lines

openmp/libomptarget/src/device.cpp

Show First 20 Lines • Show All 346 Lines • ▼ Show 20 Lines	int32_t DeviceTy::data_retrieve(void HstPtrBegin, void TgtPtrBegin,
int64_t Size, __tgt_async_info *AsyncInfoPtr) {		int64_t Size, __tgt_async_info *AsyncInfoPtr) {
if (!AsyncInfoPtr \|\| !RTL->data_retrieve_async \|\| !RTL->synchronize)		if (!AsyncInfoPtr \|\| !RTL->data_retrieve_async \|\| !RTL->synchronize)
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);		return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
else		else
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,		return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
AsyncInfoPtr);		AsyncInfoPtr);
}		}

		// Copy data from current device to destination device directly
		int32_t DeviceTy::data_exchange(void SrcPtr, DeviceTy DstDev, void DstPtr,
		int64_t Size, __tgt_async_info *AsyncInfoPtr) {
		if (!AsyncInfoPtr \|\| !RTL->data_exchange_async \|\| !RTL->synchronize) {
		assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
		return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
		Size);
		} else
		return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
		DstPtr, Size, AsyncInfoPtr);
		}

// Run region on device		// Run region on device
int32_t DeviceTy::run_region(void TgtEntryPtr, void *TgtVarsPtr,		int32_t DeviceTy::run_region(void TgtEntryPtr, void *TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,		ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
__tgt_async_info *AsyncInfoPtr) {		__tgt_async_info *AsyncInfoPtr) {
if (!AsyncInfoPtr \|\| !RTL->run_region \|\| !RTL->synchronize)		if (!AsyncInfoPtr \|\| !RTL->run_region \|\| !RTL->synchronize)
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,		return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
TgtVarsSize);		TgtVarsSize);
else		else
Show All 12 Lines	return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,		TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
LoopTripCount);		LoopTripCount);
else		else
return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,		return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams,		TgtOffsets, TgtVarsSize, NumTeams,
ThreadLimit, LoopTripCount, AsyncInfoPtr);		ThreadLimit, LoopTripCount, AsyncInfoPtr);
}		}

		// Whether data can be copied to DstDevice directly.
		// Returns true only when all of the following hold:
		//   - both devices point to the same RTL object,
		//   - that RTL provides is_data_exchangable and it returns OFFLOAD_SUCCESS
		//     for this pair of RTL device ids,
		//   - the RTL provides data_exchange or data_exchange_async.
		bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
		  if (RTL != DstDevice.RTL || !RTL->is_data_exchangable)
		    return false;
		jdoerfertUnsubmitted Done Reply Inline Actions We use the name? I somehow feel uneasy about this. Don't we have some form of ID? jdoerfert: We use the name? I somehow feel uneasy about this. Don't we have some form of ID?
		grokosUnsubmitted Done Reply Inline Actions I agree. Be careful because `RTLInfoTy::RTLName` is only available in debug builds, so this piece of code will break if we compile the library in release mode. You can use a direct pointer comparison `RTL == OtherDevice.RTL` (devices managed by the same RTL will point to the same `RTLInfoTy` object). grokos: I agree. Be careful because `RTLInfoTy::RTLName` is only available in debug builds, so this…
		tianshilei1992AuthorUnsubmitted Done Reply Inline Actions Yes, that's why I removed the macro. The pointer comparison may not work considering that there is in fact one case can violate it: OpenCL ICD, although it is not part of OpenMP. Maybe adding a new plugin interface here is more appropriate. tianshilei1992: Yes, that's why I removed the macro. The pointer comparison may not work considering that there…

		  if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID) ==
		      OFFLOAD_SUCCESS)
		    return (RTL->data_exchange != nullptr) ||
		           (RTL->data_exchange_async != nullptr);

		  return false;
		}

/// Check whether a device has an associated RTL and initialize it if it's not		/// Check whether a device has an associated RTL and initialize it if it's not
/// already initialized.		/// already initialized.
bool device_is_ready(int device_num) {		bool device_is_ready(int device_num) {
DP("Checking whether device %d is ready.\n", device_num);		DP("Checking whether device %d is ready.\n", device_num);
// Devices.size() can only change while registering a new		// Devices.size() can only change while registering a new
// library, so try to acquire the lock of RTLs' mutex.		// library, so try to acquire the lock of RTLs' mutex.
RTLsMtx->lock();		RTLsMtx->lock();
size_t Devices_size = Devices.size();		size_t Devices_size = Devices.size();
Show All 22 Lines

openmp/libomptarget/src/rtl.h

Show All 20 Lines
#include <vector>		#include <vector>

// Forward declarations.		// Forward declarations.
struct DeviceTy;		struct DeviceTy;
struct __tgt_bin_desc;		struct __tgt_bin_desc;

struct RTLInfoTy {		struct RTLInfoTy {
typedef int32_t(is_valid_binary_ty)(void *);		typedef int32_t(is_valid_binary_ty)(void *);
		typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t);
typedef int32_t(number_of_devices_ty)();		typedef int32_t(number_of_devices_ty)();
typedef int32_t(init_device_ty)(int32_t);		typedef int32_t(init_device_ty)(int32_t);
typedef __tgt_target_table (load_binary_ty)(int32_t, void );		typedef __tgt_target_table (load_binary_ty)(int32_t, void );
typedef void (data_alloc_ty)(int32_t, int64_t, void );		typedef void (data_alloc_ty)(int32_t, int64_t, void );
typedef int32_t(data_submit_ty)(int32_t, void , void , int64_t);		typedef int32_t(data_submit_ty)(int32_t, void , void , int64_t);
typedef int32_t(data_submit_async_ty)(int32_t, void , void , int64_t,		typedef int32_t(data_submit_async_ty)(int32_t, void , void , int64_t,
__tgt_async_info *);		__tgt_async_info *);
typedef int32_t(data_retrieve_ty)(int32_t, void , void , int64_t);		typedef int32_t(data_retrieve_ty)(int32_t, void , void , int64_t);
typedef int32_t(data_retrieve_async_ty)(int32_t, void , void , int64_t,		typedef int32_t(data_retrieve_async_ty)(int32_t, void , void , int64_t,
__tgt_async_info *);		__tgt_async_info *);
		typedef int32_t(data_exchange_ty)(int32_t, void , int32_t, void , int64_t);
		typedef int32_t(data_exchange_async_ty)(int32_t, void , int32_t, void ,
		int64_t, __tgt_async_info *);
typedef int32_t(data_delete_ty)(int32_t, void *);		typedef int32_t(data_delete_ty)(int32_t, void *);
typedef int32_t(run_region_ty)(int32_t, void , void , ptrdiff_t ,		typedef int32_t(run_region_ty)(int32_t, void , void , ptrdiff_t ,
int32_t);		int32_t);
typedef int32_t(run_region_async_ty)(int32_t, void , void , ptrdiff_t ,		typedef int32_t(run_region_async_ty)(int32_t, void , void , ptrdiff_t ,
int32_t, __tgt_async_info *);		int32_t, __tgt_async_info *);
typedef int32_t(run_team_region_ty)(int32_t, void , void , ptrdiff_t ,		typedef int32_t(run_team_region_ty)(int32_t, void , void , ptrdiff_t ,
int32_t, int32_t, int32_t, uint64_t);		int32_t, int32_t, int32_t, uint64_t);
typedef int32_t(run_team_region_async_ty)(int32_t, void , void *,		typedef int32_t(run_team_region_async_ty)(int32_t, void , void *,
Show All 12 Lines	struct RTLInfoTy {
void *LibraryHandler = nullptr;		void *LibraryHandler = nullptr;

#ifdef OMPTARGET_DEBUG		#ifdef OMPTARGET_DEBUG
std::string RTLName;		std::string RTLName;
#endif		#endif

// Functions implemented in the RTL.		// Functions implemented in the RTL.
is_valid_binary_ty *is_valid_binary = nullptr;		is_valid_binary_ty *is_valid_binary = nullptr;
		is_data_exchangable_ty *is_data_exchangable = nullptr;
number_of_devices_ty *number_of_devices = nullptr;		number_of_devices_ty *number_of_devices = nullptr;
init_device_ty *init_device = nullptr;		init_device_ty *init_device = nullptr;
load_binary_ty *load_binary = nullptr;		load_binary_ty *load_binary = nullptr;
data_alloc_ty *data_alloc = nullptr;		data_alloc_ty *data_alloc = nullptr;
data_submit_ty *data_submit = nullptr;		data_submit_ty *data_submit = nullptr;
data_submit_async_ty *data_submit_async = nullptr;		data_submit_async_ty *data_submit_async = nullptr;
data_retrieve_ty *data_retrieve = nullptr;		data_retrieve_ty *data_retrieve = nullptr;
data_retrieve_async_ty *data_retrieve_async = nullptr;		data_retrieve_async_ty *data_retrieve_async = nullptr;
		data_exchange_ty *data_exchange = nullptr;
		data_exchange_async_ty *data_exchange_async = nullptr;
data_delete_ty *data_delete = nullptr;		data_delete_ty *data_delete = nullptr;
run_region_ty *run_region = nullptr;		run_region_ty *run_region = nullptr;
run_region_async_ty *run_region_async = nullptr;		run_region_async_ty *run_region_async = nullptr;
run_team_region_ty *run_team_region = nullptr;		run_team_region_ty *run_team_region = nullptr;
run_team_region_async_ty *run_team_region_async = nullptr;		run_team_region_async_ty *run_team_region_async = nullptr;
init_requires_ty *init_requires = nullptr;		init_requires_ty *init_requires = nullptr;
synchronize_ty *synchronize = nullptr;		synchronize_ty *synchronize = nullptr;

Show All 12 Lines	#endif
RTLInfoTy(const RTLInfoTy &r) {		RTLInfoTy(const RTLInfoTy &r) {
Idx = r.Idx;		Idx = r.Idx;
NumberOfDevices = r.NumberOfDevices;		NumberOfDevices = r.NumberOfDevices;
LibraryHandler = r.LibraryHandler;		LibraryHandler = r.LibraryHandler;
#ifdef OMPTARGET_DEBUG		#ifdef OMPTARGET_DEBUG
RTLName = r.RTLName;		RTLName = r.RTLName;
#endif		#endif
is_valid_binary = r.is_valid_binary;		is_valid_binary = r.is_valid_binary;
		is_data_exchangable = r.is_data_exchangable;
number_of_devices = r.number_of_devices;		number_of_devices = r.number_of_devices;
init_device = r.init_device;		init_device = r.init_device;
load_binary = r.load_binary;		load_binary = r.load_binary;
data_alloc = r.data_alloc;		data_alloc = r.data_alloc;
data_submit = r.data_submit;		data_submit = r.data_submit;
data_submit_async = r.data_submit_async;		data_submit_async = r.data_submit_async;
data_retrieve = r.data_retrieve;		data_retrieve = r.data_retrieve;
data_retrieve_async = r.data_retrieve_async;		data_retrieve_async = r.data_retrieve_async;
		data_exchange = r.data_exchange;
		data_exchange_async = r.data_exchange_async;
data_delete = r.data_delete;		data_delete = r.data_delete;
run_region = r.run_region;		run_region = r.run_region;
run_region_async = r.run_region_async;		run_region_async = r.run_region_async;
run_team_region = r.run_team_region;		run_team_region = r.run_team_region;
run_team_region_async = r.run_team_region_async;		run_team_region_async = r.run_team_region_async;
init_requires = r.init_requires;		init_requires = r.init_requires;
isUsed = r.isUsed;		isUsed = r.isUsed;
synchronize = r.synchronize;		synchronize = r.synchronize;
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

openmp/libomptarget/src/rtl.cpp

Show First 20 Lines • Show All 134 Lines • ▼ Show 20 Lines	((void *)&R.data_submit_async) =
dlsym(dynlib_handle, "__tgt_rtl_data_submit_async");		dlsym(dynlib_handle, "__tgt_rtl_data_submit_async");
((void *)&R.data_retrieve_async) =		((void *)&R.data_retrieve_async) =
dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async");		dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async");
((void *)&R.run_region_async) =		((void *)&R.run_region_async) =
dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async");		dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async");
((void *)&R.run_team_region_async) =		((void *)&R.run_team_region_async) =
dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async");		dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async");
((void *)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize");		((void *)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize");
		((void *)&R.data_exchange) =
		dlsym(dynlib_handle, "__tgt_rtl_data_exchange");
		((void *)&R.data_exchange_async) =
		dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async");
		((void *)&R.is_data_exchangable) =
		dlsym(dynlib_handle, "__tgt_rtl_is_data_exchangable");

// No devices are supported by this RTL?		// No devices are supported by this RTL?
if (!(R.NumberOfDevices = R.number_of_devices())) {		if (!(R.NumberOfDevices = R.number_of_devices())) {
DP("No devices supported in this RTL\n");		DP("No devices supported in this RTL\n");
continue;		continue;
}		}

DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(),		DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(),
▲ Show 20 Lines • Show All 285 Lines • Show Last 20 Lines

openmp/libomptarget/test/offloading/d2d_memcpy.c

This file was added.

				// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu \| %fcheck-aarch64-unknown-linux-gnu
				// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu \| %fcheck-powerpc64-ibm-linux-gnu
				// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu \| %fcheck-powerpc64le-ibm-linux-gnu
				// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu \| %fcheck-x86_64-pc-linux-gnu -allow-empty

				#include <assert.h>
				#include <omp.h>
				#include <stdio.h>
				#include <stdlib.h>

				const int magic_num = 7;

				// Device-to-device omp_target_memcpy test.
				// Fills a buffer on the source device with magic_num, copies it to the
				// destination device with omp_target_memcpy, adds magic_num on the
				// destination device and checks the values on the host. Prints PASS right
				// away when no device is available.
				int main(int argc, char *argv[]) {
				  const int N = 128;
				  const int num_devices = omp_get_num_devices();

				  if (num_devices == 0) {
				    printf("PASS\n");
				    return 0;
				  }

				  // Use the first device as source and the last one as destination; with a
				  // single device both ids are 0.
				  const int src_device = 0;
				  const int dst_device = num_devices - 1;

				  size_t length = N * sizeof(int);
				protze.joachimUnsubmitted Not Done Reply Inline Actions What is this supposed to do? I guess you intended to use a different condition in the if, right? Why not unconditionally set the dst_device to num_devices-1? protze.joachim: What is this supposed to do? I guess you intended to use a different condition in the if…
				tianshilei1992AuthorUnsubmitted Done Reply Inline Actions Yeah, that would be better. tianshilei1992: Yeah, that would be better.
				  int *src_ptr = omp_target_alloc(length, src_device);
				  int *dst_ptr = omp_target_alloc(length, dst_device);

				  assert(src_ptr && "src_ptr is NULL");
				  assert(dst_ptr && "dst_ptr is NULL");

				#pragma omp target teams distribute parallel for device(src_device) \
				                         is_device_ptr(src_ptr)
				  for (int i = 0; i < N; ++i) {
				    src_ptr[i] = magic_num;
				  }

				  int rc =
				      omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);

				  assert(rc == 0 && "error in omp_target_memcpy");

				  int *buffer = malloc(length);

				  assert(buffer && "failed to malloc host buffer");

				#pragma omp target teams distribute parallel for device(dst_device) \
				                         map(from: buffer[0:N]) is_device_ptr(dst_ptr)
				  for (int i = 0; i < N; ++i) {
				    buffer[i] = dst_ptr[i] + magic_num;
				  }

				  for (int i = 0; i < N; ++i)
				    assert(buffer[i] == 2 * magic_num);

				  // Release the host buffer and both device allocations.
				  free(buffer);
				  omp_target_free(dst_ptr, dst_device);
				  omp_target_free(src_ptr, src_device);

				  printf("PASS\n");

				  return 0;
				}

				// CHECK: PASS

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Improve D2D memcpy to use more efficient driver API
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 266707

openmp/libomptarget/include/omptargetplugin.h

openmp/libomptarget/plugins/cuda/src/rtl.cpp

openmp/libomptarget/plugins/exports

openmp/libomptarget/src/api.cpp

openmp/libomptarget/src/device.h

openmp/libomptarget/src/device.cpp

openmp/libomptarget/src/rtl.h

openmp/libomptarget/src/rtl.cpp

openmp/libomptarget/test/offloading/d2d_memcpy.c

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP] Improve D2D memcpy to use more efficient driver APIClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 266707

openmp/libomptarget/include/omptargetplugin.h

openmp/libomptarget/plugins/cuda/src/rtl.cpp

openmp/libomptarget/plugins/exports

openmp/libomptarget/src/api.cpp

openmp/libomptarget/src/device.h

openmp/libomptarget/src/device.cpp

openmp/libomptarget/src/rtl.h

openmp/libomptarget/src/rtl.cpp

openmp/libomptarget/test/offloading/d2d_memcpy.c

[OpenMP] Improve D2D memcpy to use more efficient driver API
ClosedPublic