This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
openmp/libomptarget/deviceRTLs/
-
libomptarget/
-
deviceRTLs/
-
common/
-
omptarget.h
-
src/
2
data_sharing.cu
-
omp_data.cu
-
omptarget.cu
-
interface.h

Differential D104666

[Libomptarget] Improve device runtime implementation for globalized variables.
ClosedPublic

Authored by jhuber6 on Jun 21 2021, 1:00 PM.

Download Raw Diff

Details

Reviewers

jdoerfert
tianshilei1992

Commits

rG244e98ff4808: [Libomptarget] Improve device runtime implementation for globalized variables.

Summary

Currently the runtime implementation of __kmpc_alloc_shared is extremely slow because it allocates memory for each thread individually. This patch adds a small buffer for the threads to share data and will greatly improve performance for builds where all globalization could not be optimized out. If the shared buffer is full, then memory will be allocated per-warp rather than per-thread.

Depends on D97680

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jhuber6 created this revision.Jun 21 2021, 1:00 PM

Herald added a subscriber: jfb. · View Herald TranscriptJun 21 2021, 1:00 PM

jhuber6 requested review of this revision.Jun 21 2021, 1:00 PM

Herald added a project: Restricted Project. · View Herald TranscriptJun 21 2021, 1:00 PM

Herald added subscribers: openmp-commits, sstefan1. · View Herald Transcript

Harbormaster completed remote builds in B110274: Diff 353467.Jun 21 2021, 1:01 PM

tianshilei1992 added inline comments.Jun 21 2021, 1:21 PM

openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
42	IIRC, in the new `deviceRTLs`, we only have one stack where the first chunk, which is bigger than the rest, is for the main thread in non-SPMD mode. Why do we want to have two here?

jdoerfert added inline comments.Jun 21 2021, 1:41 PM

openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
42	Was easier to write it like this from scratch, either way works, no real difference, this might be actually nicer.

This revision is now accepted and ready to land.Jun 21 2021, 10:13 PM

This revision was landed with ongoing or failed builds.Jun 22 2021, 8:53 AM

Closed by commit rG244e98ff4808: [Libomptarget] Improve device runtime implementation for globalized variables. (authored by jhuber6). · Explain Why

This revision was automatically updated to reflect the committed changes.

jhuber6 added a commit: rG244e98ff4808: [Libomptarget] Improve device runtime implementation for globalized variables..

Revision Contents

Path

Size

openmp/

libomptarget/

deviceRTLs/

common/

omptarget.h

12 lines

src/

277 lines

5 lines

2 lines

6 lines

Diff 353467

openmp/libomptarget/deviceRTLs/common/omptarget.h

	Show First 20 Lines • Show All 79 Lines • ▼ Show 20 Lines
	struct __kmpc_data_sharing_slot {			struct __kmpc_data_sharing_slot {
	__kmpc_data_sharing_slot *Next;			__kmpc_data_sharing_slot *Next;
	__kmpc_data_sharing_slot *Prev;			__kmpc_data_sharing_slot *Prev;
	void *PrevSlotStackPtr;			void *PrevSlotStackPtr;
	void *DataEnd;			void *DataEnd;
	char Data[DS_Worker_Warp_Slot_Size];			char Data[DS_Worker_Warp_Slot_Size];
	};			};

	// Data structure to keep in shared memory that traces the current slot, stack,
	// and frame pointer as well as the active threads that didn't exit the current
	// environment.
	struct DataSharingStateTy {
	__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
	void *StackPtr[DS_Max_Warp_Number];
	void *volatile FramePtr[DS_Max_Warp_Number];
	__kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
	};

	extern DataSharingStateTy EXTERN_SHARED(DataSharingState);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// task ICV and (implicit & explicit) task state			// task ICV and (implicit & explicit) task state

	class omptarget_nvptx_TaskDescr {			class omptarget_nvptx_TaskDescr {
	public:			public:
	// methods for flags			// methods for flags
	INLINE omp_sched_t GetRuntimeSched() const;			INLINE omp_sched_t GetRuntimeSched() const;
	INLINE void SetRuntimeSched(omp_sched_t sched);			INLINE void SetRuntimeSched(omp_sched_t sched);
	▲ Show 20 Lines • Show All 236 Lines • Show Last 20 Lines

openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu

	Show All 18 Lines
	INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {			INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
	return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();			return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
	}			}

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Runtime functions for trunk data sharing scheme.			// Runtime functions for trunk data sharing scheme.
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	INLINE static void data_sharing_init_stack_common() {			static constexpr unsigned MinBytes = 8;
	ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
	omptarget_nvptx_TeamDescr *teamDescr =
	&omptarget_nvptx_threadPrivateContext->TeamContext();

	for (int WID = 0; WID < DS_Max_Warp_Number; WID++) {
	__kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
	DataSharingState.SlotPtr[WID] = RootS;
	DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
	}
	}

	// Initialize data sharing data structure. This function needs to be called
	// once at the beginning of a data sharing context (coincides with the kernel
	// initialization). This function is called only by the MASTER thread of each
	// team in non-SPMD mode.
	EXTERN void __kmpc_data_sharing_init_stack() {
	ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
	// This function initializes the stack pointer with the pointer to the
	// statically allocated shared memory slots. The size of a shared memory
	// slot is pre-determined to be 256 bytes.
	data_sharing_init_stack_common();
	}

	EXTERN void *__kmpc_alloc_shared(size_t DataSize) {
	return (void *)SafeMalloc(DataSize, "Alloc Shared");
	}

	EXTERN void __kmpc_free_shared(void *FrameStart) {			template <unsigned BytesPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
	SafeFree(FrameStart, "Free Shared");			struct alignas(32) ThreadStackTy {
				static constexpr unsigned MaxSize = NThreads * BytesPerThread;
				static constexpr unsigned NumThreads = NThreads;
				static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
				static constexpr unsigned MaxSizePerWarp = MaxSize / NumWarps;

				unsigned char Data[MaxSize];
				char Sizes[MaxSize / MinBytes];
				char SizeUsage[NumWarps];
				char Usage[NumWarps];
				};

				[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
				tianshilei1992Unsubmitted Not Done Reply Inline Actions IIRC, in the new `deviceRTLs`, we only have one stack where the first chunk, which is bigger than the rest, is for the main thread in non-SPMD mode. Why do we want to have two here? tianshilei1992: IIRC, in the new `deviceRTLs`, we only have one stack where the first chunk, which is bigger…
				jdoerfertUnsubmitted Not Done Reply Inline Actions Was easier to write it like this from scratch, either way works, no real difference, this might be actually nicer. jdoerfert: Was easier to write it like this from scratch, either way works, no real difference, this might…
				#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)

				[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 2,
				MAX_THREADS_PER_TEAM / 8>
				WorkerSharedStack;
				#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)

				template <typename AllocTy>
				static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
				unsigned WarpBytes) {
				void *Ptr;
				__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
				unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
				bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
				if (IsWarpLeader)
				Ptr = Alloc();
				// Get address from the first active lane.
				int *FP = (int *)&Ptr;
				FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], LeaderID);
				if (sizeof(Ptr) == 8)
				FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], LeaderID);
				return (void *)&((char *)(Ptr))[(GetLaneId() - LeaderID) * Bytes];
				}

				EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
				Bytes = Bytes + (Bytes % MinBytes);
				if (IsMasterThread(isSPMDMode())) {
				// Main thread alone, use shared memory if space is available.
				if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
				void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
				MainSharedStack.Usage[0] += Bytes;
				MainSharedStack.Sizes[MainSharedStack.SizeUsage[0]++] = Bytes;
				return Ptr;
	}			}
				} else {
	// Initialize data sharing data structure. This function needs to be called			int TID = GetThreadIdInBlock();
	// once at the beginning of a data sharing context (coincides with the kernel			int WID = GetWarpId();
	// initialization). This function is called in SPMD mode only.			unsigned WarpBytes = Bytes * WARPSIZE;
	EXTERN void __kmpc_data_sharing_init_stack_spmd() {			auto AllocSharedStack = [&]() {
	ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");			unsigned WarpOffset = WID * WorkerSharedStack.MaxSizePerWarp;
	// This function initializes the stack pointer with the pointer to the			void *Ptr =
	// statically allocated shared memory slots. The size of a shared memory			&WorkerSharedStack.Data[WarpOffset + WorkerSharedStack.Usage[WID]];
	// slot is pre-determined to be 256 bytes.			WorkerSharedStack.Usage[WID] += WarpBytes;
	if (GetThreadIdInBlock() == 0)			WorkerSharedStack.Sizes[WorkerSharedStack.SizeUsage[WID]++] = WarpBytes;
	data_sharing_init_stack_common();			return Ptr;
				};
	__kmpc_impl_threadfence_block();			if (TID < WorkerSharedStack.NumThreads &&
				WorkerSharedStack.Usage[WID] + WarpBytes <=
				WorkerSharedStack.MaxSizePerWarp)
				return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
				}
				// Fallback to malloc
				int TID = GetThreadIdInBlock();
				unsigned WarpBytes = Bytes * WARPSIZE;
				auto AllocGlobal = [&] {
				return SafeMalloc(WarpBytes, "AllocGlobalFallback");
				};
				return __kmpc_alloc_for_warp(AllocGlobal, Bytes, WarpBytes);
	}			}

	INLINE static void *data_sharing_push_stack_common(size_t PushSize) {			EXTERN void __kmpc_free_shared(void *Ptr) {
	ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

	// Only warp active master threads manage the stack.
	bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;

	// Add worst-case padding to DataSize so that future stack allocations are
	// correctly aligned.
	const size_t Alignment = 8;
	PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;

	// Frame pointer must be visible to all workers in the same warp.
	const unsigned WID = GetWarpId();
	void *FrameP = 0;
	__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();			__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
				unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
	if (IsWarpMaster) {			bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
	// SlotP will point to either the shared memory slot or an existing			__kmpc_syncwarp(CurActive);
	// global memory slot.			if (IsWarpLeader) {
	__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];			if (Ptr >= &MainSharedStack.Data[0] &&
	void *&StackP = DataSharingState.StackPtr[WID];			Ptr < &MainSharedStack.Data[MainSharedStack.MaxSize]) {
				unsigned Bytes = MainSharedStack.Sizes[--MainSharedStack.SizeUsage[0]];
	// Check if we have room for the data in the current slot.			MainSharedStack.Usage[0] -= Bytes;
	const uintptr_t StartAddress = (uintptr_t)StackP;			return;
	const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
	const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;

	// If we requested more data than there is room for in the rest
	// of the slot then we need to either re-use the next slot, if one exists,
	// or create a new slot.
	if (EndAddress < RequestedEndAddress) {
	__kmpc_data_sharing_slot *NewSlot = 0;
	size_t NewSize = PushSize;

	// Allocate at least the default size for each type of slot.
	// Master is a special case and even though there is only one thread,
	// it can share more things with the workers. For uniformity, it uses
	// the full size of a worker warp slot.
	size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
	if (DefaultSlotSize > NewSize)
	NewSize = DefaultSlotSize;
	NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc(
	sizeof(__kmpc_data_sharing_slot) + NewSize,
	"Global memory slot allocation.");

	NewSlot->Next = 0;
	NewSlot->Prev = SlotP;
	NewSlot->PrevSlotStackPtr = StackP;
	NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;

	// Make previous slot point to the newly allocated slot.
	SlotP->Next = NewSlot;
	// The current slot becomes the new slot.
	SlotP = NewSlot;
	// The stack pointer always points to the next free stack frame.
	StackP = &NewSlot->Data[0] + PushSize;
	// The frame pointer always points to the beginning of the frame.
	FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
	} else {
	// Add the data chunk to the current slot. The frame pointer is set to
	// point to the start of the new frame held in StackP.
	FrameP = DataSharingState.FramePtr[WID] = StackP;
	// Reset stack pointer to the requested address.
	StackP = (void *)RequestedEndAddress;
	}			}
				if (Ptr >= &WorkerSharedStack.Data[0] &&
				Ptr < &WorkerSharedStack.Data[WorkerSharedStack.MaxSize]) {
				int WID = GetWarpId();
				unsigned Bytes =
				WorkerSharedStack.Sizes[--WorkerSharedStack.SizeUsage[WID]];
				WorkerSharedStack.Usage[WID] -= Bytes;
				return;
	}			}
	// Get address from lane 0.			SafeFree(Ptr, "FreeGlobalFallback");
	int *FP = (int *)&FrameP;
	FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
	if (sizeof(FrameP) == 8)
	FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);

	return FrameP;
	}			}

	EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
	int16_t UseSharedMemory) {
	return data_sharing_push_stack_common(DataSize);
	}			}

	// Called at the time of the kernel initialization. This is used to initilize			EXTERN void __kmpc_data_sharing_init_stack() {
	// the list of references to shared variables and to pre-allocate global storage			for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i) {
	// for holding the globalized variables.			MainSharedStack.SizeUsage[i] = 0;
	//			MainSharedStack.Usage[i] = 0;
	// By default the globalized variables are stored in global memory. If the			}
	// UseSharedMemory is set to true, the runtime will attempt to use shared memory			for (unsigned i = 0; i < WorkerSharedStack.NumWarps; ++i) {
	// as long as the size requested fits the pre-allocated size.			WorkerSharedStack.SizeUsage[i] = 0;
	EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,			WorkerSharedStack.Usage[i] = 0;
	int16_t UseSharedMemory) {
	// Compute the total memory footprint of the requested data.
	// The master thread requires a stack only for itself. A worker
	// thread (which at this point is a warp master) will require
	// space for the variables of each thread in the warp,
	// i.e. one DataSize chunk per warp lane.
	// TODO: change WARPSIZE to the number of active threads in the warp.
	size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
	? DataSize
	: WARPSIZE * DataSize;

	// Compute the start address of the frame of each thread in the warp.
	uintptr_t FrameStartAddress =
	(uintptr_t)data_sharing_push_stack_common(PushSize);
	FrameStartAddress += (uintptr_t)(GetLaneId() * DataSize);
	return (void *)FrameStartAddress;
	}

	// Pop the stack and free any memory which can be reclaimed.
	//
	// When the pop operation removes the last global memory slot,
	// reclaim all outstanding global memory slots since it is
	// likely we have reached the end of the kernel.
	EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
	ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

	__kmpc_impl_threadfence_block();

	if (GetThreadIdInBlock() % WARPSIZE == 0) {
	unsigned WID = GetWarpId();

	// Current slot
	__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];

	// Pointer to next available stack.
	void *&StackP = DataSharingState.StackPtr[WID];

	// Pop the frame.
	StackP = FrameStart;

	// If the current slot is empty, we need to free the slot after the
	// pop.
	bool SlotEmpty = (StackP == &SlotP->Data[0]);

	if (SlotEmpty && SlotP->Prev) {
	// Before removing the slot we need to reset StackP.
	StackP = SlotP->PrevSlotStackPtr;

	// Remove the slot.
	SlotP = SlotP->Prev;
	SafeFree(SlotP->Next, "Free slot.");
	SlotP->Next = 0;
	}
	}			}
	}			}

	// Begin a data sharing context. Maintain a list of references to shared			// Begin a data sharing context. Maintain a list of references to shared
	// variables. This list of references to shared variables will be passed			// variables. This list of references to shared variables will be passed
	// to one or more threads.			// to one or more threads.
	// In L0 data sharing this is called by master thread.			// In L0 data sharing this is called by master thread.
	// In L1 data sharing this is called by active warp master thread.			// In L1 data sharing this is called by active warp master thread.
	▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

openmp/libomptarget/deviceRTLs/common/src/omp_data.cu

	Show First 20 Lines • Show All 52 Lines • ▼ Show 20 Lines
	volatile omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);			volatile omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// OpenMP kernel execution parameters			// OpenMP kernel execution parameters
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	uint32_t SHARED(execution_param);			uint32_t SHARED(execution_param);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Data sharing state
	////////////////////////////////////////////////////////////////////////////////
	DataSharingStateTy SHARED(DataSharingState);

	////////////////////////////////////////////////////////////////////////////////
	// Scratchpad for teams reduction.			// Scratchpad for teams reduction.
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	void *SHARED(ReductionScratchpadPtr);			void *SHARED(ReductionScratchpadPtr);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Data sharing related variables.			// Data sharing related variables.
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);			omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);

	#pragma omp end declare target			#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/omptarget.cu

Show First 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
omptarget_nvptx_TaskDescr *currTaskDescr =		omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);		omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
nThreads = GetNumberOfThreadsInBlock();		nThreads = GetNumberOfThreadsInBlock();
threadLimit = ThreadLimit;		threadLimit = ThreadLimit;

if (!isSPMDMode())		if (!isSPMDMode())
omptarget_nvptx_globalArgs.Init();		omptarget_nvptx_globalArgs.Init();

		__kmpc_data_sharing_init_stack();
__kmpc_impl_target_init();		__kmpc_impl_target_init();
}		}

EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {		EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");		PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");
ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,		ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
"Generic always requires initialized runtime.");		"Generic always requires initialized runtime.");
// Enqueue omp state object for use by another team.		// Enqueue omp state object for use by another team.
Show All 14 Lines	EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
if (threadId == 0) {		if (threadId == 0) {
usedSlotIdx = __kmpc_impl_smid() % MAX_SM;		usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
parallelLevel[0] =		parallelLevel[0] =
1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);		1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
} else if (GetLaneId() == 0) {		} else if (GetLaneId() == 0) {
parallelLevel[GetWarpId()] =		parallelLevel[GetWarpId()] =
1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);		1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
}		}
		__kmpc_data_sharing_init_stack();
if (!RequiresOMPRuntime) {		if (!RequiresOMPRuntime) {
// Runtime is not required - exit.		// Runtime is not required - exit.
__kmpc_impl_syncthreads();		__kmpc_impl_syncthreads();
return;		return;
}		}

//		//
// Team Context Initialization.		// Team Context Initialization.
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines

openmp/libomptarget/deviceRTLs/interface.h

	Show First 20 Lines • Show All 420 Lines • ▼ Show 20 Lines
	EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,			EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
	int16_t RequiresOMPRuntime);			int16_t RequiresOMPRuntime);
	EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);			EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
	EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);			EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);
	EXTERN bool __kmpc_kernel_parallel(void **WorkFn);			EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
	EXTERN void __kmpc_kernel_end_parallel();			EXTERN void __kmpc_kernel_end_parallel();

	EXTERN void __kmpc_data_sharing_init_stack();			EXTERN void __kmpc_data_sharing_init_stack();
	EXTERN void __kmpc_data_sharing_init_stack_spmd();
	EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
	int16_t UseSharedMemory);
	EXTERN void *__kmpc_data_sharing_push_stack(size_t size,
	int16_t UseSharedMemory);
	EXTERN void __kmpc_data_sharing_pop_stack(void *a);
	EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);			EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
	EXTERN void __kmpc_end_sharing_variables();			EXTERN void __kmpc_end_sharing_variables();
	EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);			EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);

	/// Entry point to start a new parallel region.			/// Entry point to start a new parallel region.
	///			///
	/// \param ident The source identifier.			/// \param ident The source identifier.
	/// \param global_tid The global thread ID.			/// \param global_tid The global thread ID.
	Show All 33 Lines