Diff 218469

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

Show All 14 Lines

// Warp ID in the CUDA block		// Warp ID in the CUDA block
INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }		INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
// Lane ID in the CUDA warp.		// Lane ID in the CUDA warp.
INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }		INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }

// Return true if this is the first active thread in the warp.		// Return true if this is the first active thread in the warp.
INLINE static bool IsWarpMasterActiveThread() {		INLINE static bool IsWarpMasterActiveThread() {
unsigned long long Mask = __ACTIVEMASK();		unsigned long long Mask = __kmpc_impl_activemask();
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);		unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
unsigned long long Sh = Mask << ShNum;		unsigned long long Sh = Mask << ShNum;
// Truncate Sh to the 32 lower bits		// Truncate Sh to the 32 lower bits
return (unsigned)Sh == 0;		return (unsigned)Sh == 0;
}		}
// Return true if this is the master thread.		// Return true if this is the master thread.
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {		INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();		return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	EXTERN void *__kmpc_data_sharing_environment_begin(
if (!IsOMPRuntimeInitialized)		if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;		return (void *)&DataSharingState;

DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);		DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",		DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);		(unsigned long long)SharingDefaultDataSize);

unsigned WID = getWarpId();		unsigned WID = getWarpId();
unsigned CurActiveThreads = __ACTIVEMASK();		__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();

__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];		__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];		void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];		void * volatile &FrameP = DataSharingState.FramePtr[WID];
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];		int32_t &ActiveT = DataSharingState.ActiveThreads[WID];

DSPRINT0(DSFLAG, "Save current slot/stack values.\n");		DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
// Save the current values.		// Save the current values.
▲ Show 20 Lines • Show All 123 Lines • ▼ Show 20 Lines	if (IsWarpMasterActiveThread()) {
S->Next = 0;		S->Next = 0;
}		}
}		}

DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");		DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
return;		return;
}		}

int32_t CurActive = __ACTIVEMASK();		__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();

// Only the warp master can restore the stack and frame information, and only		// Only the warp master can restore the stack and frame information, and only
// if there are no other threads left behind in this environment (i.e. the		// if there are no other threads left behind in this environment (i.e. the
// warp diverged and returns in different places). This only works if we		// warp diverged and returns in different places). This only works if we
// assume that threads will converge right after the call site that started		// assume that threads will converge right after the call site that started
// the environment.		// the environment.
if (IsWarpMasterActiveThread()) {		if (IsWarpMasterActiveThread()) {
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];		int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
▲ Show 20 Lines • Show All 109 Lines • ▼ Show 20 Lines	INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
// Add worst-case padding to DataSize so that future stack allocations are		// Add worst-case padding to DataSize so that future stack allocations are
// correctly aligned.		// correctly aligned.
const size_t Alignment = 8;		const size_t Alignment = 8;
PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;		PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;

// Frame pointer must be visible to all workers in the same warp.		// Frame pointer must be visible to all workers in the same warp.
const unsigned WID = getWarpId();		const unsigned WID = getWarpId();
void *FrameP = 0;		void *FrameP = 0;
int32_t CurActive = __ACTIVEMASK();		__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();

if (IsWarpMaster) {		if (IsWarpMaster) {
// SlotP will point to either the shared memory slot or an existing		// SlotP will point to either the shared memory slot or an existing
// global memory slot.		// global memory slot.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];		__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];		void *&StackP = DataSharingState.StackPtr[WID];

// Check if we have room for the data in the current slot.		// Check if we have room for the data in the current slot.
▲ Show 20 Lines • Show All 192 Lines • Show Last 20 Lines

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu

Show First 20 Lines • Show All 383 Lines • ▼ Show 20 Lines	INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
uint32_t lo, hi;		uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);		__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_sync(active, hi, leader);		hi = __kmpc_impl_shfl_sync(active, hi, leader);
lo = __kmpc_impl_shfl_sync(active, lo, leader);		lo = __kmpc_impl_shfl_sync(active, lo, leader);
return __kmpc_impl_pack(lo, hi);		return __kmpc_impl_pack(lo, hi);
}		}

INLINE static uint64_t NextIter() {		INLINE static uint64_t NextIter() {
__kmpc_impl_lanemask_t active = __ACTIVEMASK();		__kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
uint32_t leader = __kmpc_impl_ffs(active) - 1;		uint32_t leader = __kmpc_impl_ffs(active) - 1;
uint32_t change = __kmpc_impl_popc(active);		uint32_t change = __kmpc_impl_popc(active);
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();		__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);		unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
uint64_t warp_res;		uint64_t warp_res;
if (rank == 0) {		if (rank == 0) {
warp_res = atomicAdd(		warp_res = atomicAdd(
(unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),		(unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
▲ Show 20 Lines • Show All 406 Lines • Show Last 20 Lines

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

	Show All 39 Lines
	// used by dynamic scheduling			// used by dynamic scheduling
	#define FINISHED 0			#define FINISHED 0
	#define NOT_FINISHED 1			#define NOT_FINISHED 1
	#define LAST_CHUNK 2			#define LAST_CHUNK 2

	#define BARRIER_COUNTER 0			#define BARRIER_COUNTER 0
	#define ORDERED_COUNTER 1			#define ORDERED_COUNTER 1

	// Macros for Cuda intrinsics
	// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
	// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
	#ifndef CUDA_VERSION
	#error CUDA_VERSION macro is undefined, something wrong with cuda.
	#elif CUDA_VERSION >= 9000
	#define __ACTIVEMASK() __activemask()
	#else
	#define __ACTIVEMASK() __ballot(1)
	#endif // CUDA_VERSION

	// arguments needed for L0 parallelism only.			// arguments needed for L0 parallelism only.
	class omptarget_nvptx_SharedArgs {			class omptarget_nvptx_SharedArgs {
	public:			public:
	// All these methods must be called by the master thread only.			// All these methods must be called by the master thread only.
	INLINE void Init() {			INLINE void Init() {
	args = buffer;			args = buffer;
	nArgs = MAX_SHARED_ARGS;			nArgs = MAX_SHARED_ARGS;
	}			}
	▲ Show 20 Lines • Show All 365 Lines • Show Last 20 Lines

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu

Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
bool IsFinal, int32_t LaneSource,		bool IsFinal, int32_t LaneSource,
int32_t LaneId, int32_t NumLanes) {		int32_t LaneId, int32_t NumLanes) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");		PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
uint32_t ConvergentMask = Mask;		uint32_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);		int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);		uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);		*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;		*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();		__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);		*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);

int threadId = GetLogicalThreadIdInBlock(isSPMDMode());		int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;		int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

ConvergentSimdJob job = (ConvergentSimdJob )buffer;		ConvergentSimdJob job = (ConvergentSimdJob )buffer;
int32_t SimdLimit =		int32_t SimdLimit =
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);		omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
bool *IsFinal,		bool *IsFinal,
int32_t *LaneSource) {		int32_t *LaneSource) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");		PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
uint32_t ConvergentMask = Mask;		uint32_t ConvergentMask = Mask;
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);		int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);		uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);		*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;		*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();		__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);		uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);

int threadId = GetLogicalThreadIdInBlock(isSPMDMode());		int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;		int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

ConvergentParallelJob job = (ConvergentParallelJob )buffer;		ConvergentParallelJob job = (ConvergentParallelJob )buffer;
int32_t NumThreadsClause =		int32_t NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);		omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
▲ Show 20 Lines • Show All 313 Lines • Show Last 20 Lines

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	while (mask > 0) {
mask = curr_size / 2;		mask = curr_size / 2;
}		}
}		}

INLINE static uint32_t		INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {		gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
uint32_t size, remote_id, physical_lane_id;		uint32_t size, remote_id, physical_lane_id;
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;		physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();		__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t Liveness = __ACTIVEMASK();		__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;		uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();		__kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
do {		do {
Liveness = __ACTIVEMASK();		Liveness = __kmpc_impl_activemask();
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);		remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
size = __kmpc_impl_popc(Liveness);		size = __kmpc_impl_popc(Liveness);
logical_lane_id /= 2;		logical_lane_id /= 2;
shflFct(reduce_data, /*LaneId =*/logical_lane_id,		shflFct(reduce_data, /*LaneId =*/logical_lane_id,
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);		/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
} while (logical_lane_id % 2 == 0 && size > 1);		} while (logical_lane_id % 2 == 0 && size > 1);
return (logical_lane_id == 0);		return (logical_lane_id == 0);
}		}

EXTERN		EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,		int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,		size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,		kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {		kmp_InterWarpCopyFctPtr cpyFct) {
uint32_t Liveness = __ACTIVEMASK();		__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == 0xffffffff) {		if (Liveness == 0xffffffff) {
gpu_regular_warp_reduce(reduce_data, shflFct);		gpu_regular_warp_reduce(reduce_data, shflFct);
return GetThreadIdInBlock() % WARPSIZE ==		return GetThreadIdInBlock() % WARPSIZE ==
0; // Result on lane 0 of the simd warp.		0; // Result on lane 0 of the simd warp.
} else {		} else {
return gpu_irregular_simd_reduce(		return gpu_irregular_simd_reduce(
reduce_data, shflFct); // Result on the first active lane.		reduce_data, shflFct); // Result on the first active lane.
}		}
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	if (NumThreads > WARPSIZE) {
cpyFct(reduce_data, WarpsNeeded);		cpyFct(reduce_data, WarpsNeeded);

if (WarpId == 0)		if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,		gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);		BlockThreadId);
}		}
return BlockThreadId == 0;		return BlockThreadId == 0;
#else		#else
uint32_t Liveness = __ACTIVEMASK();		__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == 0xffffffff) // Full warp		if (Liveness == 0xffffffff) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);		gpu_regular_warp_reduce(reduce_data, shflFct);
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes		else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,		gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/__kmpc_impl_popc(Liveness),		/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/GetThreadIdInBlock() % WARPSIZE);		/*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2		else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
// parallel region may enter here; return		// parallel region may enter here; return
▲ Show 20 Lines • Show All 158 Lines • ▼ Show 20 Lines	#else

// Load from scratchpad and reduce.		// Load from scratchpad and reduce.
char *scratchpad = GetTeamsReductionScratchpad();		char *scratchpad = GetTeamsReductionScratchpad();
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);		ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)		for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);		ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

// Reduce across warps to the warp master.		// Reduce across warps to the warp master.
uint32_t Liveness = __ACTIVEMASK();		__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == 0xffffffff) // Full warp		if (Liveness == 0xffffffff) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);		gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes		else // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,		gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/__kmpc_impl_popc(Liveness),		/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/ThreadId % WARPSIZE);		/*LaneId=*/ThreadId % WARPSIZE);

// When we have more than [warpsize] number of threads		// When we have more than [warpsize] number of threads
▲ Show 20 Lines • Show All 206 Lines • Show Last 20 Lines

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h

	Show First 20 Lines • Show All 198 Lines • ▼ Show 20 Lines
	// Masters			// Masters

	INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }			INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Parallel level			// Parallel level

	INLINE void IncParallelLevel(bool ActiveParallel) {			INLINE void IncParallelLevel(bool ActiveParallel) {
	unsigned Active = __ACTIVEMASK();			__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
	__kmpc_impl_syncwarp(Active);			__kmpc_impl_syncwarp(Active);
	unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();			__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
	unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);			unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
	if (Rank == 0) {			if (Rank == 0) {
	parallelLevel[GetWarpId()] +=			parallelLevel[GetWarpId()] +=
	(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));			(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
	__threadfence();			__threadfence();
	}			}
	__kmpc_impl_syncwarp(Active);			__kmpc_impl_syncwarp(Active);
	}			}

	INLINE void DecParallelLevel(bool ActiveParallel) {			INLINE void DecParallelLevel(bool ActiveParallel) {
	unsigned Active = __ACTIVEMASK();			__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
	__kmpc_impl_syncwarp(Active);			__kmpc_impl_syncwarp(Active);
	unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();			__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
	unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);			unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
	if (Rank == 0) {			if (Rank == 0) {
	parallelLevel[GetWarpId()] -=			parallelLevel[GetWarpId()] -=
	(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));			(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
	__threadfence();			__threadfence();
	}			}
	__kmpc_impl_syncwarp(Active);			__kmpc_impl_syncwarp(Active);
	}			}
	▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu

	Show First 20 Lines • Show All 136 Lines • ▼ Show 20 Lines
	}			}

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Vote			// Vote
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	EXTERN int32_t __kmpc_warp_active_thread_mask() {			EXTERN int32_t __kmpc_warp_active_thread_mask() {
	PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");			PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
	return __ACTIVEMASK();			return __kmpc_impl_activemask();
	}			}

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Syncwarp			// Syncwarp
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	EXTERN void __kmpc_syncwarp(int32_t Mask) {			EXTERN void __kmpc_syncwarp(int32_t Mask) {
	PRINT0(LD_IO, "call __kmpc_syncwarp\n");			PRINT0(LD_IO, "call __kmpc_syncwarp\n");
	__kmpc_impl_syncwarp(Mask);			__kmpc_impl_syncwarp(Mask);
	}			}

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }			INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }

	INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }			INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }

	#ifndef CUDA_VERSION			#ifndef CUDA_VERSION
	#error CUDA_VERSION macro is undefined, something wrong with cuda.			#error CUDA_VERSION macro is undefined, something wrong with cuda.
	#endif			#endif

				// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().

				INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
				#if CUDA_VERSION >= 9000
				return __activemask();
				#else
				return __ballot(1);
				#endif
				}

	// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.			// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.

	INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,			INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
	int32_t SrcLane) {			int32_t SrcLane) {
	#if CUDA_VERSION >= 9000			#if CUDA_VERSION >= 9000
	return __shfl_sync(Mask, Var, SrcLane);			return __shfl_sync(Mask, Var, SrcLane);
	#else			#else
	return __shfl(Var, SrcLane);			return __shfl(Var, SrcLane);
	Show All 31 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget] Refactor activemask macro to inline function
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 218469

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h

This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget] Refactor activemask macro to inline function
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 218469

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu

openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h

[libomptarget] Refactor activemask macro to inline function
ClosedPublic