This is an archive of the discontinued LLVM Phabricator instance.

Add new OpenMP 4.5 doacross loop nest feature
ClosedPublic

Authored by jlpeyton on Feb 18 2016, 10:29 AM.

Download Raw Diff

Details

Reviewers

andreybokhanko
ABataev
tlwilmar
AndreyChurbanov
hfinkel

Commits

rG71909c57ca96: Add new OpenMP 4.5 doacross loop nest feature
rOMP262532: Add new OpenMP 4.5 doacross loop nest feature
rL262532: Add new OpenMP 4.5 doacross loop nest feature

Summary

From the standard: A doacross loop nest is a loop nest that has cross-iteration
dependence. An iteration is dependent on one or more lexicographically earlier
iterations. The ordered clause parameter on a loop directive identifies the
loop(s) associated with the doacross loop nest.

The init/fini routines allocate/free doacross buffer(s) for each loop for each thread.
The wait routine waits for a flag designated by the dependence vector. The post routine sets the flag designated by the current iteration vector.
As with the existing dispatch buffers, we use shared buffer indices, which covers up to 7 nowait loops executed simultaneously by different threads (the number 7 has no special meaning; it is just a heuristic value).
Also, the sizes of the structures are kept intact by shrinking the dummy padding arrays.

This needs to be put into the OpenMP runtime library in order for the compiler
team to develop the compiler side of the implementation.

Diff Detail

Repository: rL LLVM

Event Timeline

jlpeyton updated this revision to Diff 48354.Feb 18 2016, 10:29 AM

jlpeyton retitled this revision from to Add new OpenMP 4.5 doacross loop nest feature.

jlpeyton updated this object.

jlpeyton added reviewers: AndreyChurbanov, tlwilmar, hfinkel.

jlpeyton set the repository for this revision to rL LLVM.

jlpeyton added a subscriber: openmp-commits.

jlpeyton added reviewers: ABataev, andreybokhanko.Feb 18 2016, 11:19 AM

jlpeyton added subscribers: ABataev, andreybokhanko.

jlpeyton updated this object.Feb 18 2016, 11:26 AM

LGTM

This revision is now accepted and ready to land.Mar 1 2016, 8:47 AM

Closed by commit rL262532: Add new OpenMP 4.5 doacross loop nest feature (authored by jlpeyton). · Explain WhyMar 2 2016, 2:46 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

openmp/

trunk/

runtime/

src/

4 lines

26 lines

289 lines

7 lines

23 lines

Diff 49670

openmp/trunk/runtime/src/dllexports

Show First 20 Lines • Show All 383 Lines • ▼ Show 20 Lines	%ifndef stub
%endif # OMP_40		%endif # OMP_40
%endif		%endif

# OpenMP 4.1 entry points		# OpenMP 4.1 entry points
%ifndef stub		%ifndef stub
%ifdef OMP_41		%ifdef OMP_41
__kmpc_proxy_task_completed 259		__kmpc_proxy_task_completed 259
__kmpc_proxy_task_completed_ooo 260		__kmpc_proxy_task_completed_ooo 260
		__kmpc_doacross_init 261
		__kmpc_doacross_wait 262
		__kmpc_doacross_post 263
		__kmpc_doacross_fini 264
%endif		%endif
%endif		%endif

# User API entry points that have both lower- and upper- case versions for Fortran.		# User API entry points that have both lower- and upper- case versions for Fortran.
# Number for lowercase version is indicated. Number for uppercase is obtained by adding 1000.		# Number for lowercase version is indicated. Number for uppercase is obtained by adding 1000.
# User API entry points are entry points that start with 'kmp_' or 'omp_'.		# User API entry points are entry points that start with 'kmp_' or 'omp_'.

omp_destroy_lock 700		omp_destroy_lock 700
▲ Show 20 Lines • Show All 590 Lines • Show Last 20 Lines

openmp/trunk/runtime/src/kmp.h

	Show First 20 Lines • Show All 1,659 Lines • ▼ Show 20 Lines
	} dispatch_shared_info32_t;			} dispatch_shared_info32_t;

	typedef struct dispatch_shared_info64 {			typedef struct dispatch_shared_info64 {
	/* chunk index under dynamic, number of idle threads under static-steal;			/* chunk index under dynamic, number of idle threads under static-steal;
	iteration index otherwise */			iteration index otherwise */
	volatile kmp_uint64 iteration;			volatile kmp_uint64 iteration;
	volatile kmp_uint64 num_done;			volatile kmp_uint64 num_done;
	volatile kmp_uint64 ordered_iteration;			volatile kmp_uint64 ordered_iteration;
	kmp_int64 ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar			kmp_int64 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
	} dispatch_shared_info64_t;			} dispatch_shared_info64_t;

	typedef struct dispatch_shared_info {			typedef struct dispatch_shared_info {
	union shared_info {			union shared_info {
	dispatch_shared_info32_t s32;			dispatch_shared_info32_t s32;
	dispatch_shared_info64_t s64;			dispatch_shared_info64_t s64;
	} u;			} u;
	/* volatile kmp_int32 dispatch_abort; depricated */
	volatile kmp_uint32 buffer_index;			volatile kmp_uint32 buffer_index;
				#if OMP_41_ENABLED
				volatile kmp_int32 doacross_buf_idx; // teamwise index
				volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
				kmp_int32 doacross_num_done; // count finished threads
				#endif
	} dispatch_shared_info_t;			} dispatch_shared_info_t;

	typedef struct kmp_disp {			typedef struct kmp_disp {
	/* Vector for ORDERED SECTION */			/* Vector for ORDERED SECTION */
	void (th_deo_fcn)( int gtid, int * cid, ident_t *);			void (th_deo_fcn)( int gtid, int * cid, ident_t *);
	/* Vector for END ORDERED SECTION */			/* Vector for END ORDERED SECTION */
	void (th_dxo_fcn)( int gtid, int * cid, ident_t *);			void (th_dxo_fcn)( int gtid, int * cid, ident_t *);

	dispatch_shared_info_t *th_dispatch_sh_current;			dispatch_shared_info_t *th_dispatch_sh_current;
	dispatch_private_info_t *th_dispatch_pr_current;			dispatch_private_info_t *th_dispatch_pr_current;

	dispatch_private_info_t *th_disp_buffer;			dispatch_private_info_t *th_disp_buffer;
	kmp_int32 th_disp_index;			kmp_int32 th_disp_index;
				#if OMP_41_ENABLED
				kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
				volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
				kmp_int64 *th_doacross_info; // info on loop bounds
				#else
	void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64			void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
				#endif
	#if KMP_USE_INTERNODE_ALIGNMENT			#if KMP_USE_INTERNODE_ALIGNMENT
	char more_padding[INTERNODE_CACHE_LINE];			char more_padding[INTERNODE_CACHE_LINE];
	#endif			#endif
	} kmp_disp_t;			} kmp_disp_t;

	/* ------------------------------------------------------------------------ */			/* ------------------------------------------------------------------------ */
	/* ------------------------------------------------------------------------ */			/* ------------------------------------------------------------------------ */

	▲ Show 20 Lines • Show All 1,838 Lines • ▼ Show 20 Lines
	KMP_EXPORT kmp_int32 __kmpc_in_parallel( ident_t *loc );			KMP_EXPORT kmp_int32 __kmpc_in_parallel( ident_t *loc );
	KMP_EXPORT void __kmpc_pop_num_threads( ident_t *loc, kmp_int32 global_tid );			KMP_EXPORT void __kmpc_pop_num_threads( ident_t *loc, kmp_int32 global_tid );
	KMP_EXPORT void __kmpc_push_num_threads( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads );			KMP_EXPORT void __kmpc_push_num_threads( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads );

	#if OMP_40_ENABLED			#if OMP_40_ENABLED
	KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );			KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );
	KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );			KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );
	KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);			KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);
				#endif
				#if OMP_41_ENABLED
				// Bounds of one loop of a doacross nest, with every value converted to
				// kmp_int64. __kmpc_doacross_init() receives an array of num_dims of these
				// and expects inclusive bounds, e.g. for(i=2;i<9;i+=2): lo=2, up=8, st=2.
				struct kmp_dim {
				    kmp_int64 lo; // lower bound (inclusive)
				    kmp_int64 up; // upper bound (inclusive)
				    kmp_int64 st; // stride; negative for a decreasing loop
				};
				// Doacross entry points. loc, dims and vec are pointers: dims and vec are
				// indexed per loop dimension by the definitions in kmp_csupport.c.
				KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32 num_dims, struct kmp_dim *dims);
				KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
				KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
				KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
	#endif			#endif

	KMP_EXPORT void*			KMP_EXPORT void*
	__kmpc_threadprivate_cached( ident_t * loc, kmp_int32 global_tid,			__kmpc_threadprivate_cached( ident_t * loc, kmp_int32 global_tid,
	void * data, size_t size, void *** cache );			void * data, size_t size, void *** cache );

	// Symbols for MS mutual detection.			// Symbols for MS mutual detection.
	extern int _You_must_link_with_exactly_one_OpenMP_library;			extern int _You_must_link_with_exactly_one_OpenMP_library;
	▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines

openmp/trunk/runtime/src/kmp_csupport.c

Show First 20 Lines • Show All 3,043 Lines • ▼ Show 20 Lines	void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
}		}
__kmp_place_num_sockets = nS;		__kmp_place_num_sockets = nS;
__kmp_place_socket_offset = sO;		__kmp_place_socket_offset = sO;
__kmp_place_num_cores = nC;		__kmp_place_num_cores = nC;
__kmp_place_core_offset = cO;		__kmp_place_core_offset = cO;
__kmp_place_num_threads_per_core = nT;		__kmp_place_num_threads_per_core = nT;
}		}

		#if OMP_41_ENABLED
		/*!
		@ingroup WORK_SHARING
		@param loc source location information.
		@param gtid global thread number.
		@param num_dims number of associated doacross loops.
		@param dims info on loops bounds (array of num_dims elements).

		Initialize doacross loop information.
		The compiler is expected to send us inclusive bounds,
		e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.

		Layout of the private buffer th_doacross_info (4 * num_dims + 1 elements):
		  [0]        number of dimensions
		  [1]        address of the shared doacross_num_done counter
		  [2..4]     lo, up, st of dims[0]
		  [4*j+1]    range length of dims[j], j >= 1
		  [4*j+2..4*j+4]  lo, up, st of dims[j], j >= 1

		Nothing is allocated when the team is serialized.
		*/
		void
		__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim *dims)
		{
		    int j, idx;
		    kmp_int64 last, trace_count;
		    kmp_info_t *th = __kmp_threads[gtid];
		    kmp_team_t *team = th->th.th_team;
		    kmp_uint32 *flags;
		    kmp_disp_t *pr_buf = th->th.th_dispatch;
		    dispatch_shared_info_t *sh_buf;

		    KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
		                 gtid, num_dims, !team->t.t_serialized));
		    KMP_DEBUG_ASSERT(dims != NULL);
		    KMP_DEBUG_ASSERT(num_dims > 0);

		    if( team->t.t_serialized ) {
		        KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
		        return; // no dependencies if team is serialized
		    }
		    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
		    idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
		    sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];

		    // Save bounds info into allocated private buffer
		    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
		    pr_buf->th_doacross_info =
		        (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64) * (4 * num_dims + 1));
		    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
		    pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
		    // Save also address of num_done in order to access it later without knowing the buffer index
		    pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
		    pr_buf->th_doacross_info[2] = dims[0].lo;
		    pr_buf->th_doacross_info[3] = dims[0].up;
		    pr_buf->th_doacross_info[4] = dims[0].st;
		    last = 5;
		    for( j = 1; j < num_dims; ++j ) {
		        kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
		        if( dims[j].st == 1 ) { // most common case
		            // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
		            range_length = dims[j].up - dims[j].lo + 1;
		        } else {
		            if( dims[j].st > 0 ) {
		                KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
		                range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
		            } else { // negative increment
		                KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
		                range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
		            }
		        }
		        pr_buf->th_doacross_info[last++] = range_length;
		        pr_buf->th_doacross_info[last++] = dims[j].lo;
		        pr_buf->th_doacross_info[last++] = dims[j].up;
		        pr_buf->th_doacross_info[last++] = dims[j].st;
		    }

		    // Compute total trip count.
		    // Start with range of dims[0] which we don't need to keep in the buffer.
		    if( dims[0].st == 1 ) { // most common case
		        trace_count = dims[0].up - dims[0].lo + 1;
		    } else if( dims[0].st > 0 ) {
		        KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
		        trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
		    } else { // negative increment
		        KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
		        trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
		    }
		    // Multiply by the range of every inner dimension; the product is the
		    // iteration count of the "collapsed" nest, i.e. the number of flag bits.
		    for( j = 1; j < num_dims; ++j ) {
		        trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
		    }
		    KMP_DEBUG_ASSERT(trace_count > 0);

		    // Check if shared buffer is not occupied by other loop (idx - KMP_MAX_DISP_BUF)
		    if( idx != sh_buf->doacross_buf_idx ) {
		        // Shared buffer is occupied, wait for it to be free
		        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
		    }
		    // Check if we are the first thread. After the CAS the first thread gets 0,
		    // others get 1 if initialization is in progress, allocated pointer otherwise.
		    // NOTE(review): the 64-bit CAS treats the pointer field as a kmp_int64;
		    // this presumably assumes 64-bit pointers - confirm for 32-bit targets.
		    flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
		        (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
		    if( flags == NULL ) {
		        // we are the first thread, allocate the array of flags
		        kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
		        sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
		    } else if( (kmp_int64)flags == 1 ) {
		        // initialization is still in progress, need to wait
		        while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
		            KMP_YIELD(TRUE);
		        }
		    }
		    KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
		    pr_buf->th_doacross_flags = sh_buf->doacross_flags;      // save private copy in order to not
		                                                             // touch shared buffer on each iteration
		    KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
		}

		/*!
		@ingroup WORK_SHARING
		@param loc source location information (not used here).
		@param gtid global thread number.
		@param vec dependence vector, one element per doacross loop dimension.

		Spin (yielding) until the flag bit of the iteration designated by vec is set.
		Returns immediately if the team is serialized, or if any component of vec
		lies outside the bounds recorded by __kmpc_doacross_init().
		Flags are stored one bit per iteration in 32-bit words.
		*/
		void
		__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
		{
		    kmp_int32 shft, num_dims, i;
		    kmp_uint32 flag;
		    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
		    kmp_info_t *th = __kmp_threads[gtid];
		    kmp_team_t *team = th->th.th_team;
		    kmp_disp_t *pr_buf;
		    kmp_int64 lo, up, st;

		    KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
		    if( team->t.t_serialized ) {
		        KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
		        return; // no dependencies if team is serialized
		    }

		    // calculate sequential iteration number and check out-of-bounds condition
		    pr_buf = th->th.th_dispatch;
		    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
		    num_dims = pr_buf->th_doacross_info[0];
		    lo = pr_buf->th_doacross_info[2];
		    up = pr_buf->th_doacross_info[3];
		    st = pr_buf->th_doacross_info[4];
		    if( st == 1 ) { // most common case
		        if( vec[0] < lo || vec[0] > up ) {
		            KA_TRACE(20,(
		                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
		                gtid, vec[0], lo, up));
		            return;
		        }
		        iter_number = vec[0] - lo;
		    } else if( st > 0 ) {
		        if( vec[0] < lo || vec[0] > up ) {
		            KA_TRACE(20,(
		                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
		                gtid, vec[0], lo, up));
		            return;
		        }
		        iter_number = (kmp_uint64)(vec[0] - lo) / st;
		    } else { // negative increment
		        if( vec[0] > lo || vec[0] < up ) {
		            KA_TRACE(20,(
		                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
		                gtid, vec[0], lo, up));
		            return;
		        }
		        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
		    }
		    for( i = 1; i < num_dims; ++i ) {
		        kmp_int64 iter, ln;
		        kmp_int32 j = i * 4; // dimension i occupies info[j+1 .. j+4]
		        ln = pr_buf->th_doacross_info[j + 1];
		        lo = pr_buf->th_doacross_info[j + 2];
		        up = pr_buf->th_doacross_info[j + 3];
		        st = pr_buf->th_doacross_info[j + 4];
		        if( st == 1 ) {
		            if( vec[i] < lo || vec[i] > up ) {
		                KA_TRACE(20,(
		                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
		                    gtid, vec[i], lo, up));
		                return;
		            }
		            iter = vec[i] - lo;
		        } else if( st > 0 ) {
		            if( vec[i] < lo || vec[i] > up ) {
		                KA_TRACE(20,(
		                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
		                    gtid, vec[i], lo, up));
		                return;
		            }
		            iter = (kmp_uint64)(vec[i] - lo) / st;
		        } else { // st < 0
		            if( vec[i] > lo || vec[i] < up ) {
		                KA_TRACE(20,(
		                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
		                    gtid, vec[i], lo, up));
		                return;
		            }
		            iter = (kmp_uint64)(lo - vec[i]) / (-st);
		        }
		        iter_number = iter + ln * iter_number;
		    }
		    shft = iter_number % 32; // use 32-bit granularity
		    iter_number >>= 5;       // divided by 32
		    // Unsigned shift: shft may be 31, and shifting a signed 1 into the sign
		    // bit is undefined behaviour.
		    flag = (kmp_uint32)1 << shft;
		    while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
		        KMP_YIELD(TRUE);
		    }
		    KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
		                 gtid, (iter_number<<5)+shft));
		}

		/*!
		@ingroup WORK_SHARING
		@param loc source location information (not used here).
		@param gtid global thread number.
		@param vec current iteration vector, one element per doacross loop dimension.

		Set the flag bit of the iteration designated by vec. Does nothing if the
		team is serialized. Unlike __kmpc_doacross_wait(), vec is not checked
		against the loop bounds.
		*/
		void
		__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
		{
		    kmp_int32 shft, num_dims, i;
		    kmp_uint32 flag;
		    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
		    kmp_info_t *th = __kmp_threads[gtid];
		    kmp_team_t *team = th->th.th_team;
		    kmp_disp_t *pr_buf;
		    kmp_int64 lo, st;

		    KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
		    if( team->t.t_serialized ) {
		        KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
		        return; // no dependencies if team is serialized
		    }

		    // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
		    pr_buf = th->th.th_dispatch;
		    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
		    num_dims = pr_buf->th_doacross_info[0];
		    lo = pr_buf->th_doacross_info[2];
		    st = pr_buf->th_doacross_info[4];
		    if( st == 1 ) { // most common case
		        iter_number = vec[0] - lo;
		    } else if( st > 0 ) {
		        iter_number = (kmp_uint64)(vec[0] - lo) / st;
		    } else { // negative increment
		        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
		    }
		    for( i = 1; i < num_dims; ++i ) {
		        kmp_int64 iter, ln;
		        kmp_int32 j = i * 4; // dimension i occupies info[j+1 .. j+4]
		        ln = pr_buf->th_doacross_info[j + 1];
		        lo = pr_buf->th_doacross_info[j + 2];
		        st = pr_buf->th_doacross_info[j + 4];
		        if( st == 1 ) {
		            iter = vec[i] - lo;
		        } else if( st > 0 ) {
		            iter = (kmp_uint64)(vec[i] - lo) / st;
		        } else { // st < 0
		            iter = (kmp_uint64)(lo - vec[i]) / (-st);
		        }
		        iter_number = iter + ln * iter_number;
		    }
		    shft = iter_number % 32; // use 32-bit granularity
		    iter_number >>= 5;       // divided by 32
		    // Unsigned shift: shft may be 31, and shifting a signed 1 into the sign
		    // bit is undefined behaviour.
		    flag = (kmp_uint32)1 << shft;
		    // Skip the atomic OR when the bit is already set.
		    if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
		        KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
		    }
		    KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
		                 gtid, (iter_number<<5)+shft));
		}

		/*!
		@ingroup WORK_SHARING
		@param loc source location information (not used here).
		@param gtid global thread number.

		Release doacross resources at the end of the loop nest. Does nothing if the
		team is serialized. Each thread increments the shared doacross_num_done
		counter; the thread that brings it to th_team_nproc frees the shared flags
		array, resets the counter and advances doacross_buf_idx by KMP_MAX_DISP_BUF
		so the shared buffer can be re-used. Every thread frees its private
		th_doacross_info buffer.
		*/
		void
		__kmpc_doacross_fini(ident_t *loc, int gtid)
		{
		    kmp_int32 num_done;
		    kmp_info_t *th = __kmp_threads[gtid];
		    kmp_team_t *team = th->th.th_team;
		    kmp_disp_t *pr_buf = th->th.th_dispatch;

		    KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
		    if( team->t.t_serialized ) {
		        KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
		        return; // nothing to do
		    }
		    // th_doacross_info[1] holds the address of the shared doacross_num_done,
		    // which is a kmp_int32, so the atomic increment must be 32 bits wide.
		    num_done = KMP_TEST_THEN_INC32((kmp_int32*)pr_buf->th_doacross_info[1]) + 1;
		    if( num_done == th->th.th_team_nproc ) {
		        // we are the last thread, need to free shared resources
		        int idx = pr_buf->th_doacross_buf_idx - 1;
		        dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
		        KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
		        KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
		        KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
		        __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
		        sh_buf->doacross_flags = NULL;
		        sh_buf->doacross_num_done = 0;
		        sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF; // free buffer for future re-use
		    }
		    // free private resources (need to keep buffer index forever)
		    __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
		    pr_buf->th_doacross_info = NULL;
		    KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
		}
		#endif

// end of file //		// end of file //

openmp/trunk/runtime/src/kmp_dispatch.cpp

	Show First 20 Lines • Show All 157 Lines • ▼ Show 20 Lines
	// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types			// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
	template< typename UT >			template< typename UT >
	struct dispatch_shared_infoXX_template {			struct dispatch_shared_infoXX_template {
	/* chunk index under dynamic, number of idle threads under static-steal;			/* chunk index under dynamic, number of idle threads under static-steal;
	iteration index otherwise */			iteration index otherwise */
	volatile UT iteration;			volatile UT iteration;
	volatile UT num_done;			volatile UT num_done;
	volatile UT ordered_iteration;			volatile UT ordered_iteration;
	UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar			UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
	};			};

	// replaces dispatch_shared_info structure and dispatch_shared_info_t type			// replaces dispatch_shared_info structure and dispatch_shared_info_t type
	template< typename UT >			template< typename UT >
	struct dispatch_shared_info_template {			struct dispatch_shared_info_template {
	// we need union here to keep the structure size			// we need union here to keep the structure size
	union shared_info_tmpl {			union shared_info_tmpl {
	dispatch_shared_infoXX_template< UT > s;			dispatch_shared_infoXX_template< UT > s;
	dispatch_shared_info64_t s64;			dispatch_shared_info64_t s64;
	} u;			} u;
	volatile kmp_uint32 buffer_index;			volatile kmp_uint32 buffer_index;
				#if OMP_41_ENABLED
				volatile kmp_int32 doacross_buf_idx; // teamwise index
				kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
				kmp_int32 doacross_num_done; // count finished threads
				#endif
	};			};

	/* ------------------------------------------------------------------------ */			/* ------------------------------------------------------------------------ */
	/* ------------------------------------------------------------------------ */			/* ------------------------------------------------------------------------ */

	#undef USE_TEST_LOCKS			#undef USE_TEST_LOCKS

	// test_then_add template (general template should NOT be used)			// test_then_add template (general template should NOT be used)
	▲ Show 20 Lines • Show All 2,443 Lines • Show Last 20 Lines

openmp/trunk/runtime/src/kmp_runtime.c

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,040 Lines • ▼ Show 20 Lines	#else
team->t.t_dispatch = (kmp_disp_t) __kmp_allocate( sizeof(kmp_disp_t) max_nth );		team->t.t_dispatch = (kmp_disp_t) __kmp_allocate( sizeof(kmp_disp_t) max_nth );
//team->t.t_set_max_active_levels = (int) __kmp_allocate( sizeof(int) max_nth );		//team->t.t_set_max_active_levels = (int) __kmp_allocate( sizeof(int) max_nth );
//team->t.t_set_sched = (kmp_r_sched_t) __kmp_allocate( sizeof(kmp_r_sched_t) max_nth );		//team->t.t_set_sched = (kmp_r_sched_t) __kmp_allocate( sizeof(kmp_r_sched_t) max_nth );
team->t.t_implicit_task_taskdata = (kmp_taskdata_t) __kmp_allocate( sizeof(kmp_taskdata_t) max_nth );		team->t.t_implicit_task_taskdata = (kmp_taskdata_t) __kmp_allocate( sizeof(kmp_taskdata_t) max_nth );
#endif		#endif
team->t.t_max_nproc = max_nth;		team->t.t_max_nproc = max_nth;

/* setup dispatch buffers */		/* setup dispatch buffers */
for(i = 0 ; i < num_disp_buff; ++i)		for(i = 0 ; i < num_disp_buff; ++i) {
team->t.t_disp_buffer[i].buffer_index = i;		team->t.t_disp_buffer[i].buffer_index = i;
		#if OMP_41_ENABLED
		team->t.t_disp_buffer[i].doacross_buf_idx = i;
		#endif
		}
}		}

static void		static void
__kmp_free_team_arrays(kmp_team_t *team) {		__kmp_free_team_arrays(kmp_team_t *team) {
/* Note: this does not free the threads in t_threads (__kmp_free_threads) */		/* Note: this does not free the threads in t_threads (__kmp_free_threads) */
int i;		int i;
for ( i = 0; i < team->t.t_max_nproc; ++ i ) {		for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {		if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
▲ Show 20 Lines • Show All 1,057 Lines • ▼ Show 20 Lines	/* Initialize dynamic dispatch */
size_t disp_size = sizeof( dispatch_private_info_t ) *		size_t disp_size = sizeof( dispatch_private_info_t ) *
( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );		( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );		KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
KMP_ASSERT( dispatch );		KMP_ASSERT( dispatch );
KMP_DEBUG_ASSERT( team->t.t_dispatch );		KMP_DEBUG_ASSERT( team->t.t_dispatch );
KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );		KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );

dispatch->th_disp_index = 0;		dispatch->th_disp_index = 0;
		#if OMP_41_ENABLED
		dispatch->th_doacross_buf_idx = 0;
		#endif
if( ! dispatch->th_disp_buffer ) {		if( ! dispatch->th_disp_buffer ) {
dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );		dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );

if ( __kmp_storage_map ) {		if ( __kmp_storage_map ) {
__kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],		__kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
&dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],		&dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
disp_size, "th_%d.th_dispatch.th_disp_buffer "		disp_size, "th_%d.th_dispatch.th_disp_buffer "
"(team_%d.t_dispatch[%d].th_disp_buffer)",		"(team_%d.t_dispatch[%d].th_disp_buffer)",
▲ Show 20 Lines • Show All 2,675 Lines • ▼ Show 20 Lines	#if KMP_CACHE_MANAGE
KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );		KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
#endif /* KMP_CACHE_MANAGE */		#endif /* KMP_CACHE_MANAGE */
dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);		dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
KMP_DEBUG_ASSERT( dispatch );		KMP_DEBUG_ASSERT( dispatch );
KMP_DEBUG_ASSERT( team->t.t_dispatch );		KMP_DEBUG_ASSERT( team->t.t_dispatch );
//KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );		//KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );

dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */		dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
		#if OMP_41_ENABLED
		dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
		#endif
if( __kmp_env_consistency_check )		if( __kmp_env_consistency_check )
__kmp_push_parallel( gtid, team->t.t_ident );		__kmp_push_parallel( gtid, team->t.t_ident );

KMP_MB(); /* Flush all pending memory write invalidates. */		KMP_MB(); /* Flush all pending memory write invalidates. */
}		}

void		void
__kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,		__kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
▲ Show 20 Lines • Show All 220 Lines • ▼ Show 20 Lines	#endif /* KMP_DEBUG */

team->t.t_construct = 0; /* no single directives seen yet */		team->t.t_construct = 0; /* no single directives seen yet */
team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */		team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */

/* Reset the identifiers on the dispatch buffer */		/* Reset the identifiers on the dispatch buffer */
KMP_DEBUG_ASSERT( team->t.t_disp_buffer );		KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
if ( team->t.t_max_nproc > 1 ) {		if ( team->t.t_max_nproc > 1 ) {
int i;		int i;
for (i = 0; i < KMP_MAX_DISP_BUF; ++i)		for (i = 0; i < KMP_MAX_DISP_BUF; ++i) {
team->t.t_disp_buffer[ i ].buffer_index = i;		team->t.t_disp_buffer[ i ].buffer_index = i;
		#if OMP_41_ENABLED
		team->t.t_disp_buffer[i].doacross_buf_idx = i;
		#endif
		}
} else {		} else {
team->t.t_disp_buffer[ 0 ].buffer_index = 0;		team->t.t_disp_buffer[ 0 ].buffer_index = 0;
		#if OMP_41_ENABLED
		team->t.t_disp_buffer[0].doacross_buf_idx = 0;
		#endif
}		}

KMP_MB(); /* Flush all pending memory write invalidates. */		KMP_MB(); /* Flush all pending memory write invalidates. */
KMP_ASSERT( this_thr->th.th_team == team );		KMP_ASSERT( this_thr->th.th_team == team );

#ifdef KMP_DEBUG		#ifdef KMP_DEBUG
for( f=0 ; f<team->t.t_nproc ; f++ ) {		for( f=0 ; f<team->t.t_nproc ; f++ ) {
KMP_DEBUG_ASSERT( team->t.t_threads[f] &&		KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
▲ Show 20 Lines • Show All 602 Lines • Show Last 20 Lines