This is an archive of the discontinued LLVM Phabricator instance.

Differential D11990

Lock-free start of serialized parallel regions
ClosedPublic

Authored by jlpeyton on Aug 12 2015, 1:49 PM.

Download Raw Diff

Details

Reviewers

Summary

Moved the initial checks of num_threads against 1 (the checks that decide whether a serialized team should be used) out of __kmp_reserve_threads(), which is called under the forkjoin lock, and up into its caller, __kmp_fork_call().
As a result, most serialized parallel regions (e.g. nested parallel regions when OMP nesting is disabled) no longer take the forkjoin lock at all.

Diff Detail

Repository: rL LLVM

Event Timeline

jlpeyton updated this revision to Diff 31975.Aug 12 2015, 1:49 PM

jlpeyton retitled this revision from to Lock-free start of serialized parallel regions.

jlpeyton updated this object.

jlpeyton added a reviewer: hfinkel.

jlpeyton set the repository for this revision to rL LLVM.

jlpeyton added subscribers: openmp-commits, AndreyChurbanov.

hfinkel added inline comments.Aug 16 2015, 2:59 AM

runtime/src/kmp_runtime.c
1740	Why do we only release the lock when nthreads == 1? Does __kmp_reserve_threads release it otherwise? (I realize that you've only moved this line from down below, but this seems non-obvious)

AndreyChurbanov added inline comments.Aug 17 2015, 3:50 AM

runtime/src/kmp_runtime.c
1740	No, __kmp_reserve_threads does not release the lock. Let me detail the rationale for the change. Old code: the lock was always acquired at the beginning, then released on line 1756 for nthreads==1, and on line 2168 for nthreads>1 once a lot of multithread-sensitive actions had completed. New code: the lock is skipped for the simple 1-thread cases, but is still acquired for other 1-thread cases (e.g. when serial execution is caused by the dynamic thread adjustment inside __kmp_reserve_threads). As a result, the lock release for the 1-thread case moved here, because it can no longer be done unconditionally for all 1-thread cases (the simple ones never take the lock). The multi-thread case releases the lock in the same place as before. Performance result: a 10x or greater speedup for code like <long loop> #pragma omp parallel #pragma omp parallel, where the inner parallel regions are serialized by default because OMP nesting is disabled and the number of threads in the outer region is large (e.g. 60 threads on Xeon Phi to keep all cores busy).

hfinkel accepted this revision.Aug 18 2015, 1:30 AM

hfinkel edited edge metadata.

hfinkel added inline comments.

runtime/src/kmp_runtime.c
1740	Okay, sounds good. Please add a comment here explaining that in the multi-thread case the lock is released later on in the function. Otherwise, LGTM.

This revision is now accepted and ready to land.Aug 18 2015, 1:30 AM

Diffusion mentioned this in rL245286: D11990: Lock-free start of serialized parallel regions.Aug 18 2015, 3:09 AM

AndreyChurbanov closed this revision.Aug 18 2015, 10:40 AM

Revision Contents

Path

Size

runtime/

src/

kmp_runtime.c

52 lines

Diff 31975

runtime/src/kmp_runtime.c

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 869 Lines • ▼ Show 20 Lines
	)			)
	{			{
	int capacity;			int capacity;
	int new_nthreads;			int new_nthreads;
	KMP_DEBUG_ASSERT( __kmp_init_serial );			KMP_DEBUG_ASSERT( __kmp_init_serial );
	KMP_DEBUG_ASSERT( root && parent_team );			KMP_DEBUG_ASSERT( root && parent_team );

	//			//
	// Initial check to see if we should use a serialized team.
	//
	if ( set_nthreads == 1 ) {
	KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
	__kmp_get_gtid(), set_nthreads ));
	return 1;
	}
	if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
	#if OMP_40_ENABLED
	&& !enter_teams
	#endif /* OMP_40_ENABLED */
	) ) || ( __kmp_library == library_serial ) ) {
	KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
	__kmp_get_gtid(), set_nthreads ));
	return 1;
	}

	//
	// If dyn-var is set, dynamically adjust the number of desired threads,			// If dyn-var is set, dynamically adjust the number of desired threads,
	// according to the method specified by dynamic_mode.			// according to the method specified by dynamic_mode.
	//			//
	new_nthreads = set_nthreads;			new_nthreads = set_nthreads;
	if ( ! get__dynamic_2( parent_team, master_tid ) ) {			if ( ! get__dynamic_2( parent_team, master_tid ) ) {
	;			;
	}			}
	#ifdef USE_LOAD_BALANCE			#ifdef USE_LOAD_BALANCE
	▲ Show 20 Lines • Show All 811 Lines • ▼ Show 20 Lines
	#endif /* OMP_40_ENABLED */			#endif /* OMP_40_ENABLED */

	#if KMP_DEBUG			#if KMP_DEBUG
	if ( __kmp_tasking_mode != tskm_immediate_exec ) {			if ( __kmp_tasking_mode != tskm_immediate_exec ) {
	KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);			KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
	}			}
	#endif			#endif

	/* determine how many new threads we can use */
	__kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );

	if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {			if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
	nthreads = 1;			nthreads = 1;
	} else {			} else {
				#if OMP_40_ENABLED
				int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
				#endif
	nthreads = master_set_numthreads ?			nthreads = master_set_numthreads ?
	master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task			master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task

				// Check if we need to take forkjoin lock? (no need for serialized parallel out of teams construct).
				// This code moved here from __kmp_reserve_threads() to speedup nested serialized parallels.
				if (nthreads > 1) {
				if ( ( !get__nested(master_th) && (root->r.r_in_parallel
				#if OMP_40_ENABLED
				&& !enter_teams
				#endif /* OMP_40_ENABLED */
				) ) || ( __kmp_library == library_serial ) ) {
				KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
				gtid, nthreads ));
				nthreads = 1;
				}
				}
				if ( nthreads > 1 ) {
				/* determine how many new threads we can use */
				__kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );

	nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads			nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
	#if OMP_40_ENABLED			#if OMP_40_ENABLED
	/* AC: If we execute teams from parallel region (on host), then teams should be created			/* AC: If we execute teams from parallel region (on host), then teams should be created
	but each can only have 1 thread if nesting is disabled. If teams called from serial region,			but each can only have 1 thread if nesting is disabled. If teams called from serial region,
	then teams and their threads should be created regardless of the nesting setting. */			then teams and their threads should be created regardless of the nesting setting. */
	, ((ap==NULL && active_level==0) ||			, enter_teams
	(ap && teams_level>0 && teams_level==level))
	#endif /* OMP_40_ENABLED */			#endif /* OMP_40_ENABLED */
	);			);
				if ( nthreads == 1 ) {
				__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
				hfinkelUnsubmitted Not Done Reply Inline Actions Why do we only release the lock when nthreads == 1? Does __kmp_reserve_threads release it otherwise? (I realize that you've only moved this line from down below, but this seems non-obvious) hfinkel: Why do we only release the lock when nthreads == 1? Does __kmp_reserve_threads release it…
				AndreyChurbanovUnsubmitted Not Done Reply Inline Actions No, the __kmp_reserve_threads does not release the lock. Let me detail the rationale of the change: Old code: get lock always at the beginning, then release lock for nthreads==1 on line 1756, for nthreads>1 on line 2168 when a lot of multithread-sensitive actions have completed. New code: lock skipped for simple 1-thread cases, but still got lock for other 1-thread cases (e.g. when serial execution caused by dynamic threads adjustment inside __kmp_reserve_threads). As a result the lock releasing for 1-thread moved here, because it now cannot be done for all 1-thread cases. Multi-thread case releases the lock in the same place as earlier. Performance result - 10x or more speedup of the code like <long loop> #pragma omp parallel #pragma omp parallel where inner parallel regions are serialized by default because OMP nesting is disabled, and number of threads in outer region is big (e.g. 60 threads on Xeon Phi to keep all cores busy). AndreyChurbanov: No, the __kmp_reserve_threads does not release the lock. Let me detail the rationale of the…
				hfinkelUnsubmitted Not Done Reply Inline Actions Okay, sounds good. Please add a comment here explaining that in the multi-thread case the lock is released later on in the function. Otherwise, LGTM. hfinkel: Okay, sounds good. Please add a comment here explaining that in the multi-thread case the lock…
				}
				}
	}			}
	KMP_DEBUG_ASSERT( nthreads > 0 );			KMP_DEBUG_ASSERT( nthreads > 0 );

	/* If we temporarily changed the set number of threads then restore it now */			/* If we temporarily changed the set number of threads then restore it now */
	master_th->th.th_set_nproc = 0;			master_th->th.th_set_nproc = 0;


	/* create a serialized parallel region? */			/* create a serialized parallel region? */
	if ( nthreads == 1 ) {			if ( nthreads == 1 ) {
	/* josh todo: hypothetical question: what do we do for OS X*? */			/* josh todo: hypothetical question: what do we do for OS X*? */
	#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)			#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
	void * args[ argc ];			void * args[ argc ];
	#else			#else
	void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );			void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
	#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */			#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */

	__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
	KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));			KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));

	__kmpc_serialized_parallel(loc, gtid);			__kmpc_serialized_parallel(loc, gtid);

	if ( call_context == fork_context_intel ) {			if ( call_context == fork_context_intel ) {
	/* TODO this sucks, use the compiler itself to pass args! :) */			/* TODO this sucks, use the compiler itself to pass args! :) */
	master_th->th.th_serial_team->t.t_ident = loc;			master_th->th.th_serial_team->t.t_ident = loc;
	#if OMP_40_ENABLED			#if OMP_40_ENABLED
	▲ Show 20 Lines • Show All 5,928 Lines • Show Last 20 Lines