This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
openmp/runtime/
-
runtime/
-
src/
-
kmp.h
-
kmp_affinity.cpp
-
kmp_dispatch.h
3
kmp_dispatch.cpp
-
kmp_global.cpp
-
test/worksharing/for/
-
worksharing/
-
for/
-
omp_for_schedule_dynamic.c

Differential D152955

[OpenMP] Add skewed iteration distribution in static-steal schedule for hybrid systems
ClosedPublic

Authored by jlpeyton on Jun 14 2023, 1:35 PM.

Download Raw Diff

Details

Reviewers

tlwilmar
hbae
Nawrin
jdoerfert

Commits

rG5cc603cb2244: [OpenMP] Add skewed iteration distribution on hybrid systems (#69946)

Summary

This commit adds a skewed distribution of iterations in the dynamic
schedule (static steal) for hybrid systems when thread affinity is assigned.
Currently, it distributes the iterations at a 60:40 ratio. Consider this loop
with a dynamic schedule type: for (int i = 0; i < 100; ++i)

In a hybrid system with 20 processors (16 CORE and 4 ATOM), 88 iterations
will be assigned to the performance cores and 12 iterations will be assigned to
the efficiency cores. Each thread on a CORE core will process 5
iterations + extras, and each thread on an ATOM core will process 3 iterations.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jlpeyton created this revision.Jun 14 2023, 1:35 PM

Herald added a project: Restricted Project. · View Herald TranscriptJun 14 2023, 1:35 PM

Herald added subscribers: sunshaoce, guansong, yaxunl. · View Herald Transcript

jlpeyton requested review of this revision.Jun 14 2023, 1:35 PM

Herald added a reviewer: jdoerfert. · View Herald TranscriptJun 14 2023, 1:35 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: openmp-commits, jplehr, sstefan1. · View Herald Transcript

Harbormaster completed remote builds in B238930: Diff 531488.Jun 14 2023, 1:39 PM

Base this revision on https://reviews.llvm.org/D156727 so that topology information is consistent, and fix the accidental use of ds_gtid instead of ds_tid.

Harbormaster completed remote builds in B249287: Diff 545757.Jul 31 2023, 12:34 PM

can we have a test for this?

openmp/runtime/src/kmp_dispatch.cpp
96	floats are not double precision
154	Probably easier to read if the non weighted case was handled first.
547	Is it really helping that we use a macro to elide the code? It makes the impl more complex for sure.

If it's better to move this to GitHub, I can do that. Sorry, I had forgotten about this.

The test case still needs to be run on an x86 machine with a hybrid architecture (e.g., an Alder Lake machine), but it is now enabled.

I've compartmentalized the macro-guarded code for better readability. It's hard to remove the guard completely, since the code relies on x86-specific guards (e.g., the use of KMP_HW_CORE_TYPE_ATOM).

Harbormaster completed remote builds in B257811: Diff 557677.Oct 10 2023, 3:33 PM

This revision was not accepted when it landed; it landed in state Needs Review.Nov 8 2023, 8:19 AM

Closed by commit rG5cc603cb2244: [OpenMP] Add skewed iteration distribution on hybrid systems (#69946) (authored by jlpeyton, committed by GitHub <noreply@github.com>). · Explain Why

This revision was automatically updated to reflect the committed changes.

GitHub <noreply@github.com> added a commit: rG5cc603cb2244: [OpenMP] Add skewed iteration distribution on hybrid systems (#69946).

Revision Contents

Path

Size

openmp/

runtime/

src/

66 lines

38 lines

14 lines

207 lines

3 lines

test/

worksharing/

for/

omp_for_schedule_dynamic.c

1 line

Diff 557677

openmp/runtime/src/kmp.h

Show All 21 Lines	/* This fix replaces gettimeofday with clock_gettime for better scalability on
the Altix. Requires user code to be linked with -lrt. */		the Altix. Requires user code to be linked with -lrt. */
//#define FIX_SGI_CLOCK		//#define FIX_SGI_CLOCK

/* Defines for OpenMP 3.0 tasking and auto scheduling */		/* Defines for OpenMP 3.0 tasking and auto scheduling */

#ifndef KMP_STATIC_STEAL_ENABLED		#ifndef KMP_STATIC_STEAL_ENABLED
#define KMP_STATIC_STEAL_ENABLED 1		#define KMP_STATIC_STEAL_ENABLED 1
#endif		#endif
		#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \
		(KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \
		(KMP_ARCH_X86 \|\| KMP_ARCH_X86_64))

#define TASK_CURRENT_NOT_QUEUED 0		#define TASK_CURRENT_NOT_QUEUED 0
#define TASK_CURRENT_QUEUED 1		#define TASK_CURRENT_QUEUED 1

#ifdef BUILD_TIED_TASK_STACK		#ifdef BUILD_TIED_TASK_STACK
#define TASK_STACK_EMPTY 0 // entries when the stack is empty		#define TASK_STACK_EMPTY 0 // entries when the stack is empty
#define TASK_STACK_BLOCK_BITS 5 // Used in TASK_STACK_SIZE and TASK_STACK_MASK		#define TASK_STACK_BLOCK_BITS 5 // Used in TASK_STACK_SIZE and TASK_STACK_MASK
// Number of entries in each task stack array		// Number of entries in each task stack array
▲ Show 20 Lines • Show All 838 Lines • ▼ Show 20 Lines	typedef struct kmp_affinity_flags_t {
unsigned core_types_gran : 1;		unsigned core_types_gran : 1;
unsigned core_effs_gran : 1;		unsigned core_effs_gran : 1;
unsigned omp_places : 1;		unsigned omp_places : 1;
unsigned reserved : 22;		unsigned reserved : 22;
} kmp_affinity_flags_t;		} kmp_affinity_flags_t;
KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);		KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);

typedef struct kmp_affinity_ids_t {		typedef struct kmp_affinity_ids_t {
		int os_id;
int ids[KMP_HW_LAST];		int ids[KMP_HW_LAST];
int operator[](size_t idx) const { return ids[idx]; }
int &operator[](size_t idx) { return ids[idx]; }
kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) {
for (int i = 0; i < KMP_HW_LAST; ++i)
ids[i] = rhs[i];
return *this;
}
} kmp_affinity_ids_t;		} kmp_affinity_ids_t;

typedef struct kmp_affinity_attrs_t {		typedef struct kmp_affinity_attrs_t {
int core_type : 8;		int core_type : 8;
int core_eff : 8;		int core_eff : 8;
unsigned valid : 1;		unsigned valid : 1;
unsigned reserved : 15;		unsigned reserved : 15;
} kmp_affinity_attrs_t;		} kmp_affinity_attrs_t;
Show All 33 Lines
extern kmp_affinity_t *__kmp_affinities[2];		extern kmp_affinity_t *__kmp_affinities[2];

extern void __kmp_affinity_bind_thread(int which);		extern void __kmp_affinity_bind_thread(int which);

extern kmp_affin_mask_t *__kmp_affin_fullMask;		extern kmp_affin_mask_t *__kmp_affin_fullMask;
extern kmp_affin_mask_t *__kmp_affin_origMask;		extern kmp_affin_mask_t *__kmp_affin_origMask;
extern char *__kmp_cpuinfo_file;		extern char *__kmp_cpuinfo_file;

		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
		extern int __kmp_first_osid_with_ecore;
		#endif

#endif /* KMP_AFFINITY_SUPPORTED */		#endif /* KMP_AFFINITY_SUPPORTED */

// This needs to be kept in sync with the values in omp.h !!!		// This needs to be kept in sync with the values in omp.h !!!
typedef enum kmp_proc_bind_t {		typedef enum kmp_proc_bind_t {
proc_bind_false = 0,		proc_bind_false = 0,
proc_bind_true,		proc_bind_true,
proc_bind_primary,		proc_bind_primary,
proc_bind_close,		proc_bind_close,
▲ Show 20 Lines • Show All 882 Lines • ▼ Show 20 Lines
#endif		#endif

typedef struct kmp_sched_flags {		typedef struct kmp_sched_flags {
unsigned ordered : 1;		unsigned ordered : 1;
unsigned nomerge : 1;		unsigned nomerge : 1;
unsigned contains_last : 1;		unsigned contains_last : 1;
#if KMP_USE_HIER_SCHED		#if KMP_USE_HIER_SCHED
unsigned use_hier : 1;		unsigned use_hier : 1;
		#if KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED
		unsigned use_hybrid : 1;
		unsigned unused : 27;
		#else
		unsigned unused : 28;
		#endif
		#elif KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED
		unsigned use_hybrid : 1;
unsigned unused : 28;		unsigned unused : 28;
#else		#else
unsigned unused : 29;		unsigned unused : 29;
#endif		#endif
} kmp_sched_flags_t;		} kmp_sched_flags_t;

KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);		KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);

#if KMP_STATIC_STEAL_ENABLED		#if KMP_STATIC_STEAL_ENABLED
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {		typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 count;		kmp_int32 count;
kmp_int32 ub;		kmp_int32 ub;
/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */		/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
kmp_int32 lb;		kmp_int32 lb;
kmp_int32 st;		kmp_int32 st;
kmp_int32 tc;		kmp_int32 tc;
kmp_lock_t *steal_lock; // lock used for chunk stealing		kmp_lock_t *steal_lock; // lock used for chunk stealing

		kmp_uint32 ordered_lower;
		kmp_uint32 ordered_upper;

// KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)		// KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
// a) parm3 is properly aligned and		// a) parm3 is properly aligned and
// b) all parm1-4 are on the same cache line.		// b) all parm1-4 are on the same cache line.
// Because of parm1-4 are used together, performance seems to be better		// Because of parm1-4 are used together, performance seems to be better
// if they are on the same cache line (not measured though).		// if they are on the same cache line (not measured though).

struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template		struct KMP_ALIGN(32) {
kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should		kmp_int32 parm1;
kmp_int32 parm2; // make no real change at least while padding is off.		kmp_int32 parm2;
kmp_int32 parm3;		kmp_int32 parm3;
kmp_int32 parm4;		kmp_int32 parm4;
};		};

kmp_uint32 ordered_lower;		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
kmp_uint32 ordered_upper;		kmp_uint32 pchunks;
		kmp_uint32 num_procs_with_pcore;
		kmp_int32 first_thread_with_ecore;
		#endif
#if KMP_OS_WINDOWS		#if KMP_OS_WINDOWS
kmp_int32 last_upper;		kmp_int32 last_upper;
#endif /* KMP_OS_WINDOWS */		#endif /* KMP_OS_WINDOWS */
} dispatch_private_info32_t;		} dispatch_private_info32_t;

		#if CACHE_LINE <= 128
		KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
		#endif

typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {		typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 count; // current chunk number for static & static-steal scheduling		kmp_int64 count; // current chunk number for static & static-steal scheduling
kmp_int64 ub; /* upper-bound */		kmp_int64 ub; /* upper-bound */
/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */		/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
kmp_int64 lb; /* lower-bound */		kmp_int64 lb; /* lower-bound */
kmp_int64 st; /* stride */		kmp_int64 st; /* stride */
kmp_int64 tc; /* trip count (number of iterations) */		kmp_int64 tc; /* trip count (number of iterations) */
kmp_lock_t *steal_lock; // lock used for chunk stealing		kmp_lock_t *steal_lock; // lock used for chunk stealing

		kmp_uint64 ordered_lower;
		kmp_uint64 ordered_upper;
/* parm[1-4] are used in different ways by different scheduling algorithms */		/* parm[1-4] are used in different ways by different scheduling algorithms */

// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )		// KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and		// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.		// b) all parm1-4 are in the same cache line.
// Because of parm1-4 are used together, performance seems to be better		// Because of parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).		// if they are in the same line (not measured though).

struct KMP_ALIGN(32) {		struct KMP_ALIGN(32) {
kmp_int64 parm1;		kmp_int64 parm1;
kmp_int64 parm2;		kmp_int64 parm2;
kmp_int64 parm3;		kmp_int64 parm3;
kmp_int64 parm4;		kmp_int64 parm4;
};		};

kmp_uint64 ordered_lower;		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
kmp_uint64 ordered_upper;		kmp_uint64 pchunks;
		kmp_uint64 num_procs_with_pcore;
		kmp_int64 first_thread_with_ecore;
		#endif

#if KMP_OS_WINDOWS		#if KMP_OS_WINDOWS
kmp_int64 last_upper;		kmp_int64 last_upper;
#endif /* KMP_OS_WINDOWS */		#endif /* KMP_OS_WINDOWS */
} dispatch_private_info64_t;		} dispatch_private_info64_t;

		#if CACHE_LINE <= 128
		KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
		#endif

#else /* KMP_STATIC_STEAL_ENABLED */		#else /* KMP_STATIC_STEAL_ENABLED */
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {		typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 lb;		kmp_int32 lb;
kmp_int32 ub;		kmp_int32 ub;
kmp_int32 st;		kmp_int32 st;
kmp_int32 tc;		kmp_int32 tc;

kmp_int32 parm1;		kmp_int32 parm1;
▲ Show 20 Lines • Show All 1,887 Lines • ▼ Show 20 Lines
extern void __kmp_affinity_determine_capable(const char *env_var);		extern void __kmp_affinity_determine_capable(const char *env_var);
extern int __kmp_aux_set_affinity(void **mask);		extern int __kmp_aux_set_affinity(void **mask);
extern int __kmp_aux_get_affinity(void **mask);		extern int __kmp_aux_get_affinity(void **mask);
extern int __kmp_aux_get_affinity_max_proc();		extern int __kmp_aux_get_affinity_max_proc();
extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);		extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);		extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);		extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);		extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
		extern int __kmp_get_first_osid_with_ecore(void);
		#endif
#if KMP_OS_LINUX \|\| KMP_OS_FREEBSD		#if KMP_OS_LINUX \|\| KMP_OS_FREEBSD
extern int kmp_set_thread_affinity_mask_initial(void);		extern int kmp_set_thread_affinity_mask_initial(void);
#endif		#endif
static inline void __kmp_assign_root_init_mask() {		static inline void __kmp_assign_root_init_mask() {
int gtid = __kmp_entry_gtid();		int gtid = __kmp_entry_gtid();
kmp_root_t *r = __kmp_threads[gtid]->th.th_root;		kmp_root_t *r = __kmp_threads[gtid]->th.th_root;
if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) {		if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) {
__kmp_affinity_set_init_mask(gtid, /isa_root=/TRUE);		__kmp_affinity_set_init_mask(gtid, /isa_root=/TRUE);
▲ Show 20 Lines • Show All 912 Lines • Show Last 20 Lines

openmp/runtime/src/kmp_affinity.cpp

Show First 20 Lines • Show All 4,157 Lines • ▼ Show 20 Lines
static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,		static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
kmp_affinity_ids_t &ids,		kmp_affinity_ids_t &ids,
kmp_affinity_attrs_t &attrs) {		kmp_affinity_attrs_t &attrs) {
if (!KMP_AFFINITY_CAPABLE())		if (!KMP_AFFINITY_CAPABLE())
return;		return;

// Initiailze ids and attrs thread data		// Initiailze ids and attrs thread data
for (int i = 0; i < KMP_HW_LAST; ++i)		for (int i = 0; i < KMP_HW_LAST; ++i)
ids[i] = kmp_hw_thread_t::UNKNOWN_ID;		ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
attrs = KMP_AFFINITY_ATTRS_UNKNOWN;		attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

// Iterate through each os id within the mask and determine		// Iterate through each os id within the mask and determine
// the topology id and attribute information		// the topology id and attribute information
int cpu;		int cpu;
int depth = __kmp_topology->get_depth();		int depth = __kmp_topology->get_depth();
KMP_CPU_SET_ITERATE(cpu, mask) {		KMP_CPU_SET_ITERATE(cpu, mask) {
int osid_idx = __kmp_osid_to_hwthread_map[cpu];		int osid_idx = __kmp_osid_to_hwthread_map[cpu];
		ids.os_id = cpu;
const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);		const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
for (int level = 0; level < depth; ++level) {		for (int level = 0; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);		kmp_hw_t type = __kmp_topology->get_type(level);
int id = hw_thread.sub_ids[level];		int id = hw_thread.sub_ids[level];
if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID \|\| ids[type] == id) {		if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID \|\| ids.ids[type] == id) {
ids[type] = id;		ids.ids[type] = id;
} else {		} else {
// This mask spans across multiple topology units, set it as such		// This mask spans across multiple topology units, set it as such
// and mark every level below as such as well.		// and mark every level below as such as well.
ids[type] = kmp_hw_thread_t::MULTIPLE_ID;		ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
for (; level < depth; ++level) {		for (; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);		kmp_hw_t type = __kmp_topology->get_type(level);
ids[type] = kmp_hw_thread_t::MULTIPLE_ID;		ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
}		}
}		}
}		}
if (!attrs.valid) {		if (!attrs.valid) {
attrs.core_type = hw_thread.attrs.get_core_type();		attrs.core_type = hw_thread.attrs.get_core_type();
attrs.core_eff = hw_thread.attrs.get_core_eff();		attrs.core_eff = hw_thread.attrs.get_core_eff();
attrs.valid = 1;		attrs.valid = 1;
} else {		} else {
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines
}		}

// Called when __kmp_topology is ready		// Called when __kmp_topology is ready
static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {		static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
// Initialize other data structures which depend on the topology		// Initialize other data structures which depend on the topology
if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {		if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
machine_hierarchy.init(__kmp_topology->get_num_hw_threads());		machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
__kmp_affinity_get_topology_info(affinity);		__kmp_affinity_get_topology_info(affinity);
		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
		__kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
		#endif
}		}
}		}

// Create a one element mask array (set of places) which only contains the		// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask		// initial process's affinity mask
static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {		static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
KMP_ASSERT(__kmp_affin_fullMask != NULL);		KMP_ASSERT(__kmp_affin_fullMask != NULL);
KMP_ASSERT(affinity.type == affinity_none);		KMP_ASSERT(affinity.type == affinity_none);
▲ Show 20 Lines • Show All 563 Lines • ▼ Show 20 Lines
// This function initializes the per-thread data concerning affinity including		// This function initializes the per-thread data concerning affinity including
// the mask and topology information		// the mask and topology information
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {		void __kmp_affinity_set_init_mask(int gtid, int isa_root) {

kmp_info_t th = (kmp_info_t )TCR_SYNC_PTR(__kmp_threads[gtid]);		kmp_info_t th = (kmp_info_t )TCR_SYNC_PTR(__kmp_threads[gtid]);

// Set the thread topology information to default of unknown		// Set the thread topology information to default of unknown
for (int id = 0; id < KMP_HW_LAST; ++id)		for (int id = 0; id < KMP_HW_LAST; ++id)
th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;		th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;		th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

if (!KMP_AFFINITY_CAPABLE()) {		if (!KMP_AFFINITY_CAPABLE()) {
return;		return;
}		}

if (th->th.th_affin_mask == NULL) {		if (th->th.th_affin_mask == NULL) {
KMP_CPU_ALLOC(th->th.th_affin_mask);		KMP_CPU_ALLOC(th->th.th_affin_mask);
▲ Show 20 Lines • Show All 380 Lines • ▼ Show 20 Lines	int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
}		}
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {		if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
return 0;		return 0;
}		}

return KMP_CPU_ISSET(proc, (kmp_affin_mask_t )(mask));		return KMP_CPU_ISSET(proc, (kmp_affin_mask_t )(mask));
}		}

		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
		// Returns the index (as accepted by __kmp_topology->at()) of the first
		// hardware thread whose core type is KMP_HW_CORE_TYPE_ATOM, or -1 when the
		// first non-CORE entry is missing or is not of ATOM type.
		//
		// The lookup is a lower-bound binary search, so it assumes every
		// KMP_HW_CORE_TYPE_CORE entry precedes every non-CORE entry in the topology
		// (NOTE(review): the ordering is not established here -- confirm).
		//
		// NOTE(review): callers compare the result against a thread's os_id, which
		// matches only if the topology index equals the OS proc id -- confirm.
		int __kmp_get_first_osid_with_ecore(void) {
		  const int num_hw_threads = __kmp_topology->get_num_hw_threads();
		  int low = 0;
		  int high = num_hw_threads; // exclusive upper bound of the search window
		  // Invariant: everything before `low` is CORE, everything from `high` on is
		  // not CORE. The loop ends with low == high at the first non-CORE entry,
		  // instead of stopping one step early and inspecting a stale midpoint.
		  while (low < high) {
		    const int mid = low + (high - low) / 2;
		    if (__kmp_topology->at(mid).attrs.get_core_type() ==
		        KMP_HW_CORE_TYPE_CORE) {
		      low = mid + 1;
		    } else {
		      high = mid;
		    }
		  }
		  // low == num_hw_threads means no non-CORE entry exists (this also covers an
		  // empty topology, so at() is never called out of range).
		  if (low < num_hw_threads &&
		      __kmp_topology->at(low).attrs.get_core_type() ==
		          KMP_HW_CORE_TYPE_ATOM) {
		    return low;
		  }
		  return -1;
		}
		#endif

// Dynamic affinity settings - Affinity balanced		// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {		void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_DEBUG_ASSERT(th);		KMP_DEBUG_ASSERT(th);
bool fine_gran = true;		bool fine_gran = true;
int tid = th->th.th_info.ds.ds_tid;		int tid = th->th.th_info.ds.ds_tid;
const char *env_var = "KMP_AFFINITY";		const char *env_var = "KMP_AFFINITY";

// Do not perform balanced affinity for the hidden helper threads		// Do not perform balanced affinity for the hidden helper threads
▲ Show 20 Lines • Show All 279 Lines • Show Last 20 Lines

openmp/runtime/src/kmp_dispatch.h

Show First 20 Lines • Show All 69 Lines • ▼ Show 20 Lines	template <typename T> struct dispatch_private_infoXX_template {
typedef typename traits_t<T>::signed_t ST;		typedef typename traits_t<T>::signed_t ST;
UT count; // unsigned		UT count; // unsigned
T ub;		T ub;
/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */		/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
T lb;		T lb;
ST st; // signed		ST st; // signed
UT tc; // unsigned		UT tc; // unsigned
kmp_lock_t *steal_lock; // lock used for chunk stealing		kmp_lock_t *steal_lock; // lock used for chunk stealing

		UT ordered_lower; // unsigned
		UT ordered_upper; // unsigned

/* parm[1-4] are used in different ways by different scheduling algorithms */		/* parm[1-4] are used in different ways by different scheduling algorithms */

// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )		// KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and		// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.		// b) all parm1-4 are in the same cache line.
// Because of parm1-4 are used together, performance seems to be better		// Because of parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).		// if they are in the same line (not measured though).

struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4		struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
T parm1;		T parm1;
T parm2;		T parm2;
T parm3;		T parm3;
T parm4;		T parm4;
};		};

UT ordered_lower; // unsigned		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
UT ordered_upper; // unsigned		UT pchunks; // total number of chunks for processes with p-core
		UT num_procs_with_pcore; // number of threads with p-core
		T first_thread_with_ecore;
		#endif
#if KMP_OS_WINDOWS		#if KMP_OS_WINDOWS
T last_upper;		T last_upper;
#endif /* KMP_OS_WINDOWS */		#endif /* KMP_OS_WINDOWS */
};		};

#else /* KMP_STATIC_STEAL_ENABLED */		#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and		// replaces dispatch_private_info{32,64} structures and
▲ Show 20 Lines • Show All 405 Lines • Show Last 20 Lines

openmp/runtime/src/kmp_dispatch.cpp

Show First 20 Lines • Show All 84 Lines • ▼ Show 20 Lines	static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
else if (SCHEDULE_HAS_NONMONOTONIC(schedule))		else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
monotonicity = SCHEDULE_NONMONOTONIC;		monotonicity = SCHEDULE_NONMONOTONIC;
else if (SCHEDULE_HAS_MONOTONIC(schedule))		else if (SCHEDULE_HAS_MONOTONIC(schedule))
monotonicity = SCHEDULE_MONOTONIC;		monotonicity = SCHEDULE_MONOTONIC;

return monotonicity;		return monotonicity;
}		}

		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
		// Rounds a non-negative value to two decimal places: scale by 100, add one
		// half (in double precision, because of the 0.5 literal), truncate to int,
		// and scale back down.
		static inline float __kmp_get_float_val(float num) {
		  const int hundredths = static_cast<int>(num * 100 + 0.5);
		  return (float)hundredths / 100;
		}
		jdoerfertUnsubmitted Not Done Reply Inline Actions floats are not double precision jdoerfert: floats are not double precision
		// Rounds to the nearest integer, with halves going away from zero: shift by
		// one half toward the value's own sign (in double precision), then truncate.
		static inline int __kmp_get_round_val(float num) {
		  if (num < 0) {
		    return static_cast<int>(num - 0.5);
		  }
		  return static_cast<int>(num + 0.5);
		}
		#endif

		// Computes the chunk-range parameters for the buffer belonging to thread
		// `id`, writing the results through the reference parameters.
		//
		//   team        - team used to map `id` to a global thread id (hybrid path).
		//   id          - team-local thread number whose range is being computed.
		//   pr          - dispatch buffer; pr->flags.use_hybrid selects the path and
		//                 pr->u.p supplies pchunks, num_procs_with_pcore and
		//                 first_thread_with_ecore on the hybrid path.
		//   nchunks     - total number of chunks to distribute.
		//   nproc       - number of threads sharing the chunks.
		//   init        - [out] index of the first chunk for thread `id`.
		//   small_chunk - [out] chunks per thread (per e-core thread when hybrid).
		//   extras      - [out] leftover chunks after the even division(s).
		//   p_extra     - [out] additional chunks for a CORE-type thread; 0 otherwise.
		//
		// The visible caller sets the upper bound to
		// init + small_chunk + p_extra + (id < extras ? 1 : 0).
		template <typename T>
		inline void
		__kmp_initialize_self_buffer(kmp_team_t *team, T id,
		dispatch_private_info_template<T> *pr,
		typename traits_t<T>::unsigned_t nchunks, T nproc,
		typename traits_t<T>::unsigned_t &init,
		T &small_chunk, T &extras, T &p_extra) {

		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
		// Weighted (hybrid) distribution: p-core and e-core threads get different
		// per-thread chunk counts.
		if (pr->flags.use_hybrid) {
		// Core type of the thread that owns buffer `id`.
		kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
		kmp_hw_core_type_t type =
		(kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
		T pchunks = pr->u.p.pchunks;
		// Whatever is not reserved for p-core threads goes to e-core threads.
		T echunks = nchunks - pchunks;
		T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
		T num_procs_with_ecore = nproc - num_procs_with_pcore;
		T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
		T big_chunk =
		pchunks / num_procs_with_pcore; // chunks per thread with p-core
		small_chunk =
		echunks / num_procs_with_ecore; // chunks per thread with e-core

		// Remainders of both divisions are pooled into a single extras count.
		extras =
		(pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);

		// Difference between a p-core thread's share and an e-core thread's share.
		p_extra = (big_chunk - small_chunk);

		// The starting chunk is id * small_chunk, plus p_extra for a number of
		// threads that depends on the core type and on where `id` sits relative to
		// first_thread_with_ecore, plus min(id, extras).
		if (type == KMP_HW_CORE_TYPE_CORE) {
		if (id < first_thread_with_ecore) {
		init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
		} else {
		// p_extra is scaled by (id - num_procs_with_ecore) on this branch.
		init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
		(id < extras ? id : extras);
		}
		} else {
		if (id == first_thread_with_ecore) {
		init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
		} else {
		// p_extra is scaled by first_thread_with_ecore on this branch.
		init = id * small_chunk + first_thread_with_ecore * p_extra +
		(id < extras ? id : extras);
		}
		}
		// Only CORE-type threads report a non-zero p_extra to the caller.
		p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
		return;
		}
		#endif

		// Uniform distribution: every thread gets small_chunk chunks and the first
		// `extras` threads get one more, so thread `id` starts after min(id, extras)
		// of those additional chunks.
		small_chunk = nchunks / nproc; // chunks per thread
		extras = nchunks % nproc;
		p_extra = 0;
		init = id * small_chunk + (id < extras ? id : extras);
		}
		jdoerfertUnsubmitted Not Done Reply Inline Actions Probably easier to read if the non weighted case was handled first. jdoerfert: Probably easier to read if the non weighted case was handled first.

#if KMP_STATIC_STEAL_ENABLED		#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)		enum { // values for steal_flag (possible states of private per-loop buffer)
UNUSED = 0,		UNUSED = 0,
CLAIMED = 1, // owner thread started initialization		CLAIMED = 1, // owner thread started initialization
READY = 2, // available for stealing		READY = 2, // available for stealing
THIEF = 3 // finished by owner, or claimed by thief		THIEF = 3 // finished by owner, or claimed by thief
// possible state changes:		// possible state changes:
// 0 -> 1 owner only, sync		// 0 -> 1 owner only, sync
▲ Show 20 Lines • Show All 260 Lines • ▼ Show 20 Lines	if (pr->flags.ordered) {
pr->u.p.ordered_lower = 1;		pr->u.p.ordered_lower = 1;
pr->u.p.ordered_upper = 0;		pr->u.p.ordered_upper = 0;
}		}
}		}

switch (schedule) {		switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED		#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {		case kmp_sch_static_steal: {
T ntc, init;		T ntc, init = 0;

KD_TRACE(100,		KD_TRACE(100,
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",		("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
gtid));		gtid));

ntc = (tc % chunk ? 1 : 0) + tc / chunk;		ntc = (tc % chunk ? 1 : 0) + tc / chunk;
if (nproc > 1 && ntc >= nproc) {		if (nproc > 1 && ntc >= nproc) {
KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);		KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
T id = tid;		T id = tid;
T small_chunk, extras;		T small_chunk, extras, p_extra = 0;
kmp_uint32 old = UNUSED;		kmp_uint32 old = UNUSED;
int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);		int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
if (traits_t<T>::type_size > 4) {		if (traits_t<T>::type_size > 4) {
// AC: TODO: check if 16-byte CAS available and use it to		// AC: TODO: check if 16-byte CAS available and use it to
// improve performance (probably wait for explicit request		// improve performance (probably wait for explicit request
// before spending time on this).		// before spending time on this).
// For now use dynamically allocated per-private-buffer lock,		// For now use dynamically allocated per-private-buffer lock,
// free memory in __kmp_dispatch_next when status==0.		// free memory in __kmp_dispatch_next when status==0.
pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));		pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
__kmp_init_lock(pr->u.p.steal_lock);		__kmp_init_lock(pr->u.p.steal_lock);
}		}

		#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
		// Iterations are divided in a 60/40 skewed distribution among CORE and
		// ATOM processors for hybrid systems
		bool use_hybrid = false;
		kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
		T first_thread_with_ecore = 0;
		T num_procs_with_pcore = 0;
		T num_procs_with_ecore = 0;
		T p_ntc = 0, e_ntc = 0;
		if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
		__kmp_affinity.type != affinity_explicit) {
		use_hybrid = true;
		core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
		if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
		__kmp_first_osid_with_ecore > -1) {
		for (int i = 0; i < team->t.t_nproc; ++i) {
		kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
		->th.th_topology_attrs.core_type;
		int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
		if (id == __kmp_first_osid_with_ecore) {
		first_thread_with_ecore =
		team->t.t_threads[i]->th.th_info.ds.ds_tid;
		}
		if (type == KMP_HW_CORE_TYPE_CORE) {
		num_procs_with_pcore++;
		} else if (type == KMP_HW_CORE_TYPE_ATOM) {
		num_procs_with_ecore++;
		} else {
		use_hybrid = false;
		break;
		}
		}
		}
		if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
		float multiplier = 60.0 / 40.0;
		float p_ratio = (float)num_procs_with_pcore / nproc;
		float e_ratio = (float)num_procs_with_ecore / nproc;
		float e_multiplier =
		(float)1 /
		(((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
		float p_multiplier = multiplier * e_multiplier;
		p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
		if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
		e_ntc = (int)(__kmp_get_float_val(ntc * e_ratio * e_multiplier));
		else
		e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
		KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);

		// Use regular static steal if not enough chunks for skewed
		// distribution
		use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
		e_ntc >= num_procs_with_ecore)
		? true
		: false);
		} else {
		use_hybrid = false;
		}
		}
		pr->flags.use_hybrid = use_hybrid;
		pr->u.p.pchunks = p_ntc;
		pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
		pr->u.p.first_thread_with_ecore = first_thread_with_ecore;

		if (use_hybrid) {
		KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
		T big_chunk = p_ntc / num_procs_with_pcore;
		small_chunk = e_ntc / num_procs_with_ecore;

		extras =
		(p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);

		p_extra = (big_chunk - small_chunk);

		if (core_type == KMP_HW_CORE_TYPE_CORE) {
		if (id < first_thread_with_ecore) {
		init =
		id * small_chunk + id * p_extra + (id < extras ? id : extras);
		} else {
		init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
		(id < extras ? id : extras);
		}
		} else {
		if (id == first_thread_with_ecore) {
		init =
		id * small_chunk + id * p_extra + (id < extras ? id : extras);
		} else {
		init = id * small_chunk + first_thread_with_ecore * p_extra +
		(id < extras ? id : extras);
		}
		}
		p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
		} else
		#endif
		jdoerfertUnsubmitted Not Done Reply Inline Actions Is it really helping that we use a macro to elide the code? It makes the impl more complex for sure. jdoerfert: Is it really helping that we use a macro to elide the code? It makes the impl more complex for…
		{
small_chunk = ntc / nproc;		small_chunk = ntc / nproc;
extras = ntc % nproc;		extras = ntc % nproc;

init = id * small_chunk + (id < extras ? id : extras);		init = id * small_chunk + (id < extras ? id : extras);
		p_extra = 0;
		}
pr->u.p.count = init;		pr->u.p.count = init;
if (claimed) { // are we succeeded in claiming own buffer?		if (claimed) { // are we succeeded in claiming own buffer?
pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);		pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// Other threads will inspect steal_flag when searching for a victim.		// Other threads will inspect steal_flag when searching for a victim.
// READY means other threads may steal from this thread from now on.		// READY means other threads may steal from this thread from now on.
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);		KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
} else {		} else {
// other thread has stolen whole our range		// other thread has stolen whole our range
KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);		KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
pr->u.p.ub = init; // mark there is no iterations to work on		pr->u.p.ub = init; // mark there is no iterations to work on
}		}
▲ Show 20 Lines • Show All 850 Lines • ▼ Show 20 Lines	if (traits_t<T>::type_size > 4) {
}		}
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {		if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
kmp_uint32 old = UNUSED;		kmp_uint32 old = UNUSED;
// try to steal whole range from inactive victim		// try to steal whole range from inactive victim
status = v->steal_flag.compare_exchange_strong(old, THIEF);		status = v->steal_flag.compare_exchange_strong(old, THIEF);
if (status) {		if (status) {
// initialize self buffer with victim's whole range of chunks		// initialize self buffer with victim's whole range of chunks
T id = victimId;		T id = victimId;
T small_chunk, extras;		T small_chunk = 0, extras = 0, p_extra = 0;
small_chunk = nchunks / nproc; // chunks per thread		__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
extras = nchunks % nproc;		init, small_chunk, extras,
init = id * small_chunk + (id < extras ? id : extras);		p_extra);
__kmp_acquire_lock(lck, gtid);		__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1; // exclude one we execute immediately		pr->u.p.count = init + 1; // exclude one we execute immediately
pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);		pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
__kmp_release_lock(lck, gtid);		__kmp_release_lock(lck, gtid);
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid		pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
// no need to reinitialize other thread invariants: lb, st, etc.		// no need to reinitialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG		#ifdef KMP_DEBUG
{		{
char *buff;		char *buff;
// create format specifiers before the debug output		// create format specifiers before the debug output
buff = __kmp_str_format(		buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "		"stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",		"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<T>::spec);		traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));		KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);		__kmp_str_free(&buff);
}		}
#endif		#endif
// activate non-empty buffer and let others steal from us		// activate non-empty buffer and let others steal from us
if (pr->u.p.count < (UT)pr->u.p.ub)		if (pr->u.p.count < (UT)pr->u.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);		KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;		break;
▲ Show 20 Lines • Show All 109 Lines • ▼ Show 20 Lines	#endif
}		}
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {		if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
kmp_uint32 old = UNUSED;		kmp_uint32 old = UNUSED;
// try to steal whole range from inactive victim		// try to steal whole range from inactive victim
status = v->steal_flag.compare_exchange_strong(old, THIEF);		status = v->steal_flag.compare_exchange_strong(old, THIEF);
if (status) {		if (status) {
// initialize self buffer with victim's whole range of chunks		// initialize self buffer with victim's whole range of chunks
T id = victimId;		T id = victimId;
T small_chunk, extras;		T small_chunk = 0, extras = 0, p_extra = 0;
small_chunk = nchunks / nproc; // chunks per thread		__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
extras = nchunks % nproc;		init, small_chunk, extras,
init = id * small_chunk + (id < extras ? id : extras);		p_extra);
vnew.p.count = init + 1;		vnew.p.count = init + 1;
vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);		vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// write pair (count, ub) at once atomically		// write pair (count, ub) at once atomically
#if KMP_ARCH_X86		#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);		KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
#else		#else
*(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;		*(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
#endif		#endif
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid		pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
// no need to initialize other thread invariants: lb, st, etc.		// no need to initialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG		#ifdef KMP_DEBUG
{		{
char *buff;		char *buff;
// create format specifiers before the debug output		// create format specifiers before the debug output
buff = __kmp_str_format(		buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "		"stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",		"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<T>::spec);		traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));		KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);		__kmp_str_free(&buff);
}		}
#endif		#endif
// activate non-empty buffer and let others steal from us		// activate non-empty buffer and let others steal from us
if (pr->u.p.count < (UT)pr->u.p.ub)		if (pr->u.p.count < (UT)pr->u.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);		KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;		break;
▲ Show 20 Lines • Show All 1,545 Lines • Show Last 20 Lines

openmp/runtime/src/kmp_global.cpp

	Show First 20 Lines • Show All 275 Lines • ▼ Show 20 Lines
	// Regular thread affinity settings from KMP_AFFINITY			// Regular thread affinity settings from KMP_AFFINITY
	kmp_affinity_t __kmp_affinity = KMP_AFFINITY_INIT("KMP_AFFINITY");			kmp_affinity_t __kmp_affinity = KMP_AFFINITY_INIT("KMP_AFFINITY");
	// Hidden helper thread affinity settings from KMP_HIDDEN_HELPER_AFFINITY			// Hidden helper thread affinity settings from KMP_HIDDEN_HELPER_AFFINITY
	kmp_affinity_t __kmp_hh_affinity =			kmp_affinity_t __kmp_hh_affinity =
	KMP_AFFINITY_INIT("KMP_HIDDEN_HELPER_AFFINITY");			KMP_AFFINITY_INIT("KMP_HIDDEN_HELPER_AFFINITY");
	kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity};			kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity};

	char *__kmp_cpuinfo_file = NULL;			char *__kmp_cpuinfo_file = NULL;
				#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
				int __kmp_first_osid_with_ecore = -1;
				#endif

	#endif /* KMP_AFFINITY_SUPPORTED */			#endif /* KMP_AFFINITY_SUPPORTED */

	kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0};			kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0};
	kmp_proc_bind_t __kmp_teams_proc_bind = proc_bind_spread;			kmp_proc_bind_t __kmp_teams_proc_bind = proc_bind_spread;
	int __kmp_affinity_num_places = 0;			int __kmp_affinity_num_places = 0;
	int __kmp_display_affinity = FALSE;			int __kmp_display_affinity = FALSE;
	char *__kmp_affinity_format = NULL;			char *__kmp_affinity_format = NULL;
	▲ Show 20 Lines • Show All 283 Lines • Show Last 20 Lines

openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c

	// RUN: %libomp-compile-and-run			// RUN: %libomp-compile-and-run
				// RUN: env KMP_AFFINITY=compact,0 %libomp-run
	/*			/*
	* Test for dynamic scheduling with chunk size			* Test for dynamic scheduling with chunk size
	* Method: calculate how many times the iteration space is dispatched			* Method: calculate how many times the iteration space is dispatched
	* and judge if each dispatch has the requested chunk size			* and judge if each dispatch has the requested chunk size
	* unless it is the last one.			* unless it is the last one.
	* It is possible for two adjacent chunks are assigned to the same thread			* It is possible for two adjacent chunks are assigned to the same thread
	* Modified by Chunhua Liao			* Modified by Chunhua Liao
	*/			*/
	▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines