diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -4140,6 +4140,13 @@
 #endif /* USE_ITT_BUILD */
                                kmp_int32 is_constrained);
 
+// Nesting mode (experimental); configured via KMP_NESTING_MODE
+extern int __kmp_nesting_mode; // value parsed from KMP_NESTING_MODE
+extern int __kmp_nesting_mode_nlevels; // number of nesting levels in use
+extern int *__kmp_nesting_nth_level; // number of threads at each level
+extern void __kmp_init_nesting_mode();
+extern void __kmp_set_nesting_mode_threads();
+
 /// This class safely opens and closes a C-style FILE* object using RAII
 /// semantics. There are also methods which allow using stdout or stderr as
 /// the underlying FILE* object. With the implicit conversion operator to
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -695,6 +695,11 @@
   return 0;
 #else
   /* TO DO: We want per-task implementation of this internal control */
+  // NOTE(review): presumably required because nesting mode may change
+  // max-active-levels once the runtime is further initialized -- confirm.
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
   return __kmp_get_max_active_levels(__kmp_entry_gtid());
 #endif
 }
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -548,4 +548,9 @@
 // OMP Pause Resources
 kmp_pause_status_t __kmp_pause_status = kmp_not_paused;
 
+// Nesting mode
+int __kmp_nesting_mode = 0;
+int __kmp_nesting_mode_nlevels = 1;
+int *__kmp_nesting_nth_level;
+
 // end of file //
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -6848,6 +6848,8 @@
   __kmp_global.g.g_dynamic = FALSE;
   __kmp_global.g.g_dynamic_mode = dynamic_default;
 
+  __kmp_init_nesting_mode();
+
   __kmp_env_initialize(NULL);
 
 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
@@ -7040,6 +7042,9 @@
     __kmp_dflt_team_nth = __kmp_sys_max_nth;
   }
 
+  if (__kmp_nesting_mode > 0)
+    __kmp_set_nesting_mode_threads();
+
   // There's no harm in continuing if the following check fails,
   // but it indicates an error in the previous logic.
   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
@@ -8700,3 +8705,107 @@
 
   __kmp_hidden_helper_threads_deinitz_release();
 }
+
+/* Nesting Mode:
+   Set via KMP_NESTING_MODE, which takes an integer.
+   Note: we skip duplicate topology levels, and skip levels with only
+      one entity.
+   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
+   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
+      in the topology, and initializes the number of threads at each of those
+      levels to the number of entities at each level, respectively, below the
+      entity at the parent level.
+   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
+      but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
+      the user to turn nesting on explicitly. This is an even more experimental
+      option to this experimental feature, and may change or go away in the
+      future.
+*/
+
+// Allocate space to store nesting levels: a zero-filled array of KMP_HW_LAST
+// per-level thread counts; also grows __kmp_nested_nth.nth to at least that
+// many entries. Allocation failure is fatal.
+// NOTE(review): nothing visible in this change frees __kmp_nesting_nth_level;
+// confirm that runtime shutdown / re-initialization does not leak it.
+void __kmp_init_nesting_mode() {
+  int levels = KMP_HW_LAST;
+  __kmp_nesting_mode_nlevels = levels;
+  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
+  if (__kmp_nesting_nth_level == NULL) {
+    KMP_FATAL(MemoryAllocFailed);
+  }
+  for (int i = 0; i < levels; ++i)
+    __kmp_nesting_nth_level[i] = 0;
+  if (__kmp_nested_nth.size < levels) {
+    __kmp_nested_nth.nth =
+        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
+    if (__kmp_nested_nth.nth == NULL) {
+      KMP_FATAL(MemoryAllocFailed);
+    }
+    __kmp_nested_nth.size = levels;
+  }
+}
+
+// Set # threads for top levels of nesting; must be called after topology set.
+// Fills __kmp_nesting_nth_level from the topology ratios (or from
+// __kmp_avail_proc when there is no topology), copies it into
+// __kmp_nested_nth, sets nproc of the thread given by __kmp_entry_gtid() to
+// the outermost level's count and, for KMP_NESTING_MODE=1 only, sets
+// max-active-levels.
+void __kmp_set_nesting_mode_threads() {
+  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
+
+  if (__kmp_nesting_mode == 1)
+    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+  else if (__kmp_nesting_mode > 1)
+    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
+
+  if (__kmp_topology) { // use topology info
+    int loc, hw_level;
+    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
+                                loc < __kmp_nesting_mode_nlevels;
+         loc++, hw_level++) {
+      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
+      // A level with a single entity adds no parallelism: step loc back so
+      // the next hardware level overwrites this slot.
+      if (__kmp_nesting_nth_level[loc] == 1)
+        loc--;
+    }
+    // Make sure all cores are used
+    if (__kmp_nesting_mode > 1 && loc > 1) {
+      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
+      int num_cores = __kmp_topology->get_count(core_level);
+      int upper_levels = 1;
+      for (int level = 0; level < loc - 1; ++level)
+        upper_levels *= __kmp_nesting_nth_level[level];
+      // Widen the innermost level so the product over all levels reaches the
+      // core count. Divide by the product of ALL outer levels, not only the
+      // immediate parent; otherwise with three or more levels the product
+      // can exceed the core count.
+      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
+        __kmp_nesting_nth_level[loc - 1] = num_cores / upper_levels;
+    }
+    __kmp_nesting_mode_nlevels = loc;
+    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
+  } else { // no topology info available; provide a reasonable guesstimation
+    if (__kmp_avail_proc >= 4) {
+      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
+      __kmp_nesting_nth_level[1] = 2;
+      __kmp_nesting_mode_nlevels = 2;
+    } else {
+      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
+      __kmp_nesting_mode_nlevels = 1;
+    }
+    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
+  }
+  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
+    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
+  }
+  set__nproc(thread, __kmp_nesting_nth_level[0]);
+  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
+    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
+  if (get__max_active_levels(thread) > 1) {
+    // if max levels was set, set nesting mode levels to same
+    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
+  }
+  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
+    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
+}
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -1015,6 +1015,32 @@
   __kmp_stg_print_bool(buffer, name, __kmp_generate_warnings);
 } // __kmp_stg_print_warnings
 
+// -----------------------------------------------------------------------------
+// KMP_NESTING_MODE
+
+// Parse KMP_NESTING_MODE as an integer in [0, INT_MAX]. When built with
+// KMP_AFFINITY_SUPPORTED and KMP_USE_HWLOC, a positive value also sets
+// __kmp_affinity_top_method to affinity_top_method_hwloc.
+static void __kmp_stg_parse_nesting_mode(char const *name, char const *value,
+                                         void *data) {
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_nesting_mode);
+#if KMP_AFFINITY_SUPPORTED && KMP_USE_HWLOC
+  if (__kmp_nesting_mode > 0)
+    __kmp_affinity_top_method = affinity_top_method_hwloc;
+#endif
+} // __kmp_stg_parse_nesting_mode
+
+// Print the KMP_NESTING_MODE setting name followed by its integer value.
+static void __kmp_stg_print_nesting_mode(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME;
+  } else {
+    __kmp_str_buf_print(buffer, " %s", name);
+  }
+  __kmp_str_buf_print(buffer, "=%d\n", __kmp_nesting_mode);
+} // __kmp_stg_print_nesting_mode
+
 // -----------------------------------------------------------------------------
 // OMP_NESTED, OMP_NUM_THREADS
 
@@ -5106,6 +5132,8 @@
     {"KMP_WARNINGS", __kmp_stg_parse_warnings, __kmp_stg_print_warnings, NULL,
      0, 0},
 
+    {"KMP_NESTING_MODE", __kmp_stg_parse_nesting_mode,
+     __kmp_stg_print_nesting_mode, NULL, 0, 0},
     {"OMP_NESTED", __kmp_stg_parse_nested, __kmp_stg_print_nested, NULL, 0, 0},
     {"OMP_NUM_THREADS", __kmp_stg_parse_num_threads,
      __kmp_stg_print_num_threads, NULL, 0, 0},