diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -124,6 +124,9 @@ ProcGroups "processor groups" Unknown "unknown" NoLeaf31Support "cpuid leaf 31 not supported" +HwlocFailed "Hwloc api failure" +LLCache "LL cache" +LLCaches "LL caches" @@ -355,6 +358,7 @@ "This issue is fixed in an up-to-date compiler." OmpNoAllocator "Allocator %1$s is not available, will use default allocator." TopologyGeneric "%1$s: %2$s (%3$d total cores)" +AffGranularityBad "%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead." # --- OpenMP errors detected at runtime --- # @@ -458,6 +462,11 @@ UserDirectedError "%1$s: Encountered user-directed error: %2$s." UserDirectedWarning "%1$s: Encountered user-directed warning: %2$s." FailedToCreateTeam "Failed to create teams between lower bound (%1$d) and upper bound (%2$d)." +AffHWSubsetManyGeneric "KMP_HW_SUBSET ignored: %1$s: too many requested." +AffHWSubsetNotExistGeneric "KMP_HW_SUBSET ignored: %1$s: level not detected in machine topology." +AffHWSubsetEqvLayers "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one." +AffHWSubsetOutOfOrder "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s." +AffEqualTopologyTypes "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\"." # -------------------------------------------------------------------------------------------------- -*- HINTS -*- diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -597,11 +597,11 @@ enum kmp_hw_t : int { KMP_HW_UNKNOWN = -1, - KMP_HW_MACHINE = 0, - KMP_HW_SOCKET, + KMP_HW_SOCKET = 0, KMP_HW_PROC_GROUP, KMP_HW_NUMA, KMP_HW_DIE, + KMP_HW_LLC, KMP_HW_L3, KMP_HW_TILE, KMP_HW_MODULE, @@ -612,13 +612,16 @@ KMP_HW_LAST }; -#define KMP_ASSERT_VALID_HW_TYPE(type) \ +#define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \ KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) +#define KMP_ASSERT_VALID_HW_TYPE(type) \ + KMP_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) #define KMP_FOREACH_HW_TYPE(type) \ for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST; \ type = (kmp_hw_t)((int)type + 1)) +const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false); const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false); /* Only Linux* OS and Windows* OS support thread affinity. */ @@ -655,8 +658,6 @@ #if KMP_USE_HWLOC extern hwloc_topology_t __kmp_hwloc_topology; extern int __kmp_hwloc_error; -extern int __kmp_numa_detected; -extern int __kmp_tile_depth; #endif extern size_t __kmp_affin_mask_size; @@ -784,23 +785,6 @@ affinity_default }; -enum affinity_gran { - affinity_gran_fine = 0, - affinity_gran_thread, - affinity_gran_core, - affinity_gran_tile, - affinity_gran_die, - affinity_gran_numa, - affinity_gran_package, - affinity_gran_node, -#if KMP_GROUP_AFFINITY - // The "group" granularity isn't necesssarily coarser than all of the - // other levels, but we put it last in the enum. 
- affinity_gran_group, -#endif /* KMP_GROUP_AFFINITY */ - affinity_gran_default -}; - enum affinity_top_method { affinity_top_method_all = 0, // try all (supported) methods, in order #if KMP_ARCH_X86 || KMP_ARCH_X86_64 @@ -822,7 +806,7 @@ #define affinity_respect_mask_default (-1) extern enum affinity_type __kmp_affinity_type; /* Affinity type */ -extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */ +extern kmp_hw_t __kmp_affinity_gran; /* Affinity granularity */ extern int __kmp_affinity_gran_levels; /* corresponding int value */ extern int __kmp_affinity_dups; /* Affinity duplicate masks */ extern enum affinity_top_method __kmp_affinity_top_method; diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -598,91 +598,274 @@ #endif /* KMP_OS_WINDOWS */ #endif /* KMP_AFFINITY_SUPPORTED */ -class Address { +class kmp_hw_thread_t { public: - static const unsigned maxDepth = 32; - unsigned labels[maxDepth]; - unsigned childNums[maxDepth]; - unsigned depth; - unsigned leader; - Address(unsigned _depth) : depth(_depth), leader(FALSE) {} - Address &operator=(const Address &b) { - depth = b.depth; - for (unsigned i = 0; i < depth; i++) { - labels[i] = b.labels[i]; - childNums[i] = b.childNums[i]; - } - leader = FALSE; - return *this; - } - bool operator==(const Address &b) const { - if (depth != b.depth) - return false; - for (unsigned i = 0; i < depth; i++) - if (labels[i] != b.labels[i]) - return false; - return true; - } - bool isClose(const Address &b, int level) const { - if (depth != b.depth) - return false; - if ((unsigned)level >= depth) - return true; - for (unsigned i = 0; i < (depth - level); i++) - if (labels[i] != b.labels[i]) - return false; - return true; - } - bool operator!=(const Address &b) const { return !operator==(b); } - void print() const { - unsigned i; - printf("Depth: %u --- ", depth); - for (i = 0; i < depth; i++) { - printf("%u ", labels[i]); - } + static const int UNKNOWN_ID = -1; + static int compare_ids(const void *a, const void *b); + static int compare_compact(const void *a, const void *b); + int ids[KMP_HW_LAST]; + int sub_ids[KMP_HW_LAST]; + bool leader; + int os_id; + void print() const; + void clear() { + for (int i = 0; i < (int)KMP_HW_LAST; ++i) + ids[i] = UNKNOWN_ID; + leader = false; } }; -class AddrUnsPair { +class kmp_topology_t { + + struct flags_t { + int uniform : 1; + int reserved : 31; + }; + + int depth; + + // The following arrays are all 'depth' long + + // Orderd array of the types in the topology + kmp_hw_t *types; + + // Keep quick topology ratios, for non-uniform topologies, + // this ratio holds the max number of itemAs per itemB + // e.g., [ 4 packages | 6 cores / package | 2 threads / core ] + int *ratio; + + // Storage containing the absolute number of each topology layer + int *count; + + // The hardware threads array + // hw_threads is num_hw_threads long + // Each hw_thread's ids and sub_ids are depth deep + int num_hw_threads; + kmp_hw_thread_t *hw_threads; + + // Equivalence hash where the key is the hardware topology item + // and the value is the equivalent hardware topology type in the + // types[] array, if the value is KMP_HW_UNKNOWN, then there is no + // known equivalence for the topology type + kmp_hw_t equivalent[KMP_HW_LAST]; + + // Flags describing the topology + flags_t flags; + + // Count each item & get the num x's per y + // e.g., get the number of cores and the number of threads per core + 
// for each (x, y) in (KMP_HW_* , KMP_HW_*) + void _gather_enumeration_information(); + + // Remove layers that don't add information to the topology. + // This is done by having the layer take on the id = UNKNOWN_ID (-1) + void _remove_radix1_layers(); + + // Find out if the topology is uniform + void _discover_uniformity(); + + // Set all the sub_ids for each hardware thread + void _set_sub_ids(); + + // Set global affinity variables describing the number of threads per + // core, the number of packages, the number of cores per package, and + // the number of cores. + void _set_globals(); + + // Set the last level cache equivalent type + void _set_last_level_cache(); + public: - Address first; - unsigned second; - AddrUnsPair(Address _first, unsigned _second) - : first(_first), second(_second) {} - AddrUnsPair &operator=(const AddrUnsPair &b) { - first = b.first; - second = b.second; - return *this; - } - void print() const { - printf("first = "); - first.print(); - printf(" --- second = %u", second); - } - bool operator==(const AddrUnsPair &b) const { - if (first != b.first) - return false; - if (second != b.second) - return false; - return true; - } - bool operator!=(const AddrUnsPair &b) const { return !operator==(b); } -}; + // Force use of allocate()/deallocate() + kmp_topology_t() = delete; + kmp_topology_t(const kmp_topology_t &t) = delete; + kmp_topology_t(kmp_topology_t &&t) = delete; + kmp_topology_t &operator=(const kmp_topology_t &t) = delete; + kmp_topology_t &operator=(kmp_topology_t &&t) = delete; + + static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types); + static void deallocate(kmp_topology_t *); + + // Functions used in create_map() routines + kmp_hw_thread_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + const kmp_hw_thread_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + int get_num_hw_threads() const { return num_hw_threads; } + void sort_ids() { + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_ids); + } + // Check if the hardware ids are unique, if they are + // return true, otherwise return false + bool check_ids() const; -static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) { - const Address *aa = &(((const AddrUnsPair *)a)->first); - const Address *bb = &(((const AddrUnsPair *)b)->first); - unsigned depth = aa->depth; - unsigned i; - KMP_DEBUG_ASSERT(depth == bb->depth); - for (i = 0; i < depth; i++) { - if (aa->labels[i] < bb->labels[i]) + // Function to call after the create_map() routine + void canonicalize(); + void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores); + + // Functions used after canonicalize() called + bool filter_hw_subset(); + bool is_close(int hwt1, int hwt2, int level) const; + bool is_uniform() const { return flags.uniform; } + // Tell whether a type is a valid type in the topology + // returns KMP_HW_UNKNOWN when there is no equivalent type + kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; } + // Set type1 = type2 + void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1); + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2); + kmp_hw_t real_type2 = equivalent[type2]; + if (real_type2 == KMP_HW_UNKNOWN) + real_type2 = type2; + equivalent[type1] = real_type2; + // This loop is required since any of the types may have been set to + // be equivalent to 
type1. They all must be checked and reset to type2. + KMP_FOREACH_HW_TYPE(type) { + if (equivalent[type] == type1) { + equivalent[type] = real_type2; + } + } + } + // Calculate number of types corresponding to level1 + // per types corresponding to level2 (e.g., number of threads per core) + int calculate_ratio(int level1, int level2) const { + KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth); + KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth); + int r = 1; + for (int level = level1; level > level2; --level) + r *= ratio[level]; + return r; + } + int get_ratio(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return ratio[level]; + } + int get_depth() const { return depth; }; + kmp_hw_t get_type(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return types[level]; + } + int get_level(kmp_hw_t type) const { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type); + int eq_type = equivalent[type]; + if (eq_type == KMP_HW_UNKNOWN) return -1; - if (aa->labels[i] > bb->labels[i]) - return 1; + for (int i = 0; i < depth; ++i) + if (types[i] == eq_type) + return i; + return -1; + } + int get_count(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return count[level]; + } +#if KMP_AFFINITY_SUPPORTED + void sort_compact() { + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_compact); + } +#endif + void print(const char *env_var = "KMP_AFFINITY") const; + void dump() const; +}; + +class kmp_hw_subset_t { +public: + struct item_t { + int num; + kmp_hw_t type; + int offset; + }; + +private: + int depth; + int capacity; + item_t *items; + kmp_uint64 set; + bool absolute; + // The set must be able to handle up to KMP_HW_LAST number of layers + KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); + +public: + // Force use of allocate()/deallocate() + kmp_hw_subset_t() = delete; + kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete; + kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete; + + static kmp_hw_subset_t *allocate() { + int initial_capacity = 5; + kmp_hw_subset_t *retval = + (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t)); + retval->depth = 0; + retval->capacity = initial_capacity; + retval->set = 0ull; + retval->absolute = false; + retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity); + return retval; + } + static void deallocate(kmp_hw_subset_t *subset) { + __kmp_free(subset->items); + __kmp_free(subset); + } + void set_absolute() { absolute = true; } + bool is_absolute() const { return absolute; } + void push_back(int num, kmp_hw_t type, int offset) { + if (depth == capacity - 1) { + capacity *= 2; + item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); + for (int i = 0; i < depth; ++i) + new_items[i] = items[i]; + __kmp_free(items); + items = new_items; + } + items[depth].num = num; + items[depth].type = type; + items[depth].offset = offset; + depth++; + set |= (1ull << type); + } + int get_depth() const { return depth; } + const item_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; + } + item_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; } - return 0; -} + void remove(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + set &= ~(1ull << items[index].type); + for (int j = index + 1; j < depth; ++j) { + items[j - 1] = items[j]; + } + depth--; + } + 
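A minimal usage sketch of this new container, assuming a setting such as KMP_HW_SUBSET=2s,4c,2t; the string parsing itself lives elsewhere in the runtime's settings code and is not shown in this patch, so the literal values below are only illustrative of how the parser is expected to drive this interface.

  kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
  subset->push_back(/*num=*/2, KMP_HW_SOCKET, /*offset=*/0); // "2s"
  subset->push_back(/*num=*/4, KMP_HW_CORE,   /*offset=*/0); // "4c"
  subset->push_back(/*num=*/2, KMP_HW_THREAD, /*offset=*/0); // "2t"
  KMP_DEBUG_ASSERT(subset->specified(KMP_HW_CORE)); // layer recorded in the 'set' bitmask
  subset->dump();                                   // debug print of depth, items, set, absolute
  kmp_hw_subset_t::deallocate(subset);

push_back() grows the items array by doubling 'capacity' as entries are added, so callers never need to size the container up front.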
bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } + void dump() const { + printf("**********************\n"); + printf("*** kmp_hw_subset: ***\n"); + printf("* depth: %d\n", depth); + printf("* items:\n"); + for (int i = 0; i < depth; ++i) { + printf("num: %d, type: %s, offset: %d\n", items[i].num, + __kmp_hw_get_keyword(items[i].type), items[i].offset); + } + printf("* set: 0x%llx\n", set); + printf("* absolute: %d\n", absolute); + printf("**********************\n"); + } +}; + +extern kmp_topology_t *__kmp_topology; +extern kmp_hw_subset_t *__kmp_hw_subset; /* A structure for holding machine-specific hierarchy info to be computed once at init. This structure represents a mapping of threads to the actual machine @@ -721,18 +904,10 @@ kmp_uint32 *numPerLevel; kmp_uint32 *skipPerLevel; - void deriveLevels(AddrUnsPair *adr2os, int num_addrs) { - int hier_depth = adr2os[0].first.depth; - int level = 0; - for (int i = hier_depth - 1; i >= 0; --i) { - int max = -1; - for (int j = 0; j < num_addrs; ++j) { - int next = adr2os[j].first.childNums[i]; - if (next > max) - max = next; - } - numPerLevel[level] = max + 1; - ++level; + void deriveLevels() { + int hier_depth = __kmp_topology->get_depth(); + for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) { + numPerLevel[level] = __kmp_topology->get_ratio(i); } } @@ -747,7 +922,7 @@ } } - void init(AddrUnsPair *adr2os, int num_addrs) { + void init(int num_addrs) { kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8( &uninitialized, not_initialized, initializing); if (bool_result == 0) { // Wait for initialization @@ -774,10 +949,8 @@ } // Sort table by physical ID - if (adr2os) { - qsort(adr2os, num_addrs, sizeof(*adr2os), - __kmp_affinity_cmp_Address_labels); - deriveLevels(adr2os, num_addrs); + if (__kmp_topology && __kmp_topology->get_depth() > 0) { + deriveLevels(); } else { numPerLevel[0] = maxLeaves; numPerLevel[1] = num_addrs / maxLeaves; diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -21,9 +21,17 @@ #endif #if KMP_USE_HWLOC // Copied from hwloc +#define HWLOC_GROUP_KIND_INTEL_MODULE 102 +#define HWLOC_GROUP_KIND_INTEL_TILE 103 #define HWLOC_GROUP_KIND_INTEL_DIE 104 +#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 #endif +// The machine topology +kmp_topology_t *__kmp_topology = nullptr; +// KMP_HW_SUBSET environment variable +kmp_hw_subset_t *__kmp_hw_subset = nullptr; + // Store the real or imagined machine hierarchy here static hierarchy_info machine_hierarchy; @@ -34,7 +42,7 @@ // The test below is true if affinity is available, but set to "none". Need to // init on first use of hierarchical barrier. if (TCR_1(machine_hierarchy.uninitialized)) - machine_hierarchy.init(NULL, nproc); + machine_hierarchy.init(nproc); // Adjust the hierarchy in case num threads exceeds original if (nproc > machine_hierarchy.base_num_threads) @@ -49,7 +57,11 @@ thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; } -#if KMP_AFFINITY_SUPPORTED +static int nCoresPerPkg, nPackages; +static int __kmp_nThreadsPerCore; +#ifndef KMP_DFLT_NTH_CORES +static int __kmp_ncores; +#endif const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { switch (type) { @@ -69,6 +81,8 @@ return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache)); case KMP_HW_L1: return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache)); + case KMP_HW_LLC: + return ((plural) ? 
KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache)); case KMP_HW_CORE: return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core)); case KMP_HW_THREAD: @@ -79,13 +93,96 @@ return KMP_I18N_STR(Unknown); } -// This function removes the topology levels that are radix 1 and don't offer -// further information about the topology. The most common example is when you -// have one thread context per core, we don't want the extra thread context -// level if it offers no unique labels. So they are removed. -// return value: the new depth of address2os -static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh, - int depth, kmp_hw_t *types) { +const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { + switch (type) { + case KMP_HW_SOCKET: + return ((plural) ? "sockets" : "socket"); + case KMP_HW_DIE: + return ((plural) ? "dice" : "die"); + case KMP_HW_MODULE: + return ((plural) ? "modules" : "module"); + case KMP_HW_TILE: + return ((plural) ? "tiles" : "tile"); + case KMP_HW_NUMA: + return ((plural) ? "numa_domains" : "numa_domain"); + case KMP_HW_L3: + return ((plural) ? "l3_caches" : "l3_cache"); + case KMP_HW_L2: + return ((plural) ? "l2_caches" : "l2_cache"); + case KMP_HW_L1: + return ((plural) ? "l1_caches" : "l1_cache"); + case KMP_HW_LLC: + return ((plural) ? "ll_caches" : "ll_cache"); + case KMP_HW_CORE: + return ((plural) ? "cores" : "core"); + case KMP_HW_THREAD: + return ((plural) ? "threads" : "thread"); + case KMP_HW_PROC_GROUP: + return ((plural) ? "proc_groups" : "proc_group"); + } + return ((plural) ? "unknowns" : "unknown"); +} + +//////////////////////////////////////////////////////////////////////////////// +// kmp_hw_thread_t methods +int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { + const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a; + const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b; + int depth = __kmp_topology->get_depth(); + for (int level = 0; level < depth; ++level) { + if (ahwthread->ids[level] < bhwthread->ids[level]) + return -1; + else if (ahwthread->ids[level] > bhwthread->ids[level]) + return 1; + } + if (ahwthread->os_id < bhwthread->os_id) + return -1; + else if (ahwthread->os_id > bhwthread->os_id) + return 1; + return 0; +} + +#if KMP_AFFINITY_SUPPORTED +int kmp_hw_thread_t::compare_compact(const void *a, const void *b) { + int i; + const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a; + const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b; + int depth = __kmp_topology->get_depth(); + KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); + KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth); + for (i = 0; i < __kmp_affinity_compact; i++) { + int j = depth - i - 1; + if (aa->sub_ids[j] < bb->sub_ids[j]) + return -1; + if (aa->sub_ids[j] > bb->sub_ids[j]) + return 1; + } + for (; i < depth; i++) { + int j = i - __kmp_affinity_compact; + if (aa->sub_ids[j] < bb->sub_ids[j]) + return -1; + if (aa->sub_ids[j] > bb->sub_ids[j]) + return 1; + } + return 0; +} +#endif + +void kmp_hw_thread_t::print() const { + int depth = __kmp_topology->get_depth(); + printf("%4d ", os_id); + for (int i = 0; i < depth; ++i) { + printf("%4d ", ids[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// kmp_topology_t methods + +// Remove layers that don't add information to the topology. 
+// This is done by having the layer take on the id = UNKNOWN_ID (-1) +void kmp_topology_t::_remove_radix1_layers() { int preference[KMP_HW_LAST]; int top_index1, top_index2; // Set up preference associative array @@ -93,40 +190,46 @@ preference[KMP_HW_SOCKET] = 100; preference[KMP_HW_CORE] = 95; preference[KMP_HW_THREAD] = 90; - preference[KMP_HW_DIE] = 85; - preference[KMP_HW_NUMA] = 80; + preference[KMP_HW_NUMA] = 85; + preference[KMP_HW_DIE] = 80; preference[KMP_HW_TILE] = 75; preference[KMP_HW_MODULE] = 73; preference[KMP_HW_L3] = 70; preference[KMP_HW_L2] = 65; preference[KMP_HW_L1] = 60; + preference[KMP_HW_LLC] = 5; top_index1 = 0; top_index2 = 1; while (top_index1 < depth - 1 && top_index2 < depth) { - KMP_DEBUG_ASSERT(top_index1 >= 0 && top_index1 < depth); - KMP_DEBUG_ASSERT(top_index2 >= 0 && top_index2 < depth); kmp_hw_t type1 = types[top_index1]; kmp_hw_t type2 = types[top_index2]; - if (type1 == KMP_HW_SOCKET && type2 == KMP_HW_CORE) { + KMP_ASSERT_VALID_HW_TYPE(type1); + KMP_ASSERT_VALID_HW_TYPE(type2); + // Do not allow the three main topology levels (sockets, cores, threads) to + // be compacted down + if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE || + type1 == KMP_HW_SOCKET) && + (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE || + type2 == KMP_HW_SOCKET)) { top_index1 = top_index2++; continue; } bool radix1 = true; bool all_same = true; - unsigned id1 = addrP[0].first.labels[top_index1]; - unsigned id2 = addrP[0].first.labels[top_index2]; + int id1 = hw_threads[0].ids[top_index1]; + int id2 = hw_threads[0].ids[top_index2]; int pref1 = preference[type1]; int pref2 = preference[type2]; - for (int hwidx = 1; hwidx < nTh; ++hwidx) { - if (addrP[hwidx].first.labels[top_index1] == id1 && - addrP[hwidx].first.labels[top_index2] != id2) { + for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) { + if (hw_threads[hwidx].ids[top_index1] == id1 && + hw_threads[hwidx].ids[top_index2] != id2) { radix1 = false; break; } - if (addrP[hwidx].first.labels[top_index2] != id2) + if (hw_threads[hwidx].ids[top_index2] != id2) all_same = false; - id1 = addrP[hwidx].first.labels[top_index1]; - id2 = addrP[hwidx].first.labels[top_index2]; + id1 = hw_threads[hwidx].ids[top_index1]; + id2 = hw_threads[hwidx].ids[top_index2]; } if (radix1) { // Select the layer to remove based on preference @@ -147,11 +250,11 @@ remove_layer_ids = top_index2; // Remove radix one type by setting the equivalence, removing the id from // the hw threads and removing the layer from types and depth - for (int idx = 0; idx < nTh; ++idx) { - Address &hw_thread = addrP[idx].first; + set_equivalent_type(remove_type, keep_type); + for (int idx = 0; idx < num_hw_threads; ++idx) { + kmp_hw_thread_t &hw_thread = hw_threads[idx]; for (int d = remove_layer_ids; d < depth - 1; ++d) - hw_thread.labels[d] = hw_thread.labels[d + 1]; - hw_thread.depth--; + hw_thread.ids[d] = hw_thread.ids[d + 1]; } for (int idx = remove_layer; idx < depth - 1; ++idx) types[idx] = types[idx + 1]; @@ -161,29 +264,51 @@ } } KMP_ASSERT(depth > 0); - return depth; } + +void kmp_topology_t::_set_last_level_cache() { + if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L3); + else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); +#if KMP_MIC_SUPPORTED + else if (__kmp_mic_type == mic3) { + if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); + else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN) + 
set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE); + // L2/Tile wasn't detected so just say L1 + else + set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); + } +#endif + else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); + // Fallback is to set last level cache to socket or core + if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) { + if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET); + else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE); + } + KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN); +} + // Gather the count of each topology layer and the ratio -// ratio contains the number of types[i] / types[i+1] and so forth -// count contains the absolute number of types[i] -static void __kmp_affinity_gather_enumeration_information(AddrUnsPair *addrP, - int nTh, int depth, - kmp_hw_t *types, - int *ratio, - int *count) { +void kmp_topology_t::_gather_enumeration_information() { int previous_id[KMP_HW_LAST]; int max[KMP_HW_LAST]; for (int i = 0; i < depth; ++i) { - previous_id[i] = -1; + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; max[i] = 0; count[i] = 0; ratio[i] = 0; } - for (int i = 0; i < nTh; ++i) { - Address &hw_thread = addrP[i].first; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; for (int layer = 0; layer < depth; ++layer) { - int id = hw_thread.labels[layer]; + int id = hw_thread.ids[layer]; if (id != previous_id[layer]) { // Add an additional increment to each count for (int l = layer; l < depth; ++l) @@ -199,7 +324,7 @@ } } for (int layer = 0; layer < depth; ++layer) { - previous_id[layer] = hw_thread.labels[layer]; + previous_id[layer] = hw_thread.ids[layer]; } } for (int layer = 0; layer < depth; ++layer) { @@ -209,75 +334,484 @@ } // Find out if the topology is uniform -static bool __kmp_affinity_discover_uniformity(int depth, int *ratio, - int *count) { +void kmp_topology_t::_discover_uniformity() { int num = 1; for (int level = 0; level < depth; ++level) num *= ratio[level]; - return (num == count[depth - 1]); + flags.uniform = (num == count[depth - 1]); +} + +// Set all the sub_ids for each hardware thread +void kmp_topology_t::_set_sub_ids() { + int previous_id[KMP_HW_LAST]; + int sub_id[KMP_HW_LAST]; + + for (int i = 0; i < depth; ++i) { + previous_id[i] = -1; + sub_id[i] = -1; + } + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Setup the sub_id + for (int j = 0; j < depth; ++j) { + if (hw_thread.ids[j] != previous_id[j]) { + sub_id[j]++; + for (int k = j + 1; k < depth; ++k) { + sub_id[k] = 0; + } + break; + } + } + // Set previous_id + for (int j = 0; j < depth; ++j) { + previous_id[j] = hw_thread.ids[j]; + } + // Set the sub_ids field + for (int j = 0; j < depth; ++j) { + hw_thread.sub_ids[j] = sub_id[j]; + } + } +} + +void kmp_topology_t::_set_globals() { + // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores + int core_level, thread_level, package_level; + package_level = get_level(KMP_HW_SOCKET); +#if KMP_GROUP_AFFINITY + if (package_level == -1) + package_level = get_level(KMP_HW_PROC_GROUP); +#endif + core_level = get_level(KMP_HW_CORE); + thread_level = get_level(KMP_HW_THREAD); + + KMP_ASSERT(core_level != -1); + KMP_ASSERT(thread_level != -1); + + __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level); + if (package_level != -1) { + nCoresPerPkg = calculate_ratio(core_level, 
package_level); + nPackages = get_count(package_level); + } else { + // assume one socket + nCoresPerPkg = get_count(core_level); + nPackages = 1; + } +#ifndef KMP_DFLT_NTH_CORES + __kmp_ncores = get_count(core_level); +#endif } -// calculate the number of X's per Y -static inline int __kmp_affinity_calculate_ratio(int *ratio, int deep_level, - int shallow_level) { - int retval = 1; - if (deep_level < 0 || shallow_level < 0) - return retval; - for (int level = deep_level; level > shallow_level; --level) - retval *= ratio[level]; +kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, + const kmp_hw_t *types) { + kmp_topology_t *retval; + // Allocate all data in one large allocation + size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + + sizeof(int) * ndepth * 3; + char *bytes = (char *)__kmp_allocate(size); + retval = (kmp_topology_t *)bytes; + if (nproc > 0) { + retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t)); + } else { + retval->hw_threads = nullptr; + } + retval->num_hw_threads = nproc; + retval->depth = ndepth; + int *arr = + (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc); + retval->types = (kmp_hw_t *)arr; + retval->ratio = arr + ndepth; + retval->count = arr + 2 * ndepth; + KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } + for (int i = 0; i < ndepth; ++i) { + retval->types[i] = types[i]; + retval->equivalent[types[i]] = types[i]; + } return retval; } -static void __kmp_affinity_print_topology(AddrUnsPair *addrP, int len, - int depth, kmp_hw_t *types) { - int proc; +void kmp_topology_t::deallocate(kmp_topology_t *topology) { + if (topology) + __kmp_free(topology); +} + +bool kmp_topology_t::check_ids() const { + // Assume ids have been sorted + if (num_hw_threads == 0) + return true; + for (int i = 1; i < num_hw_threads; ++i) { + kmp_hw_thread_t ¤t_thread = hw_threads[i]; + kmp_hw_thread_t &previous_thread = hw_threads[i - 1]; + bool unique = false; + for (int j = 0; j < depth; ++j) { + if (previous_thread.ids[j] != current_thread.ids[j]) { + unique = true; + break; + } + } + if (unique) + continue; + return false; + } + return true; +} + +void kmp_topology_t::dump() const { + printf("***********************\n"); + printf("*** __kmp_topology: ***\n"); + printf("***********************\n"); + printf("* depth: %d\n", depth); + + printf("* types: "); + for (int i = 0; i < depth; ++i) + printf("%15s ", __kmp_hw_get_keyword(types[i])); + printf("\n"); + + printf("* ratio: "); + for (int i = 0; i < depth; ++i) { + printf("%15d ", ratio[i]); + } + printf("\n"); + + printf("* count: "); + for (int i = 0; i < depth; ++i) { + printf("%15d ", count[i]); + } + printf("\n"); + + printf("* equivalent map:\n"); + KMP_FOREACH_HW_TYPE(i) { + const char *key = __kmp_hw_get_keyword(i); + const char *value = __kmp_hw_get_keyword(equivalent[i]); + printf("%-15s -> %-15s\n", key, value); + } + + printf("* uniform: %s\n", (is_uniform() ? 
"Yes" : "No")); + + printf("* num_hw_threads: %d\n", num_hw_threads); + printf("* hw_threads:\n"); + for (int i = 0; i < num_hw_threads; ++i) { + hw_threads[i].print(); + } + printf("***********************\n"); +} + +void kmp_topology_t::print(const char *env_var) const { kmp_str_buf_t buf; + int print_types_depth; __kmp_str_buf_init(&buf); - KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); - for (proc = 0; proc < len; proc++) { - for (int i = 0; i < depth; ++i) { - __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]), - addrP[proc].first.labels[i]); + kmp_hw_t print_types[KMP_HW_LAST + 2]; + + // Num Available Threads + KMP_INFORM(AvailableOSProc, env_var, num_hw_threads); + + // Uniform or not + if (is_uniform()) { + KMP_INFORM(Uniform, env_var); + } else { + KMP_INFORM(NonUniform, env_var); + } + + // Equivalent types + KMP_FOREACH_HW_TYPE(type) { + kmp_hw_t eq_type = equivalent[type]; + if (eq_type != KMP_HW_UNKNOWN && eq_type != type) { + KMP_INFORM(AffEqualTopologyTypes, env_var, + __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(eq_type)); + } + } + + // Quick topology + KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST); + // Create a print types array that always guarantees printing + // the core and thread level + print_types_depth = 0; + for (int level = 0; level < depth; ++level) + print_types[print_types_depth++] = types[level]; + if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) { + // Force in the core level for quick topology + if (print_types[print_types_depth - 1] == KMP_HW_THREAD) { + // Force core before thread e.g., 1 socket X 2 threads/socket + // becomes 1 socket X 1 core/socket X 2 threads/socket + print_types[print_types_depth - 1] = KMP_HW_CORE; + print_types[print_types_depth++] = KMP_HW_THREAD; + } else { + print_types[print_types_depth++] = KMP_HW_CORE; + } + } + // Always put threads at very end of quick topology + if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD) + print_types[print_types_depth++] = KMP_HW_THREAD; + + __kmp_str_buf_clear(&buf); + kmp_hw_t numerator_type; + kmp_hw_t denominator_type = KMP_HW_UNKNOWN; + int core_level = get_level(KMP_HW_CORE); + int ncores = get_count(core_level); + + for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) { + int c; + bool plural; + numerator_type = print_types[plevel]; + KMP_ASSERT_VALID_HW_TYPE(numerator_type); + if (equivalent[numerator_type] != numerator_type) + c = 1; + else + c = get_ratio(level++); + plural = (c > 1); + if (plevel == 0) { + __kmp_str_buf_print(&buf, "%d %s", c, + __kmp_hw_get_catalog_string(numerator_type, plural)); + } else { + __kmp_str_buf_print(&buf, " x %d %s/%s", c, + __kmp_hw_get_catalog_string(numerator_type, plural), + __kmp_hw_get_catalog_string(denominator_type)); } - KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str); + denominator_type = numerator_type; + } + KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + + if (num_hw_threads <= 0) { + __kmp_str_buf_free(&buf); + return; + } + + // Full OS proc to hardware thread map + KMP_INFORM(OSProcToPhysicalThreadMap, env_var); + for (int i = 0; i < num_hw_threads; i++) { __kmp_str_buf_clear(&buf); + for (int level = 0; level < depth; ++level) { + kmp_hw_t type = types[level]; + __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); + __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); + } + KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); } + __kmp_str_buf_free(&buf); } -// Print out the detailed machine topology 
map, i.e. the physical locations -// of each OS proc. -static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, - int depth, int pkgLevel, - int coreLevel, int threadLevel) { - int proc; - - KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); - for (proc = 0; proc < len; proc++) { - int level; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - for (level = 0; level < depth; level++) { - if (level == threadLevel) { - __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread)); - } else if (level == coreLevel) { - __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core)); - } else if (level == pkgLevel) { - __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package)); - } else if (level > pkgLevel) { - __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node), - level - pkgLevel - 1); - } else { - __kmp_str_buf_print(&buf, "L%d ", level); +void kmp_topology_t::canonicalize() { + _remove_radix1_layers(); + _gather_enumeration_information(); + _discover_uniformity(); + _set_sub_ids(); + _set_globals(); + _set_last_level_cache(); + + // Perform post canonicalization checking + KMP_ASSERT(depth > 0); + for (int level = 0; level < depth; ++level) { + // All counts, ratios, and types must be valid + KMP_ASSERT(count[level] > 0 && ratio[level] > 0); + KMP_ASSERT_VALID_HW_TYPE(types[level]); + // Detected types must point to themselves + KMP_ASSERT(equivalent[types[level]] == types[level]); + } + +#if KMP_AFFINITY_SUPPORTED + // Set the number of affinity granularity levels + if (__kmp_affinity_gran_levels < 0) { + kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran); + // Check if user's granularity request is valid + if (gran_type == KMP_HW_UNKNOWN) { + // First try core, then thread, then package + kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET}; + for (auto g : gran_types) { + if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) { + gran_type = g; + break; + } } - __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]); + KMP_ASSERT(gran_type != KMP_HW_UNKNOWN); + // Warn user what granularity setting will be used instead + KMP_WARNING(AffGranularityBad, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran), + __kmp_hw_get_catalog_string(gran_type)); + __kmp_affinity_gran = gran_type; } - KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second, - buf.str); - __kmp_str_buf_free(&buf); + __kmp_affinity_gran_levels = 0; + for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) + __kmp_affinity_gran_levels++; + } +#endif // KMP_AFFINITY_SUPPORTED +} + +// Canonicalize an explicit packages X cores/pkg X threads/core topology +void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, + int nthreads_per_core, int ncores) { + int ndepth = 3; + depth = ndepth; + KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; } + for (int level = 0; level < depth; ++level) { + count[level] = 0; + ratio[level] = 0; + } + count[0] = npackages; + count[1] = ncores; + count[2] = __kmp_xproc; + ratio[0] = npackages; + ratio[1] = ncores_per_pkg; + ratio[2] = nthreads_per_core; + equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET; + equivalent[KMP_HW_CORE] = KMP_HW_CORE; + equivalent[KMP_HW_THREAD] = KMP_HW_THREAD; + types[0] = KMP_HW_SOCKET; + types[1] = KMP_HW_CORE; + types[2] = KMP_HW_THREAD; + //__kmp_avail_proc = __kmp_xproc; + _discover_uniformity(); +} + +// Apply the KMP_HW_SUBSET envirable to the topology +// Returns true if KMP_HW_SUBSET filtered any processors +// otherwise, returns false +bool 
kmp_topology_t::filter_hw_subset() { + // If KMP_HW_SUBSET wasn't requested, then do nothing. + if (!__kmp_hw_subset) + return false; + + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + int hw_subset_depth = __kmp_hw_subset->get_depth(); + kmp_hw_t specified[KMP_HW_LAST]; + KMP_ASSERT(hw_subset_depth > 0); + KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + for (int i = 0; i < hw_subset_depth; ++i) { + int max_count; + int num = __kmp_hw_subset->at(i).num; + int offset = __kmp_hw_subset->at(i).offset; + kmp_hw_t type = __kmp_hw_subset->at(i).type; + kmp_hw_t equivalent_type = equivalent[type]; + int level = get_level(type); + + // Check to see if current layer is in detected machine topology + if (equivalent_type != KMP_HW_UNKNOWN) { + __kmp_hw_subset->at(i).type = equivalent_type; + } else { + KMP_WARNING(AffHWSubsetNotExistGeneric, + __kmp_hw_get_catalog_string(type)); + return false; + } + + // Check to see if current layer has already been specified + // either directly or through an equivalent type + if (specified[equivalent_type] != KMP_HW_UNKNOWN) { + KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(specified[equivalent_type])); + return false; + } + specified[equivalent_type] = type; + + // Check to see if layers are in order + if (i + 1 < hw_subset_depth) { + kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type); + if (next_type == KMP_HW_UNKNOWN) { + KMP_WARNING( + AffHWSubsetNotExistGeneric, + __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type)); + return false; + } + int next_topology_level = get_level(next_type); + if (level > next_topology_level) { + KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(next_type)); + return false; + } + } + + // Check to see if each layer's num & offset parameters are valid + max_count = get_ratio(level); + if (max_count < 0 || num + offset > max_count) { + bool plural = (num > 1); + KMP_WARNING(AffHWSubsetManyGeneric, + __kmp_hw_get_catalog_string(type, plural)); + return false; + } + } + + // Apply the filtered hardware subset + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Check to see if this hardware thread should be filtered + bool should_be_filtered = false; + for (int level = 0, hw_subset_index = 0; + level < depth && hw_subset_index < hw_subset_depth; ++level) { + kmp_hw_t topology_type = types[level]; + auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + kmp_hw_t hw_subset_type = hw_subset_item.type; + if (topology_type != hw_subset_type) + continue; + int num = hw_subset_item.num; + int offset = hw_subset_item.offset; + hw_subset_index++; + if (hw_thread.sub_ids[level] < offset || + hw_thread.sub_ids[level] >= offset + num) { + should_be_filtered = true; + break; + } + } + if (!should_be_filtered) { + if (i != new_index) + hw_threads[new_index] = hw_thread; + new_index++; + } else { +#if KMP_AFFINITY_SUPPORTED + KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); +#endif + __kmp_avail_proc--; + } + } + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); + num_hw_threads = new_index; + + // Post hardware subset canonicalization + _gather_enumeration_information(); + _discover_uniformity(); + _set_globals(); + _set_last_level_cache(); + return true; +} + +bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const { + if (hw_level >= depth) + return true; + bool retval = true; + const 
kmp_hw_thread_t &t1 = hw_threads[hwt1]; + const kmp_hw_thread_t &t2 = hw_threads[hwt2]; + for (int i = 0; i < (depth - hw_level); ++i) { + if (t1.ids[i] != t2.ids[i]) + return false; } + return retval; } +//////////////////////////////////////////////////////////////////////////////// + +#if KMP_AFFINITY_SUPPORTED +class kmp_affinity_raii_t { + kmp_affin_mask_t *mask; + bool restored; + +public: + kmp_affinity_raii_t() : restored(false) { + KMP_CPU_ALLOC(mask); + KMP_ASSERT(mask != NULL); + __kmp_get_system_affinity(mask, TRUE); + } + void restore() { + __kmp_set_system_affinity(mask, TRUE); + KMP_CPU_FREE(mask); + restored = true; + } + ~kmp_affinity_raii_t() { + if (!restored) { + __kmp_set_system_affinity(mask, TRUE); + KMP_CPU_FREE(mask); + } + } +}; + bool KMPAffinity::picked_api = false; void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } @@ -469,85 +1003,12 @@ } } -// When sorting by labels, __kmp_affinity_assign_child_nums() must first be -// called to renumber the labels from [0..n] and place them into the child_num -// vector of the address object. This is done in case the labels used for -// the children at one node of the hierarchy differ from those used for -// another node at the same level. Example: suppose the machine has 2 nodes -// with 2 packages each. The first node contains packages 601 and 602, and -// second node contains packages 603 and 604. If we try to sort the table -// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 -// because we are paying attention to the labels themselves, not the ordinal -// child numbers. By using the child numbers in the sort, the result is -// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. -static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, - int numAddrs) { - KMP_DEBUG_ASSERT(numAddrs > 0); - int depth = address2os->first.depth; - unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - int labCt; - for (labCt = 0; labCt < depth; labCt++) { - address2os[0].first.childNums[labCt] = counts[labCt] = 0; - lastLabel[labCt] = address2os[0].first.labels[labCt]; - } - int i; - for (i = 1; i < numAddrs; i++) { - for (labCt = 0; labCt < depth; labCt++) { - if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { - int labCt2; - for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { - counts[labCt2] = 0; - lastLabel[labCt2] = address2os[i].first.labels[labCt2]; - } - counts[labCt]++; - lastLabel[labCt] = address2os[i].first.labels[labCt]; - break; - } - } - for (labCt = 0; labCt < depth; labCt++) { - address2os[i].first.childNums[labCt] = counts[labCt]; - } - for (; labCt < (int)Address::maxDepth; labCt++) { - address2os[i].first.childNums[labCt] = 0; - } - } - __kmp_free(lastLabel); - __kmp_free(counts); -} - -// All of the __kmp_affinity_create_*_map() routines should set -// __kmp_affinity_masks to a vector of affinity mask objects of length -// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return -// the number of levels in the machine topology tree (zero if -// __kmp_affinity_type == affinity_none). -// -// All of the __kmp_affinity_create_*_map() routines should set -// *__kmp_affin_fullMask to the affinity mask for the initialization thread. -// They need to save and restore the mask, and it could be needed later, so -// saving it is just an optimization to avoid calling kmp_get_system_affinity() -// again. 
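A minimal sketch of how the new kmp_affinity_raii_t helper is intended to be used; the exact call sites are an assumption here, but the pattern replaces the manual oldMask save/restore that this patch removes from the topology detection routines further down.

  {
    // Constructor snapshots the calling thread's current system affinity mask.
    kmp_affinity_raii_t previous_affinity;
    // ... temporarily bind to individual OS procs while probing the topology ...
    previous_affinity.restore(); // put the original mask back and free it early
  } // if restore() was never called, the destructor performs the same cleanup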
+// All of the __kmp_affinity_create_*_map() routines should allocate the +// internal topology object and set the layer ids for it. Each routine +// returns a boolean on whether it was successful at doing so. kmp_affin_mask_t *__kmp_affin_fullMask = NULL; -static int nCoresPerPkg, nPackages; -static int __kmp_nThreadsPerCore; -#ifndef KMP_DFLT_NTH_CORES -static int __kmp_ncores; -#endif -static int *__kmp_pu_os_idx = NULL; -static int nDiesPerPkg = 1; - -// __kmp_affinity_uniform_topology() doesn't work when called from -// places which support arbitrarily many levels in the machine topology -// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() -// __kmp_affinity_create_x2apicid_map(). -inline static bool __kmp_affinity_uniform_topology() { - return __kmp_avail_proc == - (__kmp_nThreadsPerCore * nCoresPerPkg * nDiesPerPkg * nPackages); -} - #if KMP_USE_HWLOC - static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) { #if HWLOC_API_VERSION >= 0x00020000 return hwloc_obj_type_is_cache(obj->type); @@ -590,6 +1051,13 @@ case HWLOC_OBJ_GROUP: if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE) return KMP_HW_DIE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE) + return KMP_HW_TILE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE) + return KMP_HW_MODULE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP) + return KMP_HW_PROC_GROUP; + return KMP_HW_UNKNOWN; #if HWLOC_API_VERSION >= 0x00020100 case HWLOC_OBJ_DIE: return KMP_HW_DIE; @@ -617,35 +1085,6 @@ return retval; } -static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t, - hwloc_obj_t o, - kmp_hwloc_depth_t depth, - hwloc_obj_t *f) { - if (o->depth == depth) { - if (*f == NULL) - *f = o; // output first descendant found - return 1; - } - int sum = 0; - for (unsigned i = 0; i < o->arity; i++) - sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); - return sum; // will be 0 if no one found (as PU arity is 0) -} - -static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, - hwloc_obj_type_t type, - hwloc_obj_t *f) { - if (!hwloc_compare_types(o->type, type)) { - if (*f == NULL) - *f = o; // output first descendant found - return 1; - } - int sum = 0; - for (unsigned i = 0; i < o->arity; i++) - sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); - return sum; // will be 0 if no one found (as PU arity is 0) -} - // This gets the sub_id for a lower object under a higher object in the // topology tree static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher, @@ -671,27 +1110,23 @@ return sub_id; } -static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { +static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { kmp_hw_t type; - int hw_thread_index, sub_id, nActiveThreads; + int hw_thread_index, sub_id; int depth; hwloc_obj_t pu, obj, root, prev; - int ratio[KMP_HW_LAST]; - int count[KMP_HW_LAST]; kmp_hw_t types[KMP_HW_LAST]; + hwloc_obj_type_t hwloc_types[KMP_HW_LAST]; hwloc_topology_t tp = __kmp_hwloc_topology; *msg_id = kmp_i18n_null; - - // Save the affinity mask for the current thread. 
- kmp_affin_mask_t *oldMask; - KMP_CPU_ALLOC(oldMask); - __kmp_get_system_affinity(oldMask, TRUE); + if (__kmp_affinity_verbose) { + KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); + } if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data - // available from cpuid on the current thread, and __kmp_xproc. + // available from hwloc on the current thread, and __kmp_xproc. KMP_ASSERT(__kmp_affinity_type == affinity_none); // hwloc only guarantees existance of PU object, so check PACKAGE and CORE hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); @@ -708,19 +1143,7 @@ if (nCoresPerPkg == 0) nCoresPerPkg = 1; // to prevent possible division by 0 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotUsingHwloc, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - KMP_CPU_FREE(oldMask); - return 0; + return true; } root = hwloc_get_root_obj(tp); @@ -728,8 +1151,10 @@ // Figure out the depth and types in the topology depth = 0; pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin()); + KMP_ASSERT(pu); obj = pu; types[depth] = KMP_HW_THREAD; + hwloc_types[depth] = obj->type; depth++; while (obj != root && obj != NULL) { obj = obj->parent; @@ -743,6 +1168,7 @@ } if (memory && memory->type == HWLOC_OBJ_NUMANODE) { types[depth] = KMP_HW_NUMA; + hwloc_types[depth] = memory->type; depth++; } } @@ -750,36 +1176,36 @@ type = __kmp_hwloc_type_2_topology_type(obj); if (type != KMP_HW_UNKNOWN) { types[depth] = type; + hwloc_types[depth] = obj->type; depth++; } } - KMP_ASSERT(depth > 0 && depth <= KMP_HW_LAST); + KMP_ASSERT(depth > 0); // Get the order for the types correct for (int i = 0, j = depth - 1; i < j; ++i, --j) { + hwloc_obj_type_t hwloc_temp = hwloc_types[i]; kmp_hw_t temp = types[i]; types[i] = types[j]; types[j] = temp; + hwloc_types[i] = hwloc_types[j]; + hwloc_types[j] = hwloc_temp; } // Allocate the data structure to be returned. 
- AddrUnsPair *retval = - (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); hw_thread_index = 0; pu = NULL; - nActiveThreads = 0; while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) { int index = depth - 1; bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask); - Address hw_thread(depth); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); if (included) { - hw_thread.labels[index] = pu->logical_index; - __kmp_pu_os_idx[hw_thread_index] = pu->os_index; + hw_thread.clear(); + hw_thread.ids[index] = pu->logical_index; + hw_thread.os_id = pu->os_index; index--; - nActiveThreads++; } obj = pu; prev = obj; @@ -799,214 +1225,54 @@ if (memory && memory->type == HWLOC_OBJ_NUMANODE) { sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev); if (included) { - hw_thread.labels[index] = memory->logical_index; - hw_thread.labels[index + 1] = sub_id; + hw_thread.ids[index] = memory->logical_index; + hw_thread.ids[index + 1] = sub_id; index--; } prev = memory; } + prev = obj; } #endif type = __kmp_hwloc_type_2_topology_type(obj); if (type != KMP_HW_UNKNOWN) { sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev); if (included) { - hw_thread.labels[index] = obj->logical_index; - hw_thread.labels[index + 1] = sub_id; + hw_thread.ids[index] = obj->logical_index; + hw_thread.ids[index + 1] = sub_id; index--; } prev = obj; } } - if (included) { - retval[hw_thread_index] = AddrUnsPair(hw_thread, pu->os_index); + if (included) hw_thread_index++; - } } + __kmp_topology->sort_ids(); + return true; +} +#endif // KMP_USE_HWLOC - // If there's only one thread context to bind to, return now. - KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc); - KMP_ASSERT(nActiveThreads > 0); - if (nActiveThreads == 1) { - __kmp_ncores = nPackages = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - // Form an Address object which only includes the package level. - Address addr(1); - addr.labels[0] = retval[0].first.labels[0]; - retval[0].first = addr; - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); - } +// If we don't know how to retrieve the machine's processor topology, or +// encounter an error in doing so, this routine is called to form a "flat" +// mapping of os thread id's <-> processor id's. +static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) { + *msg_id = kmp_i18n_null; + int depth = 3; + kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}; - *address2os = retval; - KMP_CPU_FREE(oldMask); - return 1; + if (__kmp_affinity_verbose) { + KMP_INFORM(UsingFlatOS, "KMP_AFFINITY"); } - // Sort the table by physical Id. - qsort(retval, nActiveThreads, sizeof(*retval), - __kmp_affinity_cmp_Address_labels); - - // Find any levels with radiix 1, and remove them from the map - // (except for the package level). 
- depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, - types); - - __kmp_affinity_gather_enumeration_information(retval, nActiveThreads, depth, - types, ratio, count); - - for (int level = 0; level < depth; ++level) { - if ((types[level] == KMP_HW_L2 || types[level] == KMP_HW_L3)) - __kmp_tile_depth = level; - } - - // This routine should set __kmp_ncores, as well as - // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - int thread_level, core_level, tile_level, numa_level, socket_level; - thread_level = core_level = tile_level = numa_level = socket_level = -1; - for (int level = 0; level < depth; ++level) { - if (types[level] == KMP_HW_THREAD) - thread_level = level; - else if (types[level] == KMP_HW_CORE) - core_level = level; - else if (types[level] == KMP_HW_SOCKET) - socket_level = level; - else if (types[level] == KMP_HW_TILE) - tile_level = level; - else if (types[level] == KMP_HW_NUMA) - numa_level = level; - } - __kmp_nThreadsPerCore = - __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); - nCoresPerPkg = - __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); - if (socket_level >= 0) - nPackages = count[socket_level]; - else - nPackages = 1; - if (core_level >= 0) - __kmp_ncores = count[core_level]; - else - __kmp_ncores = 1; - - unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count); - - // Print the machine topology summary. - if (__kmp_affinity_verbose) { - kmp_hw_t numerator_type, denominator_type; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - - __kmp_str_buf_clear(&buf); - - if (core_level < 0) - core_level = depth - 1; - int ncores = count[core_level]; - - denominator_type = KMP_HW_UNKNOWN; - for (int level = 0; level < depth; ++level) { - int c; - bool plural; - numerator_type = types[level]; - c = ratio[level]; - plural = (c > 1); - if (level == 0) { - __kmp_str_buf_print( - &buf, "%d %s", c, - __kmp_hw_get_catalog_string(numerator_type, plural)); - } else { - __kmp_str_buf_print(&buf, " x %d %s/%s", c, - __kmp_hw_get_catalog_string(numerator_type, plural), - __kmp_hw_get_catalog_string(denominator_type)); - } - denominator_type = numerator_type; - } - KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores); - __kmp_str_buf_free(&buf); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - // Set the granularity level based on what levels are modeled - // in the machine topology map. - if (__kmp_affinity_gran == affinity_gran_node) - __kmp_affinity_gran = affinity_gran_numa; - KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default); - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; // lowest level (e.g. 
fine) - if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) - __kmp_affinity_gran_levels++; - if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) - __kmp_affinity_gran_levels++; - if ((tile_level >= 0) && (__kmp_affinity_gran > affinity_gran_tile)) - __kmp_affinity_gran_levels++; - if ((numa_level >= 0) && (__kmp_affinity_gran > affinity_gran_numa)) - __kmp_affinity_gran_levels++; - if ((socket_level >= 0) && (__kmp_affinity_gran > affinity_gran_package)) - __kmp_affinity_gran_levels++; - } - - if (__kmp_affinity_verbose) - __kmp_affinity_print_topology(retval, nActiveThreads, depth, types); - - KMP_CPU_FREE(oldMask); - *address2os = retval; - return depth; -} -#endif // KMP_USE_HWLOC - -// If we don't know how to retrieve the machine's processor topology, or -// encounter an error in doing so, this routine is called to form a "flat" -// mapping of os thread id's <-> processor id's. -static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { - *address2os = NULL; - *msg_id = kmp_i18n_null; - - // Even if __kmp_affinity_type == affinity_none, this routine might still - // called to set __kmp_ncores, as well as - // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - if (!KMP_AFFINITY_CAPABLE()) { - KMP_ASSERT(__kmp_affinity_type == affinity_none); - __kmp_ncores = nPackages = __kmp_xproc; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffFlatTopology, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - return 0; + // Even if __kmp_affinity_type == affinity_none, this routine might still + // called to set __kmp_ncores, as well as + // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + if (!KMP_AFFINITY_CAPABLE()) { + KMP_ASSERT(__kmp_affinity_type == affinity_none); + __kmp_ncores = nPackages = __kmp_xproc; + __kmp_nThreadsPerCore = nCoresPerPkg = 1; + return true; } // When affinity is off, this routine will still be called to set @@ -1015,29 +1281,9 @@ // not enabled. __kmp_ncores = nPackages = __kmp_avail_proc; __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - if (__kmp_affinity_type == affinity_none) { - int avail_ct = 0; - int i; - KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { - if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) - continue; - __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat - } - return 0; - } // Construct the data structure to be returned. 
- *address2os = - (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); int avail_ct = 0; int i; KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { @@ -1045,50 +1291,47 @@ if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { continue; } - __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat - Address addr(1); - addr.labels[0] = i; - (*address2os)[avail_ct++] = AddrUnsPair(addr, i); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct); + hw_thread.clear(); + hw_thread.os_id = i; + hw_thread.ids[0] = i; + hw_thread.ids[1] = 0; + hw_thread.ids[2] = 0; + avail_ct++; } if (__kmp_affinity_verbose) { KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); } - - if (__kmp_affinity_gran_levels < 0) { - // Only the package level is modeled in the machine topology map, - // so the #levels of granularity is either 0 or 1. - if (__kmp_affinity_gran > affinity_gran_package) { - __kmp_affinity_gran_levels = 1; - } else { - __kmp_affinity_gran_levels = 0; - } - } - return 1; + return true; } #if KMP_GROUP_AFFINITY - // If multiple Windows* OS processor groups exist, we can create a 2-level // topology map with the groups at level 0 and the individual procs at level 1. // This facilitates letting the threads float among all procs in a group, // if granularity=group (the default when there are multiple groups). -static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { - *address2os = NULL; +static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) { *msg_id = kmp_i18n_null; + int depth = 3; + kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD}; + const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR); + + if (__kmp_affinity_verbose) { + KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); + } - // If we aren't affinity capable, then return now. - // The flat mapping will be used. + // If we aren't affinity capable, then use flat topology if (!KMP_AFFINITY_CAPABLE()) { - // FIXME set *msg_id - return -1; + KMP_ASSERT(__kmp_affinity_type == affinity_none); + nPackages = __kmp_num_proc_groups; + __kmp_nThreadsPerCore = 1; + __kmp_ncores = __kmp_xproc; + nCoresPerPkg = nPackages / __kmp_ncores; + return true; } // Construct the data structure to be returned. 
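A small sketch (not part of this patch) of the group/id split used in the construction just below: on 64-bit Windows a processor group holds up to 64 logical procs (CHAR_BIT * sizeof(DWORD_PTR)), so an OS proc index splits into a group number and a position within that group. The proc indices are made up.

#include <cstdio>

int main() {
  const int BITS_PER_GROUP = 64;    // CHAR_BIT * sizeof(DWORD_PTR) on Win64
  const int procs[] = {3, 64, 100}; // hypothetical OS proc indices
  for (int p : procs)
    std::printf("proc %3d -> group %d, id within group %d\n", p,
                p / BITS_PER_GROUP, p % BITS_PER_GROUP);
  return 0;
}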
- *address2os = - (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); int avail_ct = 0; int i; KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { @@ -1096,77 +1339,18 @@ if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { continue; } - __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat - Address addr(2); - addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); - addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); - (*address2os)[avail_ct++] = AddrUnsPair(addr, i); - - if (__kmp_affinity_verbose) { - KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], - addr.labels[1]); - } + kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++); + hw_thread.clear(); + hw_thread.os_id = i; + hw_thread.ids[0] = i / BITS_PER_GROUP; + hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP; } - - if (__kmp_affinity_gran_levels < 0) { - if (__kmp_affinity_gran == affinity_gran_group) { - __kmp_affinity_gran_levels = 1; - } else if ((__kmp_affinity_gran == affinity_gran_fine) || - (__kmp_affinity_gran == affinity_gran_thread)) { - __kmp_affinity_gran_levels = 0; - } else { - const char *gran_str = NULL; - if (__kmp_affinity_gran == affinity_gran_core) { - gran_str = "core"; - } else if (__kmp_affinity_gran == affinity_gran_package) { - gran_str = "package"; - } else if (__kmp_affinity_gran == affinity_gran_node) { - gran_str = "node"; - } else { - KMP_ASSERT(0); - } - - // Warning: can't use affinity granularity \"gran\" with group topology - // method, using "thread" - __kmp_affinity_gran_levels = 0; - } - } - return 2; + return true; } - #endif /* KMP_GROUP_AFFINITY */ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 -/* - * CPUID.B or 1F, Input ECX (sub leaf # aka level number) - Bits Bits Bits Bits - 31-16 15-8 7-4 4-0 ----+-----------+--------------+-------------+-----------------+ -EAX| reserved | reserved | reserved | Bits to Shift | ----+-----------|--------------+-------------+-----------------| -EBX| reserved | Num logical processors at level (16 bits) | ----+-----------|--------------+-------------------------------| -ECX| reserved | Level Type | Level Number (8 bits) | ----+-----------+--------------+-------------------------------| -EDX| X2APIC ID (32 bits) | ----+----------------------------------------------------------+ -*/ - -enum { - INTEL_LEVEL_TYPE_INVALID = 0, // Package level - INTEL_LEVEL_TYPE_SMT = 1, - INTEL_LEVEL_TYPE_CORE = 2, - INTEL_LEVEL_TYPE_TILE = 3, - INTEL_LEVEL_TYPE_MODULE = 4, - INTEL_LEVEL_TYPE_DIE = 5, - INTEL_LEVEL_TYPE_LAST = 6, -}; - -struct cpuid_level_info_t { - unsigned level_type, mask, mask_width, nitems, cache_mask; -}; - template static inline unsigned __kmp_extract_bits(kmp_uint32 v) { const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB; @@ -1177,84 +1361,6 @@ return retval; } -static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { - switch (intel_type) { - case INTEL_LEVEL_TYPE_INVALID: - return KMP_HW_SOCKET; - case INTEL_LEVEL_TYPE_SMT: - return KMP_HW_THREAD; - case INTEL_LEVEL_TYPE_CORE: - return KMP_HW_CORE; - // TODO: add support for the tile and module - case INTEL_LEVEL_TYPE_TILE: - return KMP_HW_UNKNOWN; - case INTEL_LEVEL_TYPE_MODULE: - return KMP_HW_UNKNOWN; - case INTEL_LEVEL_TYPE_DIE: - return KMP_HW_DIE; - } - return KMP_HW_UNKNOWN; -} - -// This function takes the topology leaf, a levels array to store the 
levels -// detected and a bitmap of the known levels. -// Returns the number of levels in the topology -static unsigned -__kmp_x2apicid_get_levels(int leaf, - cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], - kmp_uint64 known_levels) { - unsigned level, levels_index; - unsigned level_type, mask_width, nitems; - kmp_cpuid buf; - - // The new algorithm has known topology layers act as highest unknown topology - // layers when unknown topology layers exist. - // e.g., Suppose layers were SMT CORE PACKAGE - // Then CORE will take the characteristics (nitems and mask width) of . - // In developing the id mask for each layer, this eliminates unknown portions - // of the topology while still keeping the correct underlying structure. - level = levels_index = 0; - do { - __kmp_x86_cpuid(leaf, level, &buf); - level_type = __kmp_extract_bits<8, 15>(buf.ecx); - mask_width = __kmp_extract_bits<0, 4>(buf.eax); - nitems = __kmp_extract_bits<0, 15>(buf.ebx); - if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) - return 0; - - if (known_levels & (1ull << level_type)) { - // Add a new level to the topology - KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); - levels[levels_index].level_type = level_type; - levels[levels_index].mask_width = mask_width; - levels[levels_index].nitems = nitems; - levels_index++; - } else { - // If it is an unknown level, then logically move the previous layer up - if (levels_index > 0) { - levels[levels_index - 1].mask_width = mask_width; - levels[levels_index - 1].nitems = nitems; - } - } - level++; - } while (level_type != INTEL_LEVEL_TYPE_INVALID); - - // Set the masks to & with apicid - for (unsigned i = 0; i < levels_index; ++i) { - if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { - levels[i].mask = ~((-1) << levels[i].mask_width); - levels[i].cache_mask = (-1) << levels[i].mask_width; - for (unsigned j = 0; j < i; ++j) - levels[i].mask ^= levels[j].mask; - } else { - KMP_DEBUG_ASSERT(levels_index > 0); - levels[i].mask = (-1) << levels[i - 1].mask_width; - levels[i].cache_mask = 0; - } - } - return levels_index; -} - static int __kmp_cpuid_mask_width(int count) { int r = 0; @@ -1293,21 +1399,78 @@ return 0; } +class kmp_cache_info_t { +public: + struct info_t { + unsigned level, mask; + }; + kmp_cache_info_t() : depth(0) { get_leaf4_levels(); } + size_t get_depth() const { return depth; } + info_t &operator[](size_t index) { return table[index]; } + const info_t &operator[](size_t index) const { return table[index]; } + + static kmp_hw_t get_topology_type(unsigned level) { + KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL); + switch (level) { + case 1: + return KMP_HW_L1; + case 2: + return KMP_HW_L2; + case 3: + return KMP_HW_L3; + } + return KMP_HW_UNKNOWN; + } + +private: + static const int MAX_CACHE_LEVEL = 3; + + size_t depth; + info_t table[MAX_CACHE_LEVEL]; + + void get_leaf4_levels() { + unsigned level = 0; + while (depth < MAX_CACHE_LEVEL) { + unsigned cache_type, max_threads_sharing; + unsigned cache_level, cache_mask_width; + kmp_cpuid buf2; + __kmp_x86_cpuid(4, level, &buf2); + cache_type = __kmp_extract_bits<0, 4>(buf2.eax); + if (!cache_type) + break; + // Skip instruction caches + if (cache_type == 2) { + level++; + continue; + } + max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1; + cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing); + cache_level = __kmp_extract_bits<5, 7>(buf2.eax); + table[depth].level = cache_level; + table[depth].mask = ((-1) << cache_mask_width); + depth++; + level++; + } + } +}; + // 
On IA-32 architecture and Intel(R) 64 architecture, we attempt to use // an algorithm which cycles through the available os threads, setting // the current thread's affinity mask to that thread, and then retrieves // the Apic Id for each thread context using the cpuid instruction. -static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { +static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { kmp_cpuid buf; - *address2os = NULL; *msg_id = kmp_i18n_null; + if (__kmp_affinity_verbose) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); + } + // Check if cpuid leaf 4 is supported. __kmp_x86_cpuid(0, 0, &buf); if (buf.eax < 4) { *msg_id = kmp_i18n_str_NoLeaf4Support; - return -1; + return false; } // The algorithm used starts by setting the affinity to each available thread @@ -1365,18 +1528,7 @@ __kmp_ncores = __kmp_xproc; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; __kmp_nThreadsPerCore = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - return 0; + return true; } // From here on, we can assume that it is safe to call @@ -1384,10 +1536,7 @@ // __kmp_affinity_type = affinity_none. // Save the affinity mask for the current thread. - kmp_affin_mask_t *oldMask; - KMP_CPU_ALLOC(oldMask); - KMP_ASSERT(oldMask != NULL); - __kmp_get_system_affinity(oldMask, TRUE); + kmp_affinity_raii_t previous_affinity; // Run through each of the available contexts, binding the current thread // to it, and obtaining the pertinent information using the cpuid instr. @@ -1431,11 +1580,9 @@ // The apic id and max threads per pkg come from cpuid(1). __kmp_x86_cpuid(1, 0, &buf); if (((buf.edx >> 9) & 1) == 0) { - __kmp_set_system_affinity(oldMask, TRUE); __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_ApicNotPresent; - return -1; + return false; } threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; @@ -1467,11 +1614,9 @@ // I've never seen this one happen, but I suppose it could, if the cpuid // instruction on a chip was really screwed up. Make sure to restore the // affinity mask before the tail call. - __kmp_set_system_affinity(oldMask, TRUE); __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; + return false; } int maskC = (1 << widthC) - 1; @@ -1485,50 +1630,7 @@ // We've collected all the info we need. // Restore the old affinity mask for this thread. - __kmp_set_system_affinity(oldMask, TRUE); - - // If there's only one thread context to bind to, form an Address object - // with depth 1 and return immediately (or, if affinity is off, set - // address2os to NULL and return). - // - // If it is configured to omit the package level when there is only a single - // package, the logic at the end of this routine won't work if there is only - // a single thread - it would try to form an Address object with depth 0. 
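A standalone sketch (not part of this patch) of the legacy APIC id split computed earlier in this routine: widthCT comes from maxThreadsPerPkg, widthC from maxCoresPerPkg, and widthT is their difference, so the id decomposes into package, core and thread fields. The id and the two limits are hypothetical.

#include <cstdio>

static int mask_width(int count) { // smallest w with (1 << w) >= count
  int w = 0;
  while ((1 << w) < count)
    ++w;
  return w;
}

int main() {
  unsigned apic_id = 0x2D;      // hypothetical APIC id from cpuid(1).ebx
  int max_threads_per_pkg = 16; // cpuid(1).ebx[23:16] on a real machine
  int max_cores_per_pkg = 8;    // cpuid(4).eax[31:26] + 1 on a real machine
  int widthCT = mask_width(max_threads_per_pkg); // low bits for core + thread
  int widthC = mask_width(max_cores_per_pkg);    // core field width
  int widthT = widthCT - widthC;                 // thread field width
  unsigned pkg_id = apic_id >> widthCT;
  unsigned core_id = (apic_id >> widthT) & ((1u << widthC) - 1);
  unsigned thread_id = apic_id & ((1u << widthT) - 1);
  std::printf("apic 0x%x -> pkg %u core %u thread %u\n", apic_id, pkg_id,
              core_id, thread_id);
  return 0;
}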
- KMP_ASSERT(nApics > 0); - if (nApics == 1) { - __kmp_ncores = nPackages = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return 0; - } - - *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); - Address addr(1); - addr.labels[0] = threadInfo[0].pkgId; - (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); - } - - __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return 1; - } + previous_affinity.restore(); // Sort the threadInfo table by physical Id. qsort(threadInfo, nApics, sizeof(*threadInfo), @@ -1597,9 +1699,8 @@ lastThreadId = threadInfo[i].threadId; } else { __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; - return -1; + return false; } // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg @@ -1607,108 +1708,183 @@ if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InconsistentCpuidInfo; - return -1; + return false; } } + // When affinity is off, this routine will still be called to set + // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + // Make sure all these vars are set correctly nPackages = pkgCt; if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; - - // When affinity is off, this routine will still be called to set - // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - // Make sure all these vars are set correctly, and return now if affinity is - // not enabled. __kmp_ncores = nCores; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - for (i = 0; i < nApics; ++i) { - __kmp_pu_os_idx[i] = threadInfo[i].osId; - } - if (__kmp_affinity_type == affinity_none) { - __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return 0; - } // Now that we've determined the number of packages, the number of cores per // package, and the number of threads per core, we can construct the data // structure that is to be returned. + int idx = 0; int pkgLevel = 0; - int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; - int threadLevel = - (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1); - unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); + int coreLevel = 1; + int threadLevel = 2; + //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); + int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); + kmp_hw_t types[3]; + if (pkgLevel >= 0) + types[idx++] = KMP_HW_SOCKET; + if (coreLevel >= 0) + types[idx++] = KMP_HW_CORE; + if (threadLevel >= 0) + types[idx++] = KMP_HW_THREAD; KMP_ASSERT(depth > 0); - *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); + __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); for (i = 0; i < nApics; ++i) { - Address addr(depth); + idx = 0; unsigned os = threadInfo[i].osId; - int d = 0; + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + hw_thread.clear(); if (pkgLevel >= 0) { - addr.labels[d++] = threadInfo[i].pkgId; + hw_thread.ids[idx++] = threadInfo[i].pkgId; } if (coreLevel >= 0) { - addr.labels[d++] = threadInfo[i].coreId; + hw_thread.ids[idx++] = threadInfo[i].coreId; } if (threadLevel >= 0) { - addr.labels[d++] = threadInfo[i].threadId; + hw_thread.ids[idx++] = threadInfo[i].threadId; } - (*address2os)[i] = AddrUnsPair(addr, os); - } - - if (__kmp_affinity_gran_levels < 0) { - // Set the granularity level based on what levels are modeled in the machine - // topology map. - __kmp_affinity_gran_levels = 0; - if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { - __kmp_affinity_gran_levels++; - } - if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { - __kmp_affinity_gran_levels++; - } - if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { - __kmp_affinity_gran_levels++; - } - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, - coreLevel, threadLevel); + hw_thread.os_id = os; } __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return depth; + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; + return false; + } + return true; } // Intel(R) microarchitecture code name Nehalem, Dunnington and later // architectures support a newer interface for specifying the x2APIC Ids, // based on CPUID.B or CPUID.1F -static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { +/* + * CPUID.B or 1F, Input ECX (sub leaf # aka level number) + Bits Bits Bits Bits + 31-16 15-8 7-4 4-0 +---+-----------+--------------+-------------+-----------------+ +EAX| reserved | reserved | reserved | Bits to Shift | +---+-----------|--------------+-------------+-----------------| +EBX| reserved | Num logical processors at level (16 bits) | +---+-----------|--------------+-------------------------------| +ECX| reserved | Level Type | Level Number (8 bits) | +---+-----------+--------------+-------------------------------| +EDX| X2APIC ID (32 bits) | +---+----------------------------------------------------------+ +*/ + +enum { + INTEL_LEVEL_TYPE_INVALID = 0, // Package level + INTEL_LEVEL_TYPE_SMT = 1, + INTEL_LEVEL_TYPE_CORE = 2, + INTEL_LEVEL_TYPE_TILE = 3, + INTEL_LEVEL_TYPE_MODULE = 4, + INTEL_LEVEL_TYPE_DIE = 5, + INTEL_LEVEL_TYPE_LAST = 6, +}; + +struct cpuid_level_info_t { + unsigned level_type, mask, mask_width, nitems, cache_mask; +}; + +static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { + switch (intel_type) { + case INTEL_LEVEL_TYPE_INVALID: + return KMP_HW_SOCKET; + case INTEL_LEVEL_TYPE_SMT: + return KMP_HW_THREAD; + case INTEL_LEVEL_TYPE_CORE: + return KMP_HW_CORE; + case INTEL_LEVEL_TYPE_TILE: + return KMP_HW_TILE; + case INTEL_LEVEL_TYPE_MODULE: 
+ return KMP_HW_MODULE; + case INTEL_LEVEL_TYPE_DIE: + return KMP_HW_DIE; + } + return KMP_HW_UNKNOWN; +} + +// This function takes the topology leaf, a levels array to store the levels +// detected and a bitmap of the known levels. +// Returns the number of levels in the topology +static unsigned +__kmp_x2apicid_get_levels(int leaf, + cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], + kmp_uint64 known_levels) { + unsigned level, levels_index; + unsigned level_type, mask_width, nitems; + kmp_cpuid buf; + + // New algorithm has known topology layers act as highest unknown topology + // layers when unknown topology layers exist. + // e.g., Suppose layers were SMT CORE PACKAGE, where + // are unknown topology layers, Then SMT will take the characteristics of + // (SMT x ) and CORE will take the characteristics of (CORE x x ). + // This eliminates unknown portions of the topology while still keeping the + // correct structure. + level = levels_index = 0; + do { + __kmp_x86_cpuid(leaf, level, &buf); + level_type = __kmp_extract_bits<8, 15>(buf.ecx); + mask_width = __kmp_extract_bits<0, 4>(buf.eax); + nitems = __kmp_extract_bits<0, 15>(buf.ebx); + if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) + return 0; + + if (known_levels & (1ull << level_type)) { + // Add a new level to the topology + KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); + levels[levels_index].level_type = level_type; + levels[levels_index].mask_width = mask_width; + levels[levels_index].nitems = nitems; + levels_index++; + } else { + // If it is an unknown level, then logically move the previous layer up + if (levels_index > 0) { + levels[levels_index - 1].mask_width = mask_width; + levels[levels_index - 1].nitems = nitems; + } + } + level++; + } while (level_type != INTEL_LEVEL_TYPE_INVALID); + + // Set the masks to & with apicid + for (unsigned i = 0; i < levels_index; ++i) { + if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { + levels[i].mask = ~((-1) << levels[i].mask_width); + levels[i].cache_mask = (-1) << levels[i].mask_width; + for (unsigned j = 0; j < i; ++j) + levels[i].mask ^= levels[j].mask; + } else { + KMP_DEBUG_ASSERT(levels_index > 0); + levels[i].mask = (-1) << levels[i - 1].mask_width; + levels[i].cache_mask = 0; + } + } + return levels_index; +} + +static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; - int ratio[KMP_HW_LAST]; - int count[KMP_HW_LAST]; kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; unsigned levels_index; kmp_cpuid buf; @@ -1722,6 +1898,9 @@ KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); *msg_id = kmp_i18n_null; + if (__kmp_affinity_verbose) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); + } // Figure out the known topology levels known_levels = 0ull; @@ -1771,7 +1950,7 @@ } if (topology_leaf == -1 || levels_index == 0) { *msg_id = leaf_message_id; - return -1; + return false; } KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); @@ -1784,30 +1963,40 @@ // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc. 
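A compile-and-run sketch (not part of this patch) of what __kmp_x2apicid_get_levels above does with one sub-leaf: pull the shift width and level type out of the registers with the same bit trick as __kmp_extract_bits, build per-level masks, and apply them to an x2APIC id. The register values are hypothetical.

#include <cstdio>

template <int LSB, int MSB> static unsigned extract_bits(unsigned v) {
  const unsigned SHIFT = 31 - MSB; // drop bits above MSB, then bits below LSB
  return (v << SHIFT) >> SHIFT >> LSB;
}

int main() {
  // Hypothetical sub-leaf 1 of leaf 0x1F on a 2-way SMT part:
  unsigned eax = 0x5, ecx = 0x0201, edx = 0x27; // shift width, level info, id
  unsigned core_mask_width = extract_bits<0, 4>(eax); // 5
  unsigned level_type = extract_bits<8, 15>(ecx); // 2 == INTEL_LEVEL_TYPE_CORE
  unsigned smt_mask_width = 1; // from sub-leaf 0 (not shown)

  // Masks built the same way as the level setup above:
  unsigned smt_mask = ~(~0u << smt_mask_width);              // 0x1
  unsigned core_mask = ~(~0u << core_mask_width) ^ smt_mask; // 0x1e
  unsigned pkg_mask = ~0u << core_mask_width; // everything above the core bits

  std::printf("level_type=%u mask_width=%u\n", level_type, core_mask_width);
  std::printf("apic 0x%x -> thread %u core %u pkg %u\n", edx, edx & smt_mask,
              (edx & core_mask) >> smt_mask_width,
              (edx & pkg_mask) >> core_mask_width);
  return 0;
}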
KMP_ASSERT(__kmp_affinity_type == affinity_none); - for (unsigned i = 0; i < levels_index; ++i) { if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { __kmp_nThreadsPerCore = levels[i].nitems; } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { nCoresPerPkg = levels[i].nitems; - } else if (levels[i].level_type == INTEL_LEVEL_TYPE_DIE) { - nDiesPerPkg = levels[i].nitems; } } __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotCapableUseLocCpuidL, "KMP_AFFINITY", topology_leaf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); + return true; + } + + // Allocate the data structure to be returned. + int depth = levels_index; + for (int i = depth - 1, j = 0; i >= 0; --i, ++j) + types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); + __kmp_topology = + kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); + + // Insert equivalent cache types if they exist + kmp_cache_info_t cache_info; + for (size_t i = 0; i < cache_info.get_depth(); ++i) { + const kmp_cache_info_t::info_t &info = cache_info[i]; + unsigned cache_mask = info.mask; + unsigned cache_level = info.level; + for (unsigned j = 0; j < levels_index; ++j) { + unsigned hw_cache_mask = levels[j].cache_mask; + kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); + if (hw_cache_mask == cache_mask && j < levels_index - 1) { + kmp_hw_t type = + __kmp_intel_type_2_topology_type(levels[j + 1].level_type); + __kmp_topology->set_equivalent_type(cache_type, type); } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); } - return 0; } // From here on, we can assume that it is safe to call @@ -1815,21 +2004,12 @@ // __kmp_affinity_type = affinity_none. // Save the affinity mask for the current thread. - kmp_affin_mask_t *oldMask; - KMP_CPU_ALLOC(oldMask); - __kmp_get_system_affinity(oldMask, TRUE); - - // Allocate the data structure to be returned. - int depth = levels_index; - for (int i = depth - 1, j = 0; i >= 0; --i, ++j) - types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); - AddrUnsPair *retval = - (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); + kmp_affinity_raii_t previous_affinity; // Run through each of the available contexts, binding the current thread // to it, and obtaining the pertinent information using the cpuid instr. 
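A sketch (not part of this patch) of the cache-equivalence test a few lines above: a cache whose leaf-4 sharing mask equals the cache_mask of some topology level is recorded as equivalent to the layer one step above that level (so a per-core L2 aliases the core layer, a per-socket L3 the socket layer). The masks and levels here are hypothetical.

#include <cstdio>

int main() {
  const int levels_index = 3;
  // finest to coarsest, each with the mask of the apic-id bits above it
  struct { const char *name; unsigned cache_mask; } levels[levels_index] = {
      {"thread", 0xfffffffeu}, {"core", 0xfffffff0u}, {"socket", 0x0u}};
  struct { int level; unsigned mask; } caches[] = {{2, 0xfffffffeu},
                                                   {3, 0xfffffff0u}};
  for (const auto &c : caches) {
    bool matched = false;
    for (int j = 0; j < levels_index - 1; ++j)
      if (levels[j].cache_mask == c.mask) {
        std::printf("L%d cache is equivalent to the %s layer\n", c.level,
                    levels[j + 1].name);
        matched = true;
      }
    if (!matched)
      std::printf("L%d cache has no equivalent topology layer\n", c.level);
  }
  return 0;
}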
unsigned int proc; - int nApics = 0; + int hw_thread_index = 0; KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; unsigned my_levels_index; @@ -1838,216 +2018,41 @@ if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { continue; } - KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); + KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); __kmp_affinity_dispatch->bind_thread(proc); // New algorithm __kmp_x86_cpuid(topology_leaf, 0, &buf); apic_id = buf.edx; - Address addr(depth); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); my_levels_index = __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); if (my_levels_index == 0 || my_levels_index != levels_index) { - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; + return false; } + hw_thread.clear(); + hw_thread.os_id = proc; // Put in topology information for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { - addr.labels[idx] = apic_id & my_levels[j].mask; - if (j > 0) - addr.labels[idx] >>= my_levels[j - 1].mask_width; - } - retval[nApics++] = AddrUnsPair(addr, proc); - } - - // We've collected all the info we need. - // Restore the old affinity mask for this thread. - __kmp_set_system_affinity(oldMask, TRUE); - - // If there's only one thread context to bind to, return now. - KMP_ASSERT(nApics > 0); - if (nApics == 1) { - int pkg_level; - __kmp_ncores = nPackages = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - pkg_level = 0; - for (int i = 0; i < depth; ++i) - if (types[i] == KMP_HW_SOCKET) { - pkg_level = i; - break; - } - // Form an Address object which only includes the package level. - Address addr(1); - addr.labels[0] = retval[0].first.labels[pkg_level]; - retval[0].first = addr; - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); - } - - *address2os = retval; - KMP_CPU_FREE(oldMask); - return 1; - } - - // Sort the table by physical Id. - qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); - - __kmp_affinity_gather_enumeration_information(retval, nApics, depth, types, - ratio, count); - - // When affinity is off, this routine will still be called to set - // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - // Make sure all these vars are set correctly, and return if affinity is not - // enabled. 
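A sketch (not part of this patch) of the ratio/count bookkeeping that the removed block below computed by hand (threads per core, cores per package, package and core totals). The numbers describe a hypothetical uniform machine.

#include <cstdio>

int main() {
  const char *types[] = {"socket", "core", "thread"};
  int ratio[] = {2, 4, 2};  // 2 sockets, 4 cores per socket, 2 threads per core
  int count[] = {2, 8, 16}; // absolute totals at each level
  for (int level = 0; level < 3; ++level)
    std::printf("%-6s ratio=%d count=%d\n", types[level], ratio[level],
                count[level]);
  // Uniformity check: the product of the per-level ratios matches the total
  // number of hw threads only when no level is partially populated.
  bool uniform = (ratio[0] * ratio[1] * ratio[2] == count[2]);
  std::printf("uniform topology: %s\n", uniform ? "yes" : "no");
  return 0;
}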
- int thread_level, core_level, socket_level, die_level; - thread_level = core_level = die_level = socket_level = -1; - for (int level = 0; level < depth; ++level) { - if (types[level] == KMP_HW_THREAD) - thread_level = level; - else if (types[level] == KMP_HW_CORE) - core_level = level; - else if (types[level] == KMP_HW_DIE) - die_level = level; - else if (types[level] == KMP_HW_SOCKET) - socket_level = level; - } - __kmp_nThreadsPerCore = - __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); - if (die_level > 0) { - nDiesPerPkg = - __kmp_affinity_calculate_ratio(ratio, die_level, socket_level); - nCoresPerPkg = __kmp_affinity_calculate_ratio(ratio, core_level, die_level); - } else { - nCoresPerPkg = - __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); - } - if (socket_level >= 0) - nPackages = count[socket_level]; - else - nPackages = 1; - if (core_level >= 0) - __kmp_ncores = count[core_level]; - else - __kmp_ncores = 1; - - // Check to see if the machine topology is uniform - unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count); - - // Print the machine topology summary. - if (__kmp_affinity_verbose) { - kmp_hw_t numerator_type, denominator_type; - KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - - if (core_level < 0) - core_level = depth - 1; - int ncores = count[core_level]; - - denominator_type = KMP_HW_UNKNOWN; - for (int level = 0; level < depth; ++level) { - int c; - bool plural; - numerator_type = types[level]; - c = ratio[level]; - plural = (c > 1); - if (level == 0) { - __kmp_str_buf_print( - &buf, "%d %s", c, - __kmp_hw_get_catalog_string(numerator_type, plural)); - } else { - __kmp_str_buf_print(&buf, " x %d %s/%s", c, - __kmp_hw_get_catalog_string(numerator_type, plural), - __kmp_hw_get_catalog_string(denominator_type)); + hw_thread.ids[idx] = apic_id & my_levels[j].mask; + if (j > 0) { + hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; } - denominator_type = numerator_type; - } - KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores); - __kmp_str_buf_free(&buf); - } - - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - for (proc = 0; (int)proc < nApics; ++proc) { - __kmp_pu_os_idx[proc] = retval[proc].second; - } - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - // Find any levels with radix 1, and remove them from the map - // (except for the package level). - depth = __kmp_affinity_remove_radix_one_levels(retval, nApics, depth, types); - thread_level = core_level = die_level = socket_level = -1; - for (int level = 0; level < depth; ++level) { - if (types[level] == KMP_HW_THREAD) - thread_level = level; - else if (types[level] == KMP_HW_CORE) - core_level = level; - else if (types[level] == KMP_HW_DIE) - die_level = level; - else if (types[level] == KMP_HW_SOCKET) - socket_level = level; - } - - if (__kmp_affinity_gran_levels < 0) { - // Set the granularity level based on what levels are modeled - // in the machine topology map. 
- __kmp_affinity_gran_levels = 0; - if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { - __kmp_affinity_gran_levels++; - } - if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { - __kmp_affinity_gran_levels++; - } - if ((die_level >= 0) && (__kmp_affinity_gran > affinity_gran_die)) { - __kmp_affinity_gran_levels++; - } - if (__kmp_affinity_gran > affinity_gran_package) { - __kmp_affinity_gran_levels++; } + hw_thread_index++; } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(retval, nApics, depth, types); + KMP_ASSERT(hw_thread_index > 0); + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; + return false; } - - KMP_CPU_FREE(oldMask); - *address2os = retval; - return depth; + return true; } - #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #define osIdIndex 0 @@ -2147,15 +2152,36 @@ } #endif // KMP_USE_HIER_SCHED +static inline const char *__kmp_cpuinfo_get_filename() { + const char *filename; + if (__kmp_cpuinfo_file != nullptr) + filename = __kmp_cpuinfo_file; + else + filename = "/proc/cpuinfo"; + return filename; +} + +static inline const char *__kmp_cpuinfo_get_envvar() { + const char *envvar = nullptr; + if (__kmp_cpuinfo_file != nullptr) + envvar = "KMP_CPUINFO_FILE"; + return envvar; +} + // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the // affinity map. -static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, - int *line, - kmp_i18n_id_t *const msg_id, - FILE *f) { - *address2os = NULL; +static bool __kmp_affinity_create_cpuinfo_map(int *line, + kmp_i18n_id_t *const msg_id) { + const char *filename = __kmp_cpuinfo_get_filename(); + const char *envvar = __kmp_cpuinfo_get_envvar(); *msg_id = kmp_i18n_null; + if (__kmp_affinity_verbose) { + KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); + } + + kmp_safe_raii_file_t f(filename, "r", envvar); + // Scan of the file, and count the number of "processor" (osId) fields, // and find the highest value of for a node_ field. char buf[256]; @@ -2190,14 +2216,12 @@ // Check for empty file / no valid processor records, or too many. The number // of records can't exceed the number of valid bits in the affinity mask. if (num_records == 0) { - *line = 0; *msg_id = kmp_i18n_str_NoProcRecords; - return -1; + return false; } if (num_records > (unsigned)__kmp_xproc) { - *line = 0; *msg_id = kmp_i18n_str_TooManyProcRecords; - return -1; + return false; } // Set the file pointer back to the beginning, so that we can scan the file @@ -2206,9 +2230,8 @@ // at the end allows us to remove a lot of extra checks for termination // conditions. if (fseek(f, 0, SEEK_SET) != 0) { - *line = 0; *msg_id = kmp_i18n_str_CantRewindCpuinfo; - return -1; + return false; } // Allocate the array of records to store the proc info in. The dummy @@ -2271,7 +2294,7 @@ if (long_line) { \ CLEANUP_THREAD_INFO; \ *msg_id = kmp_i18n_str_LongLineCpuinfo; \ - return -1; \ + return false; \ } } (*line)++; @@ -2379,7 +2402,7 @@ if ((int)num_avail == __kmp_xproc) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_TooManyEntries; - return -1; + return false; } // Check for missing fields. 
The osId field must be there, and we @@ -2387,12 +2410,12 @@ if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingProcField; - return -1; + return false; } if (threadInfo[0][pkgIdIndex] == UINT_MAX) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingPhysicalIDField; - return -1; + return false; } // Skip this proc if it is not included in the machine model. @@ -2413,12 +2436,12 @@ no_val: CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingValCpuinfo; - return -1; + return false; dup_field: CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; - return -1; + return false; } *line = 0; @@ -2428,60 +2451,11 @@ // check for num_records == __kmp_xproc ??? - // If there's only one thread context to bind to, form an Address object with - // depth 1 and return immediately (or, if affinity is off, set address2os to - // NULL and return). - // - // If it is configured to omit the package level when there is only a single - // package, the logic at the end of this routine won't work if there is only a - // single thread - it would try to form an Address object with depth 0. - KMP_ASSERT(num_avail > 0); - KMP_ASSERT(num_avail <= num_records); - if (num_avail == 1) { - __kmp_ncores = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; - if (__kmp_affinity_verbose) { - if (!KMP_AFFINITY_CAPABLE()) { - KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } - int index; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - __kmp_str_buf_print(&buf, "1"); - for (index = maxIndex - 1; index > pkgIdIndex; index--) { - __kmp_str_buf_print(&buf, " x 1"); - } - KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); - __kmp_str_buf_free(&buf); - } - - if (__kmp_affinity_type == affinity_none) { - CLEANUP_THREAD_INFO; - return 0; - } - - *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); - Address addr(1); - addr.labels[0] = threadInfo[0][pkgIdIndex]; - (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); - } - - CLEANUP_THREAD_INFO; - return 1; - } + // If it is configured to omit the package level when there is only a single + // package, the logic at the end of this routine won't work if there is only a + // single thread + KMP_ASSERT(num_avail > 0); + KMP_ASSERT(num_avail <= num_records); // Sort the threadInfo table by physical Id. qsort(threadInfo, num_avail, sizeof(*threadInfo), @@ -2598,7 +2572,7 @@ __kmp_free(counts); CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; - return -1; + return false; } // If the thread ids were not specified and we see entries entries that @@ -2629,43 +2603,15 @@ for (index = threadIdIndex; index < maxIndex; index++) { prod *= maxCt[index]; } - bool uniform = (prod == totals[threadIdIndex]); // When affinity is off, this routine will still be called to set // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. // Make sure all these vars are set correctly, and return now if affinity is // not enabled. 
__kmp_ncores = totals[coreIdIndex]; - - if (__kmp_affinity_verbose) { - if (!KMP_AFFINITY_CAPABLE()) { - KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - } else { - KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - } - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - - __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); - for (index = maxIndex - 1; index >= pkgIdIndex; index--) { - __kmp_str_buf_print(&buf, " x %d", maxCt[index]); - } - KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], - maxCt[threadIdIndex], __kmp_ncores); - - __kmp_str_buf_free(&buf); + if (!KMP_AFFINITY_CAPABLE()) { + KMP_ASSERT(__kmp_affinity_type == affinity_none); + return true; } #if KMP_MIC && REDUCE_TEAM_SIZE @@ -2678,21 +2624,7 @@ } #endif // KMP_MIC && REDUCE_TEAM_SIZE - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - for (i = 0; i < num_avail; ++i) { // fill the os indices - __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(lastId); - __kmp_free(totals); - __kmp_free(maxCt); - __kmp_free(counts); - CLEANUP_THREAD_INFO; - return 0; - } // Count the number of levels which have more nodes at that level than at the // parent's level (with there being an implicit root node of the top level). @@ -2706,78 +2638,59 @@ } inMap[maxIndex] = (totals[maxIndex] > 1); inMap[pkgIdIndex] = true; + inMap[coreIdIndex] = true; + inMap[threadIdIndex] = true; int depth = 0; + int idx = 0; + kmp_hw_t types[KMP_HW_LAST]; + int pkgLevel = -1; + int coreLevel = -1; + int threadLevel = -1; for (index = threadIdIndex; index <= maxIndex; index++) { if (inMap[index]) { depth++; } } + if (inMap[pkgIdIndex]) { + pkgLevel = idx; + types[idx++] = KMP_HW_SOCKET; + } + if (inMap[coreIdIndex]) { + coreLevel = idx; + types[idx++] = KMP_HW_CORE; + } + if (inMap[threadIdIndex]) { + threadLevel = idx; + types[idx++] = KMP_HW_THREAD; + } KMP_ASSERT(depth > 0); // Construct the data structure that is to be returned. 
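A sketch (not part of this patch) of the three /proc/cpuinfo fields this parser keys on and how they land in a hw thread record (the thread id is parsed when present and assigned otherwise; it is assumed 0 here). The stanza is made up.

#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  const char *record[] = {"processor\t: 5", "physical id\t: 1", "core id\t: 2"};
  int os_id = -1, pkg_id = -1, core_id = -1;
  for (const char *line : record) {
    int val = std::atoi(std::strchr(line, ':') + 1);
    if (!std::strncmp(line, "processor", 9))
      os_id = val;
    else if (!std::strncmp(line, "physical id", 11))
      pkg_id = val;
    else if (!std::strncmp(line, "core id", 7))
      core_id = val;
  }
  // Ids ordered the way types[] is built above: socket, then core, then thread.
  std::printf("os_id=%d -> socket=%d core=%d thread=%d\n", os_id, pkg_id,
              core_id, 0);
  return 0;
}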
- *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); - int pkgLevel = -1; - int coreLevel = -1; - int threadLevel = -1; + __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); for (i = 0; i < num_avail; ++i) { - Address addr(depth); unsigned os = threadInfo[i][osIdIndex]; int src_index; int dst_index = 0; + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + hw_thread.clear(); + hw_thread.os_id = os; + idx = 0; for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { if (!inMap[src_index]) { continue; } - addr.labels[dst_index] = threadInfo[i][src_index]; if (src_index == pkgIdIndex) { - pkgLevel = dst_index; + hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; } else if (src_index == coreIdIndex) { - coreLevel = dst_index; + hw_thread.ids[coreLevel] = threadInfo[i][src_index]; } else if (src_index == threadIdIndex) { - threadLevel = dst_index; + hw_thread.ids[threadLevel] = threadInfo[i][src_index]; } dst_index++; } - (*address2os)[i] = AddrUnsPair(addr, os); - } - - if (__kmp_affinity_gran_levels < 0) { - // Set the granularity level based on what levels are modeled - // in the machine topology map. - unsigned src_index; - __kmp_affinity_gran_levels = 0; - for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { - if (!inMap[src_index]) { - continue; - } - switch (src_index) { - case threadIdIndex: - if (__kmp_affinity_gran > affinity_gran_thread) { - __kmp_affinity_gran_levels++; - } - - break; - case coreIdIndex: - if (__kmp_affinity_gran > affinity_gran_core) { - __kmp_affinity_gran_levels++; - } - break; - - case pkgIdIndex: - if (__kmp_affinity_gran > affinity_gran_package) { - __kmp_affinity_gran_levels++; - } - break; - } - } - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, - coreLevel, threadLevel); } __kmp_free(inMap); @@ -2786,27 +2699,32 @@ __kmp_free(maxCt); __kmp_free(counts); CLEANUP_THREAD_INFO; - return depth; + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; + return false; + } + return true; } // Create and return a table of affinity masks, indexed by OS thread ID. // This routine handles OR'ing together all the affinity masks of threads // that are sufficiently close, if granularity > fine. static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, - unsigned *numUnique, - AddrUnsPair *address2os, - unsigned numAddrs) { + unsigned *numUnique) { // First form a table of affinity masks in order of OS thread id. - unsigned depth; - unsigned maxOsId; - unsigned i; - - KMP_ASSERT(numAddrs > 0); - depth = address2os[0].first.depth; + int maxOsId; + int i; + int numAddrs = __kmp_topology->get_num_hw_threads(); + int depth = __kmp_topology->get_depth(); + KMP_ASSERT(numAddrs); + KMP_ASSERT(depth); maxOsId = 0; for (i = numAddrs - 1;; --i) { - unsigned osId = address2os[i].second; + int osId = __kmp_topology->at(i).os_id; if (osId > maxOsId) { maxOsId = osId; } @@ -2815,12 +2733,6 @@ } kmp_affin_mask_t *osId2Mask; KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); - - // Sort the address2os table according to physical order. Doing so will put - // all threads on the same core/package/node in consecutive locations. 
- qsort(address2os, numAddrs, sizeof(*address2os), - __kmp_affinity_cmp_Address_labels); - KMP_ASSERT(__kmp_affinity_gran_levels >= 0); if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); @@ -2833,52 +2745,50 @@ } // Run through the table, forming the masks for all threads on each core. - // Threads on the same core will have identical "Address" objects, not + // Threads on the same core will have identical kmp_hw_thread_t objects, not // considering the last level, which must be the thread id. All threads on a // core will appear consecutively. - unsigned unique = 0; - unsigned j = 0; // index of 1st thread on core - unsigned leader = 0; - Address *leaderAddr = &(address2os[0].first); + int unique = 0; + int j = 0; // index of 1st thread on core + int leader = 0; kmp_affin_mask_t *sum; KMP_CPU_ALLOC_ON_STACK(sum); KMP_CPU_ZERO(sum); - KMP_CPU_SET(address2os[0].second, sum); + KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); for (i = 1; i < numAddrs; i++) { // If this thread is sufficiently close to the leader (within the // granularity setting), then set the bit for this os thread in the // affinity mask for this group, and go on to the next thread. - if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { - KMP_CPU_SET(address2os[i].second, sum); + if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); continue; } // For every thread in this group, copy the mask to the thread's entry in // the osId2Mask table. Mark the first address as a leader. for (; j < i; j++) { - unsigned osId = address2os[j].second; + int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); KMP_CPU_COPY(mask, sum); - address2os[j].first.leader = (j == leader); + __kmp_topology->at(j).leader = (j == leader); } unique++; // Start a new mask. leader = i; - leaderAddr = &(address2os[i].first); KMP_CPU_ZERO(sum); - KMP_CPU_SET(address2os[i].second, sum); + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); } // For every thread in last group, copy the mask to the thread's // entry in the osId2Mask table. 
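A sketch (not part of this patch) of the grouping rule __kmp_create_masks applies: with gran_levels > 0, hw threads that agree on every id except the last gran_levels levels share one mask. A 64-bit word stands in for kmp_affin_mask_t, and the ids and OS numbering are made up (already sorted, as sort_ids() would leave them).

#include <cstdio>

int main() {
  const int depth = 3, gran_levels = 1; // granularity=core: ignore thread ids
  int ids[][3] = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1}}; // socket,core,thread
  int os_id[] = {0, 4, 1, 5};
  unsigned long long mask = 1ull << os_id[0];
  int leader = 0;
  for (int i = 1; i <= 4; ++i) {
    bool close = false;
    if (i < 4) { // "close" == equal on all but the last gran_levels levels
      close = true;
      for (int l = 0; l < depth - gran_levels; ++l)
        close = close && (ids[i][l] == ids[leader][l]);
    }
    if (close) {
      mask |= 1ull << os_id[i];
      continue;
    }
    std::printf("group led by os %d: mask 0x%llx\n", os_id[leader], mask);
    if (i < 4) {
      leader = i;
      mask = 1ull << os_id[i];
    }
  }
  return 0;
}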
for (; j < i; j++) { - unsigned osId = address2os[j].second; + int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); KMP_CPU_COPY(mask, sum); - address2os[j].first.leader = (j == leader); + __kmp_topology->at(j).leader = (j == leader); } unique++; KMP_CPU_FREE_FROM_STACK(sum); @@ -3468,673 +3378,15 @@ #undef ADD_MASK #undef ADD_MASK_OSID -#if KMP_USE_HWLOC -static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { - // skip PUs descendants of the object o - int skipped = 0; - hwloc_obj_t hT = NULL; - int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); - for (int i = 0; i < N; ++i) { - KMP_DEBUG_ASSERT(hT); - unsigned idx = hT->os_index; - if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - ++skipped; - } - hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); - } - return skipped; // count number of skipped units -} - -static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { - // check if obj has PUs present in fullMask - hwloc_obj_t hT = NULL; - int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); - for (int i = 0; i < N; ++i) { - KMP_DEBUG_ASSERT(hT); - unsigned idx = hT->os_index; - if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) - return 1; // found PU - hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); - } - return 0; // no PUs found -} -#endif // KMP_USE_HWLOC - -static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { - AddrUnsPair *newAddr; - if (__kmp_hws_requested == 0) - goto _exit; // no topology limiting actions requested, exit -#if KMP_USE_HWLOC - if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { - // Number of subobjects calculated dynamically, this works fine for - // any non-uniform topology. - // L2 cache objects are determined by depth, other objects - by type. 
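A sketch (not part of this patch) of the num/offset windowing the removed KMP_HW_SUBSET walk below applies at every layer: among objects that still have PUs in the full mask, keep those whose 1-based index lies in (offset, offset + num], skip the rest. The counts are made up.

#include <cstdio>

int main() {
  int num = 2, offset = 1; // e.g. a "2c@1" core specifier
  int cores_with_pus = 6;  // cores that still have PUs in the full mask
  for (int n = 1; n <= cores_with_pus; ++n) {
    bool keep = (n > offset) && (n <= num + offset);
    std::printf("core #%d: %s\n", n, keep ? "keep" : "skip");
  }
  return 0;
}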
- hwloc_topology_t tp = __kmp_hwloc_topology; - int nS = 0, nN = 0, nL = 0, nC = 0, - nT = 0; // logical index including skipped - int nCr = 0, nTr = 0; // number of requested units - int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters - hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) - int L2depth, idx; - - // check support of extensions ---------------------------------- - int numa_support = 0, tile_support = 0; - if (__kmp_pu_os_idx) - hT = hwloc_get_pu_obj_by_os_index(tp, - __kmp_pu_os_idx[__kmp_avail_proc - 1]); - else - hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); - if (hT == NULL) { // something's gone wrong - KMP_WARNING(AffHWSubsetUnsupported); - goto _exit; - } - // check NUMA node - hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); - hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); - if (hN != NULL && hN->depth > hS->depth) { - numa_support = 1; // 1 in case socket includes node(s) - } else if (__kmp_hws_node.num > 0) { - // don't support sockets inside NUMA node (no such HW found for testing) - KMP_WARNING(AffHWSubsetUnsupported); - goto _exit; - } - // check L2 cahce, get object by depth because of multiple caches - L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); - hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); - if (hL != NULL && - __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { - tile_support = 1; // no sense to count L2 if it includes single core - } else if (__kmp_hws_tile.num > 0) { - if (__kmp_hws_core.num == 0) { - __kmp_hws_core = __kmp_hws_tile; // replace L2 with core - __kmp_hws_tile.num = 0; - } else { - // L2 and core are both requested, but represent same object - KMP_WARNING(AffHWSubsetInvalid); - goto _exit; - } - } - // end of check of extensions ----------------------------------- - - // fill in unset items, validate settings ----------------------- - if (__kmp_hws_socket.num == 0) - __kmp_hws_socket.num = nPackages; // use all available sockets - if (__kmp_hws_socket.offset >= nPackages) { - KMP_WARNING(AffHWSubsetManySockets); - goto _exit; - } - if (numa_support) { - hN = NULL; - int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, - &hN); // num nodes in socket - if (__kmp_hws_node.num == 0) - __kmp_hws_node.num = NN; // use all available nodes - if (__kmp_hws_node.offset >= NN) { - KMP_WARNING(AffHWSubsetManyNodes); - goto _exit; - } - if (tile_support) { - // get num tiles in node - int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); - if (__kmp_hws_tile.num == 0) { - __kmp_hws_tile.num = NL + 1; - } // use all available tiles, some node may have more tiles, thus +1 - if (__kmp_hws_tile.offset >= NL) { - KMP_WARNING(AffHWSubsetManyTiles); - goto _exit; - } - int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, - &hC); // num cores in tile - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } else { // tile_support - int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, - &hC); // num cores in node - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } // tile_support - } else { // numa_support - if (tile_support) { - // get num tiles in socket - int NL = __kmp_hwloc_count_children_by_depth(tp, hS, 
L2depth, &hL); - if (__kmp_hws_tile.num == 0) - __kmp_hws_tile.num = NL; // use all available tiles - if (__kmp_hws_tile.offset >= NL) { - KMP_WARNING(AffHWSubsetManyTiles); - goto _exit; - } - int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, - &hC); // num cores in tile - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } else { // tile_support - int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, - &hC); // num cores in socket - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } // tile_support - } - if (__kmp_hws_proc.num == 0) - __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs - if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { - KMP_WARNING(AffHWSubsetManyProcs); - goto _exit; - } - // end of validation -------------------------------------------- - - if (pAddr) // pAddr is NULL in case of affinity_none - newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * - __kmp_avail_proc); // max size - // main loop to form HW subset ---------------------------------- - hS = NULL; - int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); - for (int s = 0; s < NP; ++s) { - // Check Socket ----------------------------------------------- - hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); - if (!__kmp_hwloc_obj_has_PUs(tp, hS)) - continue; // skip socket if all PUs are out of fullMask - ++nS; // only count objects those have PUs in affinity mask - if (nS <= __kmp_hws_socket.offset || - nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { - n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket - continue; // move to next socket - } - nCr = 0; // count number of cores per socket - // socket requested, go down the topology tree - // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) - if (numa_support) { - nN = 0; - hN = NULL; - // num nodes in current socket - int NN = - __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); - for (int n = 0; n < NN; ++n) { - // Check NUMA Node ---------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { - hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); - continue; // skip node if all PUs are out of fullMask - } - ++nN; - if (nN <= __kmp_hws_node.offset || - nN > __kmp_hws_node.num + __kmp_hws_node.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node - hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); - continue; // move to next node - } - // node requested, go down the topology tree - if (tile_support) { - nL = 0; - hL = NULL; - int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); - for (int l = 0; l < NL; ++l) { - // Check L2 (tile) ------------------------------------ - if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // skip tile if all PUs are out of fullMask - } - ++nL; - if (nL <= __kmp_hws_tile.offset || - nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { - // skip tile as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // move to next tile - } - // tile requested, go down the topology tree - nC = 0; - hC = NULL; - // num cores in current 
tile - int NC = __kmp_hwloc_count_children_by_type(tp, hL, - HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core --------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - // num procs in current core - int NT = __kmp_hwloc_count_children_by_type(tp, hC, - HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - ++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - } // tiles loop - } else { // tile_support - // no tiles, check cores - nC = 0; - hC = NULL; - // num cores in current node - int NC = - __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core --------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - int NT = - __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - 
++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - } // tiles support - hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); - } // nodes loop - } else { // numa_support - // no NUMA support - if (tile_support) { - nL = 0; - hL = NULL; - // num tiles in current socket - int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); - for (int l = 0; l < NL; ++l) { - // Check L2 (tile) ------------------------------------ - if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // skip tile if all PUs are out of fullMask - } - ++nL; - if (nL <= __kmp_hws_tile.offset || - nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { - // skip tile as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // move to next tile - } - // tile requested, go down the topology tree - nC = 0; - hC = NULL; - // num cores per tile - int NC = - __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core --------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - // num procs per core - int NT = - __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - ++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - } // tiles loop - } else { // tile_support - // no tiles, check cores - nC = 0; - hC = NULL; - // num cores in socket - int NC = - __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core ------------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - 
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - // num procs per core - int NT = - __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - ++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - } // tiles support - } // numa_support - if (nCr > 0) { // found cores? - ++nPkg; // num sockets - if (nCr > nCpP) - nCpP = nCr; // calc max cores per socket - } - } // sockets loop - - // check the subset is valid - KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); - KMP_DEBUG_ASSERT(nPkg > 0); - KMP_DEBUG_ASSERT(nCpP > 0); - KMP_DEBUG_ASSERT(nTpC > 0); - KMP_DEBUG_ASSERT(nCo > 0); - KMP_DEBUG_ASSERT(nPkg <= nPackages); - KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); - KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); - KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); - - nPackages = nPkg; // correct num sockets - nCoresPerPkg = nCpP; // correct num cores per socket - __kmp_nThreadsPerCore = nTpC; // correct num threads per core - __kmp_avail_proc = n_new; // correct num procs - __kmp_ncores = nCo; // correct num cores - // hwloc topology method end - } else -#endif // KMP_USE_HWLOC - { - int n_old = 0, n_new = 0, proc_num = 0; - if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { - KMP_WARNING(AffHWSubsetNoHWLOC); - goto _exit; - } - if (__kmp_hws_socket.num == 0) - __kmp_hws_socket.num = nPackages; // use all available sockets - if (__kmp_hws_die.num == 0) - __kmp_hws_die.num = nDiesPerPkg; // use all available dies - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = nCoresPerPkg; // use all available cores - if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) - __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts - if (!__kmp_affinity_uniform_topology()) { - KMP_WARNING(AffHWSubsetNonUniform); - goto _exit; // don't support non-uniform topology - } - if (depth > 4) { - KMP_WARNING(AffHWSubsetNonThreeLevel); - goto _exit; // don't support not-3-level topology - } - if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { - KMP_WARNING(AffHWSubsetManySockets); - goto _exit; - } - if (depth == 4 && __kmp_hws_die.offset + __kmp_hws_die.num > nDiesPerPkg) { - KMP_WARNING(AffHWSubsetManyDies); - goto _exit; - } - if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - // Form the requested subset - if (pAddr) // pAddr is NULL in case of affinity_none - newAddr = (AddrUnsPair *)__kmp_allocate( - sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_die.num * - 
__kmp_hws_core.num * __kmp_hws_proc.num); - for (int i = 0; i < nPackages; ++i) { - if (i < __kmp_hws_socket.offset || - i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { - // skip not-requested socket - n_old += nDiesPerPkg * nCoresPerPkg * __kmp_nThreadsPerCore; - if (__kmp_pu_os_idx != NULL) { - // walk through skipped socket - for (int l = 0; l < nDiesPerPkg; ++l) { - for (int j = 0; j < nCoresPerPkg; ++j) { - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; - } - } - } - } - } else { - // walk through requested socket - for (int l = 0; l < nDiesPerPkg; ++l) { - // skip unwanted die - if (l < __kmp_hws_die.offset || - l >= __kmp_hws_die.offset + __kmp_hws_die.num) { - n_old += nCoresPerPkg; - if (__kmp_pu_os_idx != NULL) { - for (int k = 0; k < nCoresPerPkg; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; - } - } - } else { - for (int j = 0; j < nCoresPerPkg; ++j) { - if (j < __kmp_hws_core.offset || - j >= __kmp_hws_core.offset + - __kmp_hws_core.num) { // skip not-requested core - n_old += __kmp_nThreadsPerCore; - if (__kmp_pu_os_idx != NULL) { - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], - __kmp_affin_fullMask); - ++proc_num; - } - } - } else { - // walk through requested core - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - if (k < __kmp_hws_proc.num) { - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - n_new++; - } else { - if (__kmp_pu_os_idx != NULL) - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], - __kmp_affin_fullMask); - } - n_old++; - ++proc_num; - } - } - } - } - } - } - } - KMP_DEBUG_ASSERT(n_old == nPackages * nDiesPerPkg * nCoresPerPkg * - __kmp_nThreadsPerCore); - KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_die.num * - __kmp_hws_core.num * __kmp_hws_proc.num); - nPackages = __kmp_hws_socket.num; // correct nPackages - nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg - nDiesPerPkg = __kmp_hws_die.num; // correct nDiesPerPkg - __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore - __kmp_avail_proc = n_new; // correct avail_proc - __kmp_ncores = - nPackages * nDiesPerPkg * __kmp_hws_core.num; // correct ncores - } // non-hwloc topology method - if (pAddr) { - __kmp_free(*pAddr); - *pAddr = newAddr; // replace old topology with new one - } - if (__kmp_affinity_verbose) { - KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - __kmp_str_buf_print(&buf, "%d", nPackages); - KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - __kmp_str_buf_free(&buf); - } -_exit: - if (__kmp_pu_os_idx != NULL) { - __kmp_free(__kmp_pu_os_idx); - __kmp_pu_os_idx = NULL; - } -} - // This function figures out the deepest level at which there is at least one // cluster/core with more than one processing unit bound to it. 
-static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, - int nprocs, int bottom_level) { +static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { int core_level = 0; for (int i = 0; i < nprocs; i++) { + const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); for (int j = bottom_level; j > 0; j--) { - if (address2os[i].first.labels[j] > 0) { + if (hw_thread.ids[j] > 0) { if (core_level < (j - 1)) { core_level = j - 1; } @@ -4145,83 +3397,42 @@ } // This function counts number of clusters/cores at given level. -static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, - int nprocs, int bottom_level, +static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, int core_level) { - int ncores = 0; - int i, j; - - j = bottom_level; - for (i = 0; i < nprocs; i++) { - for (j = bottom_level; j > core_level; j--) { - if ((i + 1) < nprocs) { - if (address2os[i + 1].first.labels[j] > 0) { + return __kmp_topology->get_count(core_level); +} +// This function finds to which cluster/core given processing unit is bound. +static int __kmp_affinity_find_core(int proc, int bottom_level, + int core_level) { + int core = 0; + KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); + for (int i = 0; i <= proc; ++i) { + if (i + 1 <= proc) { + for (int j = 0; j <= core_level; ++j) { + if (__kmp_topology->at(i + 1).sub_ids[j] != + __kmp_topology->at(i).sub_ids[j]) { + core++; break; } } } - if (j == core_level) { - ncores++; - } - } - if (j > core_level) { - // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one - // core. May occur when called from __kmp_affinity_find_core(). - ncores++; } - return ncores; -} - -// This function finds to which cluster/core given processing unit is bound. -static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, - int bottom_level, int core_level) { - return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, - core_level) - - 1; + return core; } // This function finds maximal number of processing units bound to a // cluster/core at given level. 
-static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, - int nprocs, int bottom_level, +static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, int core_level) { - int maxprocpercore = 0; - - if (core_level < bottom_level) { - for (int i = 0; i < nprocs; i++) { - int percore = address2os[i].first.labels[core_level + 1] + 1; - - if (percore > maxprocpercore) { - maxprocpercore = percore; - } - } - } else { - maxprocpercore = 1; - } - return maxprocpercore; + if (core_level >= bottom_level) + return 1; + int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); + return __kmp_topology->calculate_ratio(thread_level, core_level); } -static AddrUnsPair *address2os = NULL; static int *procarr = NULL; static int __kmp_aff_depth = 0; -#if KMP_USE_HIER_SCHED -#define KMP_EXIT_AFF_NONE \ - KMP_ASSERT(__kmp_affinity_type == affinity_none); \ - KMP_ASSERT(address2os == NULL); \ - __kmp_apply_thread_places(NULL, 0); \ - __kmp_create_affinity_none_places(); \ - __kmp_dispatch_set_hierarchy_values(); \ - return; -#else -#define KMP_EXIT_AFF_NONE \ - KMP_ASSERT(__kmp_affinity_type == affinity_none); \ - KMP_ASSERT(address2os == NULL); \ - __kmp_apply_thread_places(NULL, 0); \ - __kmp_create_affinity_none_places(); \ - return; -#endif - // Create a one element mask array (set of places) which only contains the // initial process's affinity mask static void __kmp_create_affinity_none_places() { @@ -4233,31 +3444,6 @@ KMP_CPU_COPY(dest, __kmp_affin_fullMask); } -static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { - const Address *aa = &(((const AddrUnsPair *)a)->first); - const Address *bb = &(((const AddrUnsPair *)b)->first); - unsigned depth = aa->depth; - unsigned i; - KMP_DEBUG_ASSERT(depth == bb->depth); - KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); - KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); - for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { - int j = depth - i - 1; - if (aa->childNums[j] < bb->childNums[j]) - return -1; - if (aa->childNums[j] > bb->childNums[j]) - return 1; - } - for (; i < depth; i++) { - int j = i - __kmp_affinity_compact; - if (aa->childNums[j] < bb->childNums[j]) - return -1; - if (aa->childNums[j] > bb->childNums[j]) - return 1; - } - return 0; -} - static void __kmp_aux_affinity_initialize(void) { if (__kmp_affinity_masks != NULL) { KMP_ASSERT(__kmp_affin_fullMask != NULL); @@ -4317,14 +3503,6 @@ } } - if (__kmp_affinity_gran == affinity_gran_tile && - // check if user's request is valid - __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { - KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); - __kmp_affinity_gran = affinity_gran_package; - } - - int depth = -1; kmp_i18n_id_t msg_id = kmp_i18n_null; // For backward compatibility, setting KMP_CPUINFO_FILE => @@ -4334,22 +3512,18 @@ __kmp_affinity_top_method = affinity_top_method_cpuinfo; } + bool success = false; if (__kmp_affinity_top_method == affinity_top_method_all) { - // In the default code path, errors are not fatal - we just try using - // another method. We only emit a warning message if affinity is on, or the - // verbose flag is set, and the nowarnings flag was not set. - const char *file_name = NULL; - int line = 0; +// In the default code path, errors are not fatal - we just try using +// another method. We only emit a warning message if affinity is on, or the +// verbose flag is set, and the nowarnings flag was not set. 
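+// The detection methods below are tried in order: hwloc (when built with it), +// x2APIC ids, legacy APIC ids, /proc/cpuinfo, Windows processor groups, and +// finally a flat OS-proc map.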
#if KMP_USE_HWLOC - if (depth < 0 && + if (!success && __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); - } if (!__kmp_hwloc_error) { - depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + success = __kmp_affinity_create_hwloc_map(&msg_id); + if (!success && __kmp_affinity_verbose) { + KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); } } else if (__kmp_affinity_verbose) { KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); @@ -4358,166 +3532,85 @@ #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - - if (depth < 0) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); - } - - file_name = NULL; - depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + if (!success) { + success = __kmp_affinity_create_x2apicid_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } - - if (depth < 0) { - if (__kmp_affinity_verbose) { - if (msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", - __kmp_i18n_catgets(msg_id), - KMP_I18N_STR(DecodingLegacyAPIC)); - } else { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", - KMP_I18N_STR(DecodingLegacyAPIC)); - } - } - - file_name = NULL; - depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } + } + if (!success) { + success = __kmp_affinity_create_apicid_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #if KMP_OS_LINUX - - if (depth < 0) { - if (__kmp_affinity_verbose) { - if (msg_id != kmp_i18n_null) { - KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", - __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); - } else { - KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); - } - } - - kmp_safe_raii_file_t f("/proc/cpuinfo", "r"); - depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + if (!success) { + int line = 0; + success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_OS_LINUX */ #if KMP_GROUP_AFFINITY - - if ((depth < 0) && (__kmp_num_proc_groups > 1)) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); + if (!success && (__kmp_num_proc_groups > 1)) { + success = __kmp_affinity_create_proc_group_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } - - depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); - KMP_ASSERT(depth != 0); } - #endif /* KMP_GROUP_AFFINITY */ - if (depth < 0) { - if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { - if (file_name == NULL) { - KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); - } else if (line == 0) { - KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); - } else { - KMP_INFORM(UsingFlatOSFileLine, file_name, line, - __kmp_i18n_catgets(msg_id)); - } - } - // FIXME - print msg if msg_id = kmp_i18n_null ??? 
- - file_name = ""; - depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + if (!success) { + success = __kmp_affinity_create_flat_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } - KMP_ASSERT(depth > 0); - KMP_ASSERT(address2os != NULL); + KMP_ASSERT(success); } } +// If the user has specified that a particular topology discovery method is to be +// used, then we abort if that method fails. The exception is group affinity, +// which might have been implicitly set. #if KMP_USE_HWLOC else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); - } - depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + success = __kmp_affinity_create_hwloc_map(&msg_id); + if (!success) { + KMP_ASSERT(msg_id != kmp_i18n_null); + KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } #endif // KMP_USE_HWLOC - // If the user has specified that a particular topology discovery method is to - // be used, then we abort if that method fails. The exception is group - // affinity, which might have been implicitly set. - #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); - } - - depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } - if (depth < 0) { + success = __kmp_affinity_create_x2apicid_map(&msg_id); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); - } - - depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } - if (depth < 0) { + success = __kmp_affinity_create_apicid_map(&msg_id); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { - const char *filename; - const char *env_var = nullptr; - if (__kmp_cpuinfo_file != NULL) { - filename = __kmp_cpuinfo_file; - env_var = "KMP_CPUINFO_FILE"; - } else { - filename = "/proc/cpuinfo"; - } - - if (__kmp_affinity_verbose) { - KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); - } - - kmp_safe_raii_file_t f(filename, "r", env_var); int line = 0; - depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); - if (depth < 0) { + success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); + const char *filename = __kmp_cpuinfo_get_filename(); if (line > 0) { KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); @@ -4525,84 +3618,80 @@ KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); } } - if (__kmp_affinity_type == affinity_none) { - KMP_ASSERT(depth == 0); - KMP_EXIT_AFF_NONE; - } } #if KMP_GROUP_AFFINITY - else if (__kmp_affinity_top_method == 
affinity_top_method_group) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); - } - - depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); - KMP_ASSERT(depth != 0); - if (depth < 0) { + success = __kmp_affinity_create_proc_group_map(&msg_id); + KMP_ASSERT(success); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_GROUP_AFFINITY */ else if (__kmp_affinity_top_method == affinity_top_method_flat) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); - } - - depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } + success = __kmp_affinity_create_flat_map(&msg_id); // should not fail - KMP_ASSERT(depth > 0); - KMP_ASSERT(address2os != NULL); + KMP_ASSERT(success); } -#if KMP_USE_HIER_SCHED - __kmp_dispatch_set_hierarchy_values(); -#endif - - if (address2os == NULL) { + // Early exit if topology could not be created + if (!__kmp_topology) { if (KMP_AFFINITY_CAPABLE() && (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { KMP_WARNING(ErrorInitializeAffinity); } + if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && + __kmp_ncores > 0) { + __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); + __kmp_topology->canonicalize(nPackages, nCoresPerPkg, + __kmp_nThreadsPerCore, __kmp_ncores); + if (__kmp_affinity_verbose) { + __kmp_topology->print("KMP_AFFINITY"); + } + } __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); +#if KMP_USE_HIER_SCHED + __kmp_dispatch_set_hierarchy_values(); +#endif KMP_AFFINITY_DISABLE(); return; } - if (__kmp_affinity_gran == affinity_gran_tile -#if KMP_USE_HWLOC - && __kmp_tile_depth == 0 + // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and + // initialize other data structures which depend on the topology + __kmp_topology->canonicalize(); + if (__kmp_affinity_verbose) + __kmp_topology->print("KMP_AFFINITY"); + bool filtered = __kmp_topology->filter_hw_subset(); + if (filtered && __kmp_affinity_verbose) + __kmp_topology->print("KMP_HW_SUBSET"); + machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); + KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); + // If KMP_AFFINITY=none, then only create the single "none" place + // which is the process's initial affinity mask or the number of + // hardware threads depending on respect,norespect + if (__kmp_affinity_type == affinity_none) { + __kmp_create_affinity_none_places(); +#if KMP_USE_HIER_SCHED + __kmp_dispatch_set_hierarchy_values(); #endif - ) { - // tiles requested but not detected, warn user on this - KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); + return; } - - __kmp_apply_thread_places(&address2os, depth); + int depth = __kmp_topology->get_depth(); // Create the table of masks, indexed by thread Id. unsigned maxIndex; unsigned numUnique; - kmp_affin_mask_t *osId2Mask = - __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); + kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique); if (__kmp_affinity_gran_levels == 0) { KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); } - // Set the childNums vector in all Address objects. This must be done before - // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into - // account the setting of __kmp_affinity_compact. 
- __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); - switch (__kmp_affinity_type) { case affinity_explicit: @@ -4627,18 +3716,17 @@ } break; - // The other affinity types rely on sorting the Addresses according to some - // permutation of the machine topology tree. Set __kmp_affinity_compact and - // __kmp_affinity_offset appropriately, then jump to a common code fragment - // to do the sort and create the array of affinity masks. - + // The other affinity types rely on sorting the hardware threads according to + // some permutation of the machine topology tree. Set __kmp_affinity_compact + // and __kmp_affinity_offset appropriately, then jump to a common code + // fragment to do the sort and create the array of affinity masks. case affinity_logical: __kmp_affinity_compact = 0; if (__kmp_affinity_offset) { __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; } - goto sortAddresses; + goto sortTopology; case affinity_physical: if (__kmp_nThreadsPerCore > 1) { @@ -4653,7 +3741,7 @@ __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; } - goto sortAddresses; + goto sortTopology; case affinity_scatter: if (__kmp_affinity_compact >= depth) { @@ -4661,13 +3749,13 @@ } else { __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; } - goto sortAddresses; + goto sortTopology; case affinity_compact: if (__kmp_affinity_compact >= depth) { __kmp_affinity_compact = depth - 1; } - goto sortAddresses; + goto sortTopology; case affinity_balanced: if (depth <= 1) { @@ -4677,16 +3765,16 @@ __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); return; - } else if (!__kmp_affinity_uniform_topology()) { + } else if (!__kmp_topology->is_uniform()) { // Save the depth for further usage __kmp_aff_depth = depth; - int core_level = __kmp_affinity_find_core_level( - address2os, __kmp_avail_proc, depth - 1); - int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, - depth - 1, core_level); + int core_level = + __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); + int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, + core_level); int maxprocpercore = __kmp_affinity_max_proc_per_core( - address2os, __kmp_avail_proc, depth - 1, core_level); + __kmp_avail_proc, depth - 1, core_level); int nproc = ncores * maxprocpercore; if ((nproc < 2) || (nproc < __kmp_avail_proc)) { @@ -4705,9 +3793,8 @@ int lastcore = -1; int inlastcore = 0; for (int i = 0; i < __kmp_avail_proc; i++) { - int proc = address2os[i].second; - int core = - __kmp_affinity_find_core(address2os, i, depth - 1, core_level); + int proc = __kmp_topology->at(i).os_id; + int core = __kmp_affinity_find_core(i, depth - 1, core_level); if (core == lastcore) { inlastcore++; @@ -4723,7 +3810,7 @@ __kmp_affinity_compact = depth - 1; } - sortAddresses: + sortTopology: // Allocate the gtid->affinity mask table. if (__kmp_affinity_dups) { __kmp_affinity_num_masks = __kmp_avail_proc; @@ -4739,18 +3826,19 @@ KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); - // Sort the address2os table according to the current setting of + // Sort the topology table according to the current setting of // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
- qsort(address2os, __kmp_avail_proc, sizeof(*address2os), - __kmp_affinity_cmp_Address_child_num); + __kmp_topology->sort_compact(); { int i; unsigned j; - for (i = 0, j = 0; i < __kmp_avail_proc; i++) { - if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { + int num_hw_threads = __kmp_topology->get_num_hw_threads(); + for (i = 0, j = 0; i < num_hw_threads; i++) { + if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) { continue; } - unsigned osId = address2os[i].second; + int osId = __kmp_topology->at(i).os_id; + kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); KMP_ASSERT(KMP_CPU_ISSET(osId, src)); @@ -4761,6 +3849,8 @@ } KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); } + // Sort the topology back using ids + __kmp_topology->sort_ids(); break; default: @@ -4768,9 +3858,7 @@ } KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); - machine_hierarchy.init(address2os, __kmp_avail_proc); } -#undef KMP_EXIT_AFF_NONE void __kmp_affinity_initialize(void) { // Much of the code above was written assuming that if a machine was not @@ -4810,10 +3898,6 @@ __kmp_free(__kmp_affinity_proclist); __kmp_affinity_proclist = NULL; } - if (address2os != NULL) { - __kmp_free(address2os); - address2os = NULL; - } if (procarr != NULL) { __kmp_free(procarr); procarr = NULL; @@ -4824,6 +3908,14 @@ __kmp_hwloc_topology = NULL; } #endif + if (__kmp_hw_subset) { + kmp_hw_subset_t::deallocate(__kmp_hw_subset); + __kmp_hw_subset = nullptr; + } + if (__kmp_topology) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + } KMPAffinity::destroy_api(); } @@ -5201,15 +4293,14 @@ int tid = th->th.th_info.ds.ds_tid; switch (__kmp_affinity_gran) { - case affinity_gran_fine: - case affinity_gran_thread: + case KMP_HW_THREAD: break; - case affinity_gran_core: + case KMP_HW_CORE: if (__kmp_nThreadsPerCore > 1) { fine_gran = false; } break; - case affinity_gran_package: + case KMP_HW_SOCKET: if (nCoresPerPkg > 1) { fine_gran = false; } @@ -5218,7 +4309,7 @@ fine_gran = false; } - if (__kmp_affinity_uniform_topology()) { + if (__kmp_topology->is_uniform()) { int coreID; int threadID; // Number of hyper threads per core in HT machine @@ -5242,7 +4333,6 @@ coreID = (tid - big_cores) / chunk; threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; } - KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); @@ -5250,12 +4340,13 @@ KMP_CPU_ZERO(mask); if (fine_gran) { - int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; + int osID = + __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; KMP_CPU_SET(osID, mask); } else { for (int i = 0; i < __kmp_nth_per_core; i++) { int osID; - osID = address2os[coreID * __kmp_nth_per_core + i].second; + osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; KMP_CPU_SET(osID, mask); } } @@ -5271,26 +4362,26 @@ kmp_affin_mask_t *mask = th->th.th_affin_mask; KMP_CPU_ZERO(mask); - int core_level = __kmp_affinity_find_core_level( - address2os, __kmp_avail_proc, __kmp_aff_depth - 1); - int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, + int core_level = + __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); + int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, __kmp_aff_depth - 1, core_level); int nth_per_core = __kmp_affinity_max_proc_per_core( - address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); + __kmp_avail_proc, __kmp_aff_depth - 1, core_level); // 
For performance gain consider the special case nthreads == // __kmp_avail_proc if (nthreads == __kmp_avail_proc) { if (fine_gran) { - int osID = address2os[tid].second; + int osID = __kmp_topology->at(tid).os_id; KMP_CPU_SET(osID, mask); } else { - int core = __kmp_affinity_find_core(address2os, tid, - __kmp_aff_depth - 1, core_level); + int core = + __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); for (int i = 0; i < __kmp_avail_proc; i++) { - int osID = address2os[i].second; - if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, - core_level) == core) { + int osID = __kmp_topology->at(i).os_id; + if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == + core) { KMP_CPU_SET(osID, mask); } } diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -247,8 +247,6 @@ #if KMP_USE_HWLOC int __kmp_hwloc_error = FALSE; hwloc_topology_t __kmp_hwloc_topology = NULL; -int __kmp_numa_detected = FALSE; -int __kmp_tile_depth = 0; #endif #if KMP_OS_WINDOWS @@ -263,7 +261,7 @@ size_t __kmp_affin_mask_size = 0; enum affinity_type __kmp_affinity_type = affinity_default; -enum affinity_gran __kmp_affinity_gran = affinity_gran_default; +kmp_hw_t __kmp_affinity_gran = KMP_HW_UNKNOWN; int __kmp_affinity_gran_levels = -1; int __kmp_affinity_dups = TRUE; enum affinity_top_method __kmp_affinity_top_method = @@ -286,15 +284,6 @@ int __kmp_display_affinity = FALSE; char *__kmp_affinity_format = NULL; -kmp_hws_item_t __kmp_hws_socket = {0, 0}; -kmp_hws_item_t __kmp_hws_die = {0, 0}; -kmp_hws_item_t __kmp_hws_node = {0, 0}; -kmp_hws_item_t __kmp_hws_tile = {0, 0}; -kmp_hws_item_t __kmp_hws_core = {0, 0}; -kmp_hws_item_t __kmp_hws_proc = {0, 0}; -int __kmp_hws_requested = 0; -int __kmp_hws_abs_flag = 0; // absolute or per-item number requested - kmp_int32 __kmp_default_device = 0; kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -2069,9 +2069,9 @@ enum affinity_type *out_type, char **out_proclist, int *out_verbose, int *out_warn, int *out_respect, - enum affinity_gran *out_gran, - int *out_gran_levels, int *out_dups, - int *out_compact, int *out_offset) { + kmp_hw_t *out_gran, int *out_gran_levels, + int *out_dups, int *out_compact, + int *out_offset) { char *buffer = NULL; // Copy of env var value. char *buf = NULL; // Buffer for strtok_r() function. char *next = NULL; // end of token / start of next. 
@@ -2087,6 +2087,7 @@ int respect = 0; int gran = 0; int dups = 0; + bool set = false; KMP_ASSERT(value != NULL); @@ -2232,45 +2233,51 @@ SKIP_WS(next); buf = next; - if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_fine, -1); - buf = next; - } else if (__kmp_match_str("thread", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_thread, -1); - buf = next; - } else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_core, -1); - buf = next; -#if KMP_USE_HWLOC - } else if (__kmp_match_str("tile", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_tile, -1); - buf = next; -#endif - } else if (__kmp_match_str("die", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_die, -1); - buf = next; - } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_package, -1); - buf = next; - } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_node, -1); - buf = next; + + // Try any hardware topology type for granularity + KMP_FOREACH_HW_TYPE(type) { + const char *name = __kmp_hw_get_keyword(type); + if (__kmp_match_str(name, buf, CCAST(const char **, &next))) { + set_gran(type, -1); + buf = next; + set = true; + break; + } + } + if (!set) { + // Support older names for different granularity layers + if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_THREAD, -1); + buf = next; + set = true; + } else if (__kmp_match_str("package", buf, + CCAST(const char **, &next))) { + set_gran(KMP_HW_SOCKET, -1); + buf = next; + set = true; + } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_NUMA, -1); + buf = next; + set = true; #if KMP_GROUP_AFFINITY - } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_group, -1); - buf = next; + } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_PROC_GROUP, -1); + buf = next; + set = true; #endif /* KMP_GROUP AFFINITY */ - } else if ((*buf >= '0') && (*buf <= '9')) { - int n; - next = buf; - SKIP_DIGITS(next); - n = __kmp_str_to_int(buf, *next); - KMP_ASSERT(n >= 0); - buf = next; - set_gran(affinity_gran_default, n); - } else { - EMIT_WARN(TRUE, (AffInvalidParam, name, start)); - continue; + } else if ((*buf >= '0') && (*buf <= '9')) { + int n; + next = buf; + SKIP_DIGITS(next); + n = __kmp_str_to_int(buf, *next); + KMP_ASSERT(n >= 0); + buf = next; + set_gran(KMP_HW_UNKNOWN, n); + set = true; + } else { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } } } else if (__kmp_match_str("proclist", buf, CCAST(const char **, &next))) { char *temp_proclist; @@ -2377,20 +2384,20 @@ *out_offset = number[1]; } - if (__kmp_affinity_gran == affinity_gran_default) { + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { #if KMP_MIC_SUPPORTED if (__kmp_mic_type != non_mic) { if (__kmp_affinity_verbose || __kmp_affinity_warnings) { KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "fine"); } - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } else #endif { if (__kmp_affinity_verbose || __kmp_affinity_warnings) { KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "core"); } - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; } } } break; @@ -2475,31 +2482,8 @@ } else { __kmp_str_buf_print(buffer, "%s,", "norespect"); } - switch (__kmp_affinity_gran) { - case affinity_gran_default: - 
__kmp_str_buf_print(buffer, "%s", "granularity=default,"); - break; - case affinity_gran_fine: - __kmp_str_buf_print(buffer, "%s", "granularity=fine,"); - break; - case affinity_gran_thread: - __kmp_str_buf_print(buffer, "%s", "granularity=thread,"); - break; - case affinity_gran_core: - __kmp_str_buf_print(buffer, "%s", "granularity=core,"); - break; - case affinity_gran_package: - __kmp_str_buf_print(buffer, "%s", "granularity=package,"); - break; - case affinity_gran_node: - __kmp_str_buf_print(buffer, "%s", "granularity=node,"); - break; -#if KMP_GROUP_AFFINITY - case affinity_gran_group: - __kmp_str_buf_print(buffer, "%s", "granularity=group,"); - break; -#endif /* KMP_GROUP_AFFINITY */ - } + __kmp_str_buf_print(buffer, "granularity=%s,", + __kmp_hw_get_keyword(__kmp_affinity_gran, false)); } if (!KMP_AFFINITY_CAPABLE()) { __kmp_str_buf_print(buffer, "%s", "disabled"); @@ -2571,7 +2555,7 @@ // GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=... __kmp_affinity_proclist = temp_proclist; __kmp_affinity_type = affinity_explicit; - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; } else { KMP_WARNING(AffSyntaxError, name); @@ -2856,10 +2840,20 @@ static void __kmp_stg_parse_places(char const *name, char const *value, void *data) { + struct kmp_place_t { + const char *name; + kmp_hw_t type; + }; int count; + bool set = false; const char *scan = value; const char *next = scan; const char *kind = "\"threads\""; + kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD}, + {"cores", KMP_HW_CORE}, + {"numa_domains", KMP_HW_NUMA}, + {"ll_caches", KMP_HW_LLC}, + {"sockets", KMP_HW_SOCKET}}; kmp_setting_t **rivals = (kmp_setting_t **)data; int rc; @@ -2868,52 +2862,47 @@ return; } - if (__kmp_match_str("threads", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_thread; - __kmp_affinity_dups = FALSE; - kind = "\"threads\""; - } else if (__kmp_match_str("cores", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_core; - __kmp_affinity_dups = FALSE; - kind = "\"cores\""; -#if KMP_USE_HWLOC - } else if (__kmp_match_str("tiles", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_tile; - __kmp_affinity_dups = FALSE; - kind = "\"tiles\""; -#endif - } else if (__kmp_match_str("dice", scan, &next) || - __kmp_match_str("dies", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_die; - __kmp_affinity_dups = FALSE; - kind = "\"dice\""; - } else if (__kmp_match_str("sockets", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_package; - __kmp_affinity_dups = FALSE; - kind = "\"sockets\""; - } else { + // Standard choices + for (size_t i = 0; i < sizeof(std_places) / sizeof(std_places[0]); ++i) { + const kmp_place_t &place = std_places[i]; + if (__kmp_match_str(place.name, scan, &next)) { + scan = next; + __kmp_affinity_type = affinity_compact; + __kmp_affinity_gran = place.type; + __kmp_affinity_dups = FALSE; + set = true; + break; + } + } + // Implementation choices for OMP_PLACES based on internal types + if (!set) { + KMP_FOREACH_HW_TYPE(type) { + const char *name = __kmp_hw_get_keyword(type, true); + if (__kmp_match_str("unknowns", scan, &next)) + continue; + if (__kmp_match_str(name, scan, &next)) { + scan = next; + 
__kmp_affinity_type = affinity_compact; + __kmp_affinity_gran = type; + __kmp_affinity_dups = FALSE; + set = true; + break; + } + } + } + if (!set) { if (__kmp_affinity_proclist != NULL) { KMP_INTERNAL_FREE((void *)__kmp_affinity_proclist); __kmp_affinity_proclist = NULL; } if (__kmp_parse_place_list(name, value, &__kmp_affinity_proclist)) { __kmp_affinity_type = affinity_explicit; - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; __kmp_affinity_dups = FALSE; } else { // Syntax error fallback __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; __kmp_affinity_dups = FALSE; } if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { @@ -2921,6 +2910,9 @@ } return; } + if (__kmp_affinity_gran != KMP_HW_UNKNOWN) { + kind = __kmp_hw_get_keyword(__kmp_affinity_gran); + } if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; @@ -2985,31 +2977,12 @@ } else { num = 0; } - if (__kmp_affinity_gran == affinity_gran_thread) { - if (num > 0) { - __kmp_str_buf_print(buffer, "='threads(%d)'\n", num); - } else { - __kmp_str_buf_print(buffer, "='threads'\n"); - } - } else if (__kmp_affinity_gran == affinity_gran_core) { + if (__kmp_affinity_gran != KMP_HW_UNKNOWN) { + const char *name = __kmp_hw_get_keyword(__kmp_affinity_gran, true); if (num > 0) { - __kmp_str_buf_print(buffer, "='cores(%d)' \n", num); + __kmp_str_buf_print(buffer, "='%s(%d)'\n", name, num); } else { - __kmp_str_buf_print(buffer, "='cores'\n"); - } -#if KMP_USE_HWLOC - } else if (__kmp_affinity_gran == affinity_gran_tile) { - if (num > 0) { - __kmp_str_buf_print(buffer, "='tiles(%d)' \n", num); - } else { - __kmp_str_buf_print(buffer, "='tiles'\n"); - } -#endif - } else if (__kmp_affinity_gran == affinity_gran_package) { - if (num > 0) { - __kmp_str_buf_print(buffer, "='sockets(%d)'\n", num); - } else { - __kmp_str_buf_print(buffer, "='sockets'\n"); + __kmp_str_buf_print(buffer, "='%s'\n", name); } } else { __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); } @@ -3118,8 +3091,12 @@ break; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case affinity_top_method_x2apicid_1f: + value = "x2APIC id leaf 0x1f"; + break; + case affinity_top_method_x2apicid: - value = "x2APIC id"; + value = "x2APIC id leaf 0xb"; break; case affinity_top_method_apicid: @@ -4727,12 +4704,92 @@ // ----------------------------------------------------------------------------- // KMP_HW_SUBSET (was KMP_PLACE_THREADS) +// 2s16c,2t => 2S16C,2T => 2S16C \0 2T + +// Return KMP_HW_SUBSET preferred hardware type in case a token is ambiguously +// short. The original KMP_HW_SUBSET environment variable had single letters: +// s, c, t for sockets, cores, threads respectively. 
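+// For example, a bare "T" token is ambiguous between tile and thread; the tie +// is broken in favor of thread to match the original single-letter syntax.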
+static kmp_hw_t __kmp_hw_subset_break_tie(const kmp_hw_t *possible, + size_t num_possible) { + for (size_t i = 0; i < num_possible; ++i) { + if (possible[i] == KMP_HW_THREAD) + return KMP_HW_THREAD; + else if (possible[i] == KMP_HW_CORE) + return KMP_HW_CORE; + else if (possible[i] == KMP_HW_SOCKET) + return KMP_HW_SOCKET; + } + return KMP_HW_UNKNOWN; +} -// The longest observable sequence of items is -// Socket-Node-Tile-Core-Thread -// So, let's limit to 5 levels for now +// Return hardware type from string or HW_UNKNOWN if string cannot be parsed +// This algorithm is very forgiving to the user in that, the instant it can +// reduce the search space to one, it assumes that is the topology level the +// user wanted, even if it is misspelled later in the token. +static kmp_hw_t __kmp_stg_parse_hw_subset_name(char const *token) { + size_t index, num_possible, token_length; + kmp_hw_t possible[KMP_HW_LAST]; + const char *end; + + // Find the end of the hardware token string + end = token; + token_length = 0; + while (isalnum(*end) || *end == '_') { + token_length++; + end++; + } + + // Set the possibilities to all hardware types + num_possible = 0; + KMP_FOREACH_HW_TYPE(type) { possible[num_possible++] = type; } + + // Eliminate hardware types by comparing the front of the token + // with hardware names + // In most cases, the first letter in the token will indicate exactly + // which hardware type is parsed, e.g., 'C' = Core + index = 0; + while (num_possible > 1 && index < token_length) { + size_t n = num_possible; + char token_char = (char)toupper(token[index]); + for (size_t i = 0; i < n; ++i) { + const char *s; + kmp_hw_t type = possible[i]; + s = __kmp_hw_get_keyword(type, false); + if (index < KMP_STRLEN(s)) { + char c = (char)toupper(s[index]); + // Mark hardware types for removal when the characters do not match + if (c != token_char) { + possible[i] = KMP_HW_UNKNOWN; + num_possible--; + } + } + } + // Remove hardware types that this token cannot be + size_t start = 0; + for (size_t i = 0; i < n; ++i) { + if (possible[i] != KMP_HW_UNKNOWN) { + kmp_hw_t temp = possible[i]; + possible[i] = possible[start]; + possible[start] = temp; + start++; + } + } + KMP_ASSERT(start == num_possible); + index++; + } + + // Attempt to break a tie if user has very short token + // (e.g., is 'T' tile or thread?) + if (num_possible > 1) + return __kmp_hw_subset_break_tie(possible, num_possible); + if (num_possible == 1) + return possible[0]; + return KMP_HW_UNKNOWN; +} + +// The longest observable sequence of items can only be HW_LAST length // The input string is usually short enough, let's use 512 limit for now -#define MAX_T_LEVEL 5 +#define MAX_T_LEVEL KMP_HW_LAST #define MAX_STR_LEN 512 static void __kmp_stg_parse_hw_subset(char const *name, char const *value, void *data) { @@ -4751,12 +4808,13 @@ char input[MAX_STR_LEN]; size_t len = 0, mlen = MAX_STR_LEN; int level = 0; - // Canonize the string (remove spaces, unify delimiters, etc.) + bool absolute = false; + // Canonicalize the string (remove spaces, unify delimiters, etc.) 
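+  // e.g., " 1s, 5c @3, 2t " is canonicalized to "1S,5C@3,2T"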
char *pos = CCAST(char *, value); while (*pos && mlen) { if (*pos != ' ') { // skip spaces if (len == 0 && *pos == ':') { - __kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it + absolute = true; } else { input[len] = (char)(toupper(*pos)); if (input[len] == 'X') @@ -4769,10 +4827,10 @@ mlen--; pos++; } - if (len == 0 || mlen == 0) + if (len == 0 || mlen == 0) { goto err; // contents is either empty or too long + } input[len] = '\0'; - __kmp_hws_requested = 1; // mark that subset requested // Split by delimiter pos = input; components[level++] = pos; @@ -4782,145 +4840,68 @@ *pos = '\0'; // modify input and avoid more copying components[level++] = ++pos; // expect something after "," } + + __kmp_hw_subset = kmp_hw_subset_t::allocate(); + if (absolute) + __kmp_hw_subset->set_absolute(); + // Check each component for (int i = 0; i < level; ++i) { int offset = 0; int num = atoi(components[i]); // each component should start with a number + if (num <= 0) { + goto err; // only positive integers are valid for count + } if ((pos = strchr(components[i], '@'))) { offset = atoi(pos + 1); // save offset *pos = '\0'; // cut the offset from the component } pos = components[i] + strspn(components[i], digits); - if (pos == components[i]) + if (pos == components[i]) { goto err; + } // detect the component type - switch (*pos) { - case 'S': // Socket - if (__kmp_hws_socket.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_socket.num = num; - __kmp_hws_socket.offset = offset; - break; - case 'N': // NUMA Node - if (__kmp_hws_node.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_node.num = num; - __kmp_hws_node.offset = offset; - break; - case 'D': // Die - if (__kmp_hws_die.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_die.num = num; - __kmp_hws_die.offset = offset; - break; - case 'L': // Cache - if (*(pos + 1) == '2') { // L2 - Tile - if (__kmp_hws_tile.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_tile.num = num; - __kmp_hws_tile.offset = offset; - } else if (*(pos + 1) == '3') { // L3 - Socket - if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_socket.num = num; - __kmp_hws_socket.offset = offset; - } else if (*(pos + 1) == '1') { // L1 - Core - if (__kmp_hws_core.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_core.num = num; - __kmp_hws_core.offset = offset; - } - break; - case 'C': // Core (or Cache?) 
- if (*(pos + 1) != 'A') { - if (__kmp_hws_core.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_core.num = num; - __kmp_hws_core.offset = offset; - } else { // Cache - char *d = pos + strcspn(pos, digits); // find digit - if (*d == '2') { // L2 - Tile - if (__kmp_hws_tile.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_tile.num = num; - __kmp_hws_tile.offset = offset; - } else if (*d == '3') { // L3 - Socket - if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_socket.num = num; - __kmp_hws_socket.offset = offset; - } else if (*d == '1') { // L1 - Core - if (__kmp_hws_core.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_core.num = num; - __kmp_hws_core.offset = offset; - } else { - goto err; - } - } - break; - case 'T': // Thread - if (__kmp_hws_proc.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_proc.num = num; - __kmp_hws_proc.offset = offset; - break; - default: + kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); + if (type == KMP_HW_UNKNOWN) { goto err; } + if (__kmp_hw_subset->specified(type)) { + goto err; + } + __kmp_hw_subset->push_back(num, type, offset); } return; err: KMP_WARNING(AffHWSubsetInvalid, name, value); - __kmp_hws_requested = 0; // mark that subset not requested + if (__kmp_hw_subset) { + kmp_hw_subset_t::deallocate(__kmp_hw_subset); + __kmp_hw_subset = nullptr; + } return; } static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, void *data) { - if (__kmp_hws_requested) { - int comma = 0; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - if (__kmp_env_format) - KMP_STR_BUF_PRINT_NAME_EX(name); - else - __kmp_str_buf_print(buffer, " %s='", name); - if (__kmp_hws_socket.num) { - __kmp_str_buf_print(&buf, "%ds", __kmp_hws_socket.num); - if (__kmp_hws_socket.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset); - comma = 1; - } - if (__kmp_hws_die.num) { - __kmp_str_buf_print(&buf, "%s%dd", comma ? "," : "", __kmp_hws_die.num); - if (__kmp_hws_die.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_die.offset); - comma = 1; - } - if (__kmp_hws_node.num) { - __kmp_str_buf_print(&buf, "%s%dn", comma ? "," : "", __kmp_hws_node.num); - if (__kmp_hws_node.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_node.offset); - comma = 1; - } - if (__kmp_hws_tile.num) { - __kmp_str_buf_print(&buf, "%s%dL2", comma ? "," : "", __kmp_hws_tile.num); - if (__kmp_hws_tile.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_tile.offset); - comma = 1; - } - if (__kmp_hws_core.num) { - __kmp_str_buf_print(&buf, "%s%dc", comma ? "," : "", __kmp_hws_core.num); - if (__kmp_hws_core.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_core.offset); - comma = 1; - } - if (__kmp_hws_proc.num) - __kmp_str_buf_print(&buf, "%s%dt", comma ? "," : "", __kmp_hws_proc.num); - __kmp_str_buf_print(buffer, "%s'\n", buf.str); - __kmp_str_buf_free(&buf); + kmp_str_buf_t buf; + int depth; + if (!__kmp_hw_subset) + return; + __kmp_str_buf_init(&buf); + if (__kmp_env_format) + KMP_STR_BUF_PRINT_NAME_EX(name); + else + __kmp_str_buf_print(buffer, " %s='", name); + + depth = __kmp_hw_subset->get_depth(); + for (int i = 0; i < depth; ++i) { + const auto &item = __kmp_hw_subset->at(i); + __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? 
"," : ""), item.num, + __kmp_hw_get_keyword(item.type)); + if (item.offset) + __kmp_str_buf_print(&buf, "@%d", item.offset); } + __kmp_str_buf_print(buffer, "%s'\n", buf.str); + __kmp_str_buf_free(&buf); } #if USE_ITT_BUILD @@ -5762,7 +5743,7 @@ // Reset the affinity flags to their default values, // in case this is called from kmp_set_defaults(). __kmp_affinity_type = affinity_default; - __kmp_affinity_gran = affinity_gran_default; + __kmp_affinity_gran = KMP_HW_UNKNOWN; __kmp_affinity_top_method = affinity_top_method_default; __kmp_affinity_respect_mask = affinity_respect_mask_default; } @@ -5772,7 +5753,7 @@ aff_str = __kmp_env_blk_var(&block, "OMP_PROC_BIND"); if (aff_str != NULL) { __kmp_affinity_type = affinity_default; - __kmp_affinity_gran = affinity_gran_default; + __kmp_affinity_gran = KMP_HW_UNKNOWN; __kmp_affinity_top_method = affinity_top_method_default; __kmp_affinity_respect_mask = affinity_respect_mask_default; } @@ -5844,12 +5825,19 @@ if (!TCR_4(__kmp_init_middle)) { #if KMP_USE_HWLOC // Force using hwloc when either tiles or numa nodes requested within - // KMP_HW_SUBSET and no other topology method is requested - if ((__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0 || - __kmp_affinity_gran == affinity_gran_tile) && - (__kmp_affinity_top_method == affinity_top_method_default)) { + // KMP_HW_SUBSET or granularity setting and no other topology method + // is requested + if (__kmp_hw_subset && + __kmp_affinity_top_method == affinity_top_method_default) + if (__kmp_hw_subset->specified(KMP_HW_NUMA) || + __kmp_hw_subset->specified(KMP_HW_TILE) || + __kmp_affinity_gran == KMP_HW_TILE || + __kmp_affinity_gran == KMP_HW_NUMA) + __kmp_affinity_top_method = affinity_top_method_hwloc; + // Force using hwloc when tiles or numa nodes requested for OMP_PLACES + if (__kmp_affinity_gran == KMP_HW_NUMA || + __kmp_affinity_gran == KMP_HW_TILE) __kmp_affinity_top_method = affinity_top_method_hwloc; - } #endif // Determine if the machine/OS is actually capable of supporting // affinity. 
@@ -5879,7 +5867,7 @@ } __kmp_affinity_type = affinity_disabled; __kmp_affinity_respect_mask = 0; - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } } @@ -5937,44 +5925,27 @@ __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; } if (__kmp_affinity_top_method == affinity_top_method_default) { - if (__kmp_affinity_gran == affinity_gran_default) { + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { __kmp_affinity_top_method = affinity_top_method_group; - __kmp_affinity_gran = affinity_gran_group; - } else if (__kmp_affinity_gran == affinity_gran_group) { + __kmp_affinity_gran = KMP_HW_PROC_GROUP; + } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { __kmp_affinity_top_method = affinity_top_method_group; } else { __kmp_affinity_top_method = affinity_top_method_all; } } else if (__kmp_affinity_top_method == affinity_top_method_group) { - if (__kmp_affinity_gran == affinity_gran_default) { - __kmp_affinity_gran = affinity_gran_group; - } else if ((__kmp_affinity_gran != affinity_gran_group) && - (__kmp_affinity_gran != affinity_gran_fine) && - (__kmp_affinity_gran != affinity_gran_thread)) { - const char *str = NULL; - switch (__kmp_affinity_gran) { - case affinity_gran_core: - str = "core"; - break; - case affinity_gran_package: - str = "package"; - break; - case affinity_gran_node: - str = "node"; - break; - case affinity_gran_tile: - str = "tile"; - break; - default: - KMP_DEBUG_ASSERT(0); - } + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { + __kmp_affinity_gran = KMP_HW_PROC_GROUP; + } else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) && + (__kmp_affinity_gran != KMP_HW_THREAD)) { + const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran); KMP_WARNING(AffGranTopGroup, var, str); - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } } else { - if (__kmp_affinity_gran == affinity_gran_default) { - __kmp_affinity_gran = affinity_gran_core; - } else if (__kmp_affinity_gran == affinity_gran_group) { + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { + __kmp_affinity_gran = KMP_HW_CORE; + } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { const char *str = NULL; switch (__kmp_affinity_type) { case affinity_physical: @@ -5997,7 +5968,7 @@ KMP_DEBUG_ASSERT(0); } KMP_WARNING(AffGranGroupType, var, str); - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; } } } else @@ -6039,15 +6010,15 @@ __kmp_affinity_type = affinity_none; } } - if ((__kmp_affinity_gran == affinity_gran_default) && + if ((__kmp_affinity_gran == KMP_HW_UNKNOWN) && (__kmp_affinity_gran_levels < 0)) { #if KMP_MIC_SUPPORTED if (__kmp_mic_type != non_mic) { - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } else #endif { - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; } } if (__kmp_affinity_top_method == affinity_top_method_default) { diff --git a/openmp/runtime/test/affinity/kmp-affinity.c b/openmp/runtime/test/affinity/kmp-affinity.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/kmp-affinity.c @@ -0,0 +1,71 @@ +// RUN: %libomp-compile -D_GNU_SOURCE +// RUN: env KMP_AFFINITY=granularity=thread,compact %libomp-run +// RUN: env KMP_AFFINITY=granularity=core,compact %libomp-run +// RUN: env KMP_AFFINITY=granularity=socket,compact %libomp-run +// REQUIRES: linux + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "libomp_test_affinity.h" +#include "libomp_test_topology.h" + +// Compare place lists. Make sure every place in p1 is in p2. 
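+// Returns EXIT_SUCCESS when every place in p1 has an identical affinity mask +// in p2, and EXIT_FAILURE otherwise.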
+static int compare_places(const place_list_t *p1, const place_list_t *p2) { + int i, j; + for (i = 0; i < p1->num_places; ++i) { + int found = 0; + for (j = 0; j < p2->num_places; ++j) { + if (affinity_mask_equal(p1->masks[i], p2->masks[j])) { + found = 1; + break; + } + } + if (!found) { + printf("Found place in p1 not in p2!\n"); + printf("p1 places:\n"); + topology_print_places(p1); + printf("\n"); + printf("p2 places:\n"); + topology_print_places(p2); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +static int check_places() { + int status; + const char *value = getenv("KMP_AFFINITY"); + if (!value) { + fprintf(stderr, "error: must set KMP_AFFINITY environment variable for this test!\n"); + return EXIT_FAILURE; + } + place_list_t *places, *openmp_places; + if (strstr(value, "socket")) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET); + } else if (strstr(value, "core")) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_CORE); + } else if (strstr(value, "thread")) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD); + } else { + fprintf( + stderr, + "error: KMP_AFFINITY granularity must be one of thread,core,socket!\n"); + return EXIT_FAILURE; + } + openmp_places = topology_alloc_openmp_places(); + status = compare_places(openmp_places, places); + topology_free_places(places); + topology_free_places(openmp_places); + return status; +} + +int main() { + if (!topology_using_full_mask()) { + printf("Thread does not have access to all logical processors. Skipping " + "test.\n"); + return EXIT_SUCCESS; + } + return check_places(); +} diff --git a/openmp/runtime/test/affinity/kmp-hw-subset.c b/openmp/runtime/test/affinity/kmp-hw-subset.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/kmp-hw-subset.c @@ -0,0 +1,127 @@ +// RUN: %libomp-compile -D_GNU_SOURCE +// RUN: env OMP_PLACES=threads %libomp-run +// RUN: env OMP_PLACES=cores %libomp-run +// RUN: env OMP_PLACES=sockets %libomp-run +// REQUIRES: linux + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "libomp_test_affinity.h" +#include "libomp_test_topology.h" + +// Check the OpenMP place list to make sure it follows the KMP_HW_SUBSET restriction +static int compare_hw_subset_places(const place_list_t *openmp_places, + topology_obj_type_t type, int nsockets, + int ncores_per_socket, + int nthreads_per_core) { + int i, j, expected_total, expected_per_place; + if (type == TOPOLOGY_OBJ_THREAD) { + expected_total = nsockets * ncores_per_socket * nthreads_per_core; + expected_per_place = 1; + } else if (type == TOPOLOGY_OBJ_CORE) { + expected_total = nsockets * ncores_per_socket; + expected_per_place = nthreads_per_core; + } else { + expected_total = nsockets; + expected_per_place = ncores_per_socket; + } + if (openmp_places->num_places != expected_total) { + fprintf(stderr, "error: KMP_HW_SUBSET did not halve each resource layer!\n"); + printf("openmp_places places:\n"); + topology_print_places(openmp_places); + printf("\n"); + return EXIT_FAILURE; + } + for (i = 0; i < openmp_places->num_places; ++i) { + int count = affinity_mask_count(openmp_places->masks[i]); + if (count != expected_per_place) { + fprintf(stderr, "error: place %d has %d OS procs instead of %d\n", i, + count, expected_per_place); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +static int check_places() { + char buf[100]; + topology_obj_type_t type; + const char *value; + int status = EXIT_SUCCESS; + place_list_t *threads, *cores, *sockets, *openmp_places; + threads = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD); + cores = 
topology_alloc_type_places(TOPOLOGY_OBJ_CORE); + sockets = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET); + + if (threads->num_places <= 1) { + printf("Only one hardware thread to execute on. Skipping test.\n"); + return status; + } + + value = getenv("OMP_PLACES"); + if (!value) { + fprintf(stderr, + "error: OMP_PLACES must be set to one of threads,cores,sockets!\n"); + return EXIT_FAILURE; + } + if (strcmp(value, "threads") == 0) + type = TOPOLOGY_OBJ_THREAD; + else if (strcmp(value, "cores") == 0) + type = TOPOLOGY_OBJ_CORE; + else if (strcmp(value, "sockets") == 0) + type = TOPOLOGY_OBJ_SOCKET; + else { + fprintf(stderr, + "error: OMP_PLACES must be one of threads,cores,sockets!\n"); + return EXIT_FAILURE; + } + + // Calculate the number of threads per core, cores per socket, and sockets + if (cores->num_places <= 0) { + printf("Invalid number of cores (%d). Skipping test.\n", cores->num_places); + return status; + } else if (sockets->num_places <= 0) { + printf("Invalid number of sockets (%d). Skipping test.\n", + sockets->num_places); + return status; + } + int nthreads_per_core = threads->num_places / cores->num_places; + int ncores_per_socket = cores->num_places / sockets->num_places; + int nsockets = sockets->num_places; + + if (nsockets * ncores_per_socket * nthreads_per_core != threads->num_places) { + printf("Only uniform topologies can be tested. Skipping test.\n"); + return status; + } + + // Use half the resources of every level + if (nthreads_per_core > 1) + nthreads_per_core /= 2; + if (ncores_per_socket > 1) + ncores_per_socket /= 2; + if (nsockets > 1) + nsockets /= 2; + + snprintf(buf, sizeof(buf), "%ds,%dc,%dt", nsockets, ncores_per_socket, + nthreads_per_core); + setenv("KMP_HW_SUBSET", buf, 1); + + openmp_places = topology_alloc_openmp_places(); + status = compare_hw_subset_places(openmp_places, type, nsockets, + ncores_per_socket, nthreads_per_core); + topology_free_places(threads); + topology_free_places(cores); + topology_free_places(sockets); + topology_free_places(openmp_places); + return status; +} + +int main() { + if (!topology_using_full_mask()) { + printf("Thread does not have access to all logical processors. Skipping " + "test.\n"); + return EXIT_SUCCESS; + } + return check_places(); +} diff --git a/openmp/runtime/test/affinity/libomp_test_topology.h b/openmp/runtime/test/affinity/libomp_test_topology.h new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/libomp_test_topology.h @@ -0,0 +1,231 @@ +#ifndef LIBOMP_TEST_TOPOLOGY_H +#define LIBOMP_TEST_TOPOLOGY_H + +#include "libomp_test_affinity.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <dirent.h> +#include <omp.h> + +typedef enum topology_obj_type_t { + TOPOLOGY_OBJ_THREAD, + TOPOLOGY_OBJ_CORE, + TOPOLOGY_OBJ_SOCKET, + TOPOLOGY_OBJ_MAX +} topology_obj_type_t; + +typedef struct place_list_t { + int num_places; + affinity_mask_t **masks; +} place_list_t; + +// Return the first character in file 'f' that is not a whitespace character +// including newlines and carriage returns +static int get_first_nonspace_from_file(FILE *f) { + int c; + do { + c = fgetc(f); + } while (c != EOF && (isspace(c) || c == '\n' || c == '\r')); + return c; +} + +// Read an integer from file 'f' into 'number' +// Return 1 on successful read of integer, +// 0 on unsuccessful read of integer, +// EOF on end of file.
+static int get_integer_from_file(FILE *f, int *number) { + int n; + n = fscanf(f, "%d", number); + if (feof(f)) + return EOF; + if (n != 1) + return 0; + return 1; +} + +// Read a siblings list file from Linux /sys/devices/system/cpu/cpu?/topology/* +static affinity_mask_t *topology_get_mask_from_file(const char *filename) { + int status = EXIT_SUCCESS; + FILE *f = fopen(filename, "r"); + if (!f) { + perror(filename); + exit(EXIT_FAILURE); + } + affinity_mask_t *mask = affinity_mask_alloc(); + while (1) { + int c, i, n, lower, upper; + // Read the first integer + n = get_integer_from_file(f, &lower); + if (n == EOF) { + break; + } else if (n == 0) { + fprintf(stderr, "syntax error: expected integer\n"); + status = EXIT_FAILURE; + break; + } + + // Now either a , or - + c = get_first_nonspace_from_file(f); + if (c == EOF || c == ',') { + affinity_mask_set(mask, lower); + if (c == EOF) + break; + } else if (c == '-') { + n = get_integer_from_file(f, &upper); + if (n == EOF || n == 0) { + fprintf(stderr, "syntax error: expected integer\n"); + status = EXIT_FAILURE; + break; + } + for (i = lower; i <= upper; ++i) + affinity_mask_set(mask, i); + c = get_first_nonspace_from_file(f); + if (c == EOF) { + break; + } else if (c == ',') { + continue; + } else { + fprintf(stderr, "syntax error: unexpected character: '%c (%d)'\n", c, + c); + status = EXIT_FAILURE; + break; + } + } else { + fprintf(stderr, "syntax error: unexpected character: '%c (%d)'\n", c, c); + status = EXIT_FAILURE; + break; + } + } + fclose(f); + if (status == EXIT_FAILURE) { + affinity_mask_free(mask); + mask = NULL; + } + return mask; +} + +static int topology_get_num_cpus() { + char buf[1024]; + // Count the number of cpus + int cpu = 0; + while (1) { + snprintf(buf, sizeof(buf), "/sys/devices/system/cpu/cpu%d", cpu); + DIR *dir = opendir(buf); + if (dir) { + closedir(dir); + cpu++; + } else { + break; + } + } + if (cpu == 0) + cpu = 1; + return cpu; +} + +// Return whether the current thread has access to all logical processors +static int topology_using_full_mask() { + int cpu; + int has_all = 1; + int num_cpus = topology_get_num_cpus(); + affinity_mask_t *mask = affinity_mask_alloc(); + get_thread_affinity(mask); + for (cpu = 0; cpu < num_cpus; ++cpu) { + if (!affinity_mask_isset(mask, cpu)) { + has_all = 0; + break; + } + } + affinity_mask_free(mask); + return has_all; +} + +// Return array of masks representing OMP_PLACES keyword (e.g., sockets, cores, +// threads) +static place_list_t *topology_alloc_type_places(topology_obj_type_t type) { + char buf[1024]; + int i, cpu, num_places, num_unique; + int num_cpus = topology_get_num_cpus(); + place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t)); + affinity_mask_t **masks = + (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_cpus); + num_unique = 0; + for (cpu = 0; cpu < num_cpus; ++cpu) { + affinity_mask_t *mask; + if (type == TOPOLOGY_OBJ_CORE) { + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", + cpu); + mask = topology_get_mask_from_file(buf); + } else if (type == TOPOLOGY_OBJ_SOCKET) { + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", + cpu); + mask = topology_get_mask_from_file(buf); + } else if (type == TOPOLOGY_OBJ_THREAD) { + mask = affinity_mask_alloc(); + affinity_mask_set(mask, cpu); + } else { + fprintf(stderr, "Unknown topology type (%d)\n", (int)type); + exit(EXIT_FAILURE); + } + // Check for unique topology objects above the thread level + if (type != 
TOPOLOGY_OBJ_THREAD) { + for (i = 0; i < num_unique; ++i) { + if (affinity_mask_equal(masks[i], mask)) { + affinity_mask_free(mask); + mask = NULL; + break; + } + } + } + if (mask) + masks[num_unique++] = mask; + } + places->num_places = num_unique; + places->masks = masks; + return places; +} + +static place_list_t *topology_alloc_openmp_places() { + int place, i; + int num_places = omp_get_num_places(); + place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t)); + affinity_mask_t **masks = + (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_places); + for (place = 0; place < num_places; ++place) { + int num_procs = omp_get_place_num_procs(place); + int *ids = (int *)malloc(sizeof(int) * num_procs); + omp_get_place_proc_ids(place, ids); + affinity_mask_t *mask = affinity_mask_alloc(); + for (i = 0; i < num_procs; ++i) + affinity_mask_set(mask, ids[i]); + masks[place] = mask; + } + places->num_places = num_places; + places->masks = masks; + return places; +} + +// Free the array of masks from one of: topology_alloc_type_places() +// or topology_alloc_openmp_places() +static void topology_free_places(place_list_t *places) { + int i; + for (i = 0; i < places->num_places; ++i) + affinity_mask_free(places->masks[i]); + free(places->masks); + free(places); +} + +static void topology_print_places(const place_list_t *p) { + int i; + char buf[1024]; + for (i = 0; i < p->num_places; ++i) { + affinity_mask_snprintf(buf, sizeof(buf), p->masks[i]); + printf("Place %d: %s\n", i, buf); + } +} + +#endif diff --git a/openmp/runtime/test/affinity/omp-places.c b/openmp/runtime/test/affinity/omp-places.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/omp-places.c @@ -0,0 +1,83 @@ +// RUN: %libomp-compile -D_GNU_SOURCE +// RUN: env OMP_PLACES=threads %libomp-run +// RUN: env OMP_PLACES=cores %libomp-run +// RUN: env OMP_PLACES=sockets %libomp-run +// REQUIRES: linux + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "libomp_test_affinity.h" +#include "libomp_test_topology.h" + +// Compare place lists. The order is not taken into consideration here. +// The OS detection might have the cores/sockets in a different +// order from the runtime. +static int compare_places(const place_list_t *p1, const place_list_t *p2) { + int i, j; + if (p1->num_places != p2->num_places) { + fprintf(stderr, "error: place lists do not have the same number of places! 
(p1 has " + "%d, p2 has %d)\n", + p1->num_places, p2->num_places); + printf("p1 places:\n"); + topology_print_places(p1); + printf("\n"); + printf("p2 places:\n"); + topology_print_places(p2); + return EXIT_FAILURE; + } + for (i = 0; i < p1->num_places; ++i) { + int found = 0; + for (j = 0; j < p2->num_places; ++j) { + if (affinity_mask_equal(p1->masks[i], p2->masks[j])) { + found = 1; + break; + } + } + if (!found) { + printf("Found difference in places!\n"); + printf("p1 places:\n"); + topology_print_places(p1); + printf("\n"); + printf("p2 places:\n"); + topology_print_places(p2); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +static int check_places() { + int status; + const char *value = getenv("OMP_PLACES"); + if (!value) { + fprintf(stderr, "error: must set the OMP_PLACES environment variable for this test!\n"); + return EXIT_FAILURE; + } + place_list_t *places, *openmp_places; + if (strcmp(value, "sockets") == 0) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET); + } else if (strcmp(value, "cores") == 0) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_CORE); + } else if (strcmp(value, "threads") == 0) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD); + } else { + fprintf(stderr, + "error: OMP_PLACES must be one of threads,cores,sockets!\n"); + return EXIT_FAILURE; + } + openmp_places = topology_alloc_openmp_places(); + status = compare_places(places, openmp_places); + topology_free_places(places); + topology_free_places(openmp_places); + return status; +} + +int main() { + if (!topology_using_full_mask()) { + printf("Thread does not have access to all logical processors. Skipping " + "test.\n"); + return EXIT_SUCCESS; + } + return check_places(); +}
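For reference, a minimal standalone sketch (not part of the patch) of how the place list produced by the new granularity keywords can be inspected through the standard OpenMP places API. The file name and the 256-proc-per-place cap are assumptions made for this example; the environment settings mirror the RUN lines in the tests above, and only routines from <omp.h> are used.

// print-places.c (hypothetical name): build with a libomp-enabled compiler, e.g.
//   clang -fopenmp print-places.c -o print-places
// and run with either of:
//   env OMP_PLACES=cores ./print-places
//   env KMP_AFFINITY=granularity=core,compact ./print-places
#include <omp.h>
#include <stdio.h>

int main(void) {
  int place, i;
  int num_places = omp_get_num_places(); // one place per granularity unit
  for (place = 0; place < num_places; ++place) {
    int ids[256]; // assumes at most 256 OS procs per place for this sketch
    int num_procs = omp_get_place_num_procs(place);
    if (num_procs > 256)
      num_procs = 256;
    omp_get_place_proc_ids(place, ids);
    printf("place %d:", place);
    for (i = 0; i < num_procs; ++i)
      printf(" %d", ids[i]);
    printf("\n");
  }
  return 0;
}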