diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -123,6 +123,7 @@ ProcGroup "processor group" ProcGroups "processor groups" Unknown "unknown" +NoLeaf31Support "cpuid leaf 31 not supported" @@ -383,8 +384,8 @@ OBSOLETE "KMP_DYNAMIC_MODE=irml cannot be used with KMP_USE_IRML=0" IttUnknownGroup "ittnotify: Unknown group \"%2$s\" specified in environment variable \"%1$s\"." IttEnvVarTooLong "ittnotify: Environment variable \"%1$s\" too long: Actual lengths is %2$lu, max allowed length is %3$lu." -AffUseGlobCpuidL11 "%1$s: Affinity capable, using global cpuid leaf 11 info" -AffNotCapableUseLocCpuidL11 "%1$s: Affinity not capable, using local cpuid leaf 11 info" +OBSOLETE "%1$s: Affinity capable, using global cpuid leaf 11 info" +OBSOLETE "%1$s: Affinity not capable, using local cpuid leaf 11 info" AffInfoStr "%1$s: %2$s." AffInfoStrStr "%1$s: %2$s - %3$s." OSProcToPhysicalThreadMap "%1$s: OS proc to physical thread map:" @@ -450,6 +451,10 @@ AffFormatDefault "OMP: pid %1$s tid %2$s thread %3$s bound to OS proc set {%4$s}" APIDeprecated "%1$s routine deprecated, please use %2$s instead." GompFeatureNotSupported "libgomp compatibility layer does not support OpenMP feature: %1$s" +AffHWSubsetManyDies "KMP_HW_SUBSET ignored: too many Dies requested." +AffUseGlobCpuidL "%1$s: Affinity capable, using global cpuid leaf %2$d info" +AffNotCapableUseLocCpuidL "%1$s: Affinity not capable, using local cpuid leaf %2$d info" +AffNotUsingHwloc "%1$s: Affinity not capable, using hwloc." 
# -------------------------------------------------------------------------------------------------- -*- HINTS -*- diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -789,6 +789,7 @@ affinity_gran_thread, affinity_gran_core, affinity_gran_tile, + affinity_gran_die, affinity_gran_numa, affinity_gran_package, affinity_gran_node, @@ -805,6 +806,7 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 affinity_top_method_apicid, affinity_top_method_x2apicid, + affinity_top_method_x2apicid_1f, #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too #if KMP_GROUP_AFFINITY @@ -889,6 +891,7 @@ } kmp_hws_item_t; extern kmp_hws_item_t __kmp_hws_socket; +extern kmp_hws_item_t __kmp_hws_die; extern kmp_hws_item_t __kmp_hws_node; extern kmp_hws_item_t __kmp_hws_tile; extern kmp_hws_item_t __kmp_hws_core; diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -45,6 +45,8 @@ thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; } +#if KMP_AFFINITY_SUPPORTED + const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { switch (type) { case KMP_HW_SOCKET: @@ -73,7 +75,6 @@ return KMP_I18N_STR(Unknown); } -#if KMP_USE_HWLOC // This function removes the topology levels that are radix 1 and don't offer // further information about the topology. 
The most common example is when you // have one thread context per core, we don't want the extra thread context @@ -213,8 +214,8 @@ } // calculate the number of X's per Y -static inline int __kmp_hwloc_calculate_ratio(int *ratio, int deep_level, - int shallow_level) { +static inline int __kmp_affinity_calculate_ratio(int *ratio, int deep_level, + int shallow_level) { int retval = 1; if (deep_level < 0 || shallow_level < 0) return retval; @@ -222,7 +223,23 @@ retval *= ratio[level]; return retval; } -#endif // KMP_USE_HWLOC + +static void __kmp_affinity_print_topology(AddrUnsPair *addrP, int len, + int depth, kmp_hw_t *types) { + int proc; + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); + for (proc = 0; proc < len; proc++) { + for (int i = 0; i < depth; ++i) { + __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]), + addrP[proc].first.labels[i]); + } + KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str); + __kmp_str_buf_clear(&buf); + } + __kmp_str_buf_free(&buf); +} // Print out the detailed machine topology map, i.e. the physical locations // of each OS proc. @@ -257,8 +274,6 @@ } } -#if KMP_AFFINITY_SUPPORTED - bool KMPAffinity::picked_api = false; void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } @@ -516,34 +531,19 @@ static int __kmp_ncores; #endif static int *__kmp_pu_os_idx = NULL; +static int nDiesPerPkg = 1; // __kmp_affinity_uniform_topology() doesn't work when called from // places which support arbitrarily many levels in the machine topology // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() // __kmp_affinity_create_x2apicid_map(). 
inline static bool __kmp_affinity_uniform_topology() { - return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages); + return __kmp_avail_proc == + (__kmp_nThreadsPerCore * nCoresPerPkg * nDiesPerPkg * nPackages); } #if KMP_USE_HWLOC -static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len, - int depth, kmp_hw_t *types) { - int proc; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); - for (proc = 0; proc < len; proc++) { - for (int i = 0; i < depth; ++i) { - __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]), - addrP[proc].first.labels[i]); - } - KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str); - __kmp_str_buf_clear(&buf); - } - __kmp_str_buf_free(&buf); -} - static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) { #if HWLOC_API_VERSION >= 0x00020000 return hwloc_obj_type_is_cache(obj->type); @@ -699,7 +699,7 @@ nCoresPerPkg = 1; // to prevent possible division by 0 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); + KMP_INFORM(AffNotUsingHwloc, "KMP_AFFINITY"); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (__kmp_affinity_uniform_topology()) { KMP_INFORM(Uniform, "KMP_AFFINITY"); @@ -886,8 +886,9 @@ numa_level = level; } __kmp_nThreadsPerCore = - __kmp_hwloc_calculate_ratio(ratio, thread_level, core_level); - nCoresPerPkg = __kmp_hwloc_calculate_ratio(ratio, core_level, socket_level); + __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); + nCoresPerPkg = + __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); if (socket_level >= 0) nPackages = count[socket_level]; else @@ -964,7 +965,7 @@ } if (__kmp_affinity_verbose) - __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, types); + __kmp_affinity_print_topology(retval, nActiveThreads, depth, types); KMP_CPU_FREE(oldMask); 
*address2os = retval; @@ -1126,6 +1127,123 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 +/* + * CPUID.B or 1F, Input ECX (sub leaf # aka level number) + Bits Bits Bits Bits + 31-16 15-8 7-5 4-0 +---+-----------+--------------+-------------+-----------------+ +EAX| reserved | reserved | reserved | Bits to Shift | +---+-----------|--------------+-------------+-----------------| +EBX| reserved | Num logical processors at level (16 bits) | +---+-----------|--------------+-------------------------------| +ECX| reserved | Level Type | Level Number (8 bits) | +---+-----------+--------------+-------------------------------| +EDX| X2APIC ID (32 bits) | +---+----------------------------------------------------------+ +*/ + +enum { + INTEL_LEVEL_TYPE_INVALID = 0, // Package level + INTEL_LEVEL_TYPE_SMT = 1, + INTEL_LEVEL_TYPE_CORE = 2, + INTEL_LEVEL_TYPE_TILE = 3, + INTEL_LEVEL_TYPE_MODULE = 4, + INTEL_LEVEL_TYPE_DIE = 5, + INTEL_LEVEL_TYPE_LAST = 6, +}; + +struct cpuid_level_info_t { + unsigned level_type, mask, mask_width, nitems, cache_mask; +}; + +template <kmp_uint32 LSB, kmp_uint32 MSB> // extracts bits [LSB, MSB] of v +static inline unsigned __kmp_extract_bits(kmp_uint32 v) { + const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB; + const kmp_uint32 SHIFT_RIGHT = LSB; + kmp_uint32 retval = v; + retval <<= SHIFT_LEFT; + retval >>= (SHIFT_LEFT + SHIFT_RIGHT); + return retval; +} + +static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { + switch (intel_type) { + case INTEL_LEVEL_TYPE_INVALID: + return KMP_HW_SOCKET; + case INTEL_LEVEL_TYPE_SMT: + return KMP_HW_THREAD; + case INTEL_LEVEL_TYPE_CORE: + return KMP_HW_CORE; + // TODO: add support for the tile and module + case INTEL_LEVEL_TYPE_TILE: + return KMP_HW_UNKNOWN; + case INTEL_LEVEL_TYPE_MODULE: + return KMP_HW_UNKNOWN; + case INTEL_LEVEL_TYPE_DIE: + return KMP_HW_DIE; + } + return KMP_HW_UNKNOWN; +} + +// This function takes the topology leaf, a levels array to store the levels +// detected and a bitmap of the known levels. 
+// Returns the number of levels in the topology (0 if the leaf is unusable) +static unsigned +__kmp_x2apicid_get_levels(int leaf, + cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], + kmp_uint64 known_levels) { + unsigned level, levels_index; + unsigned level_type, mask_width, nitems; + kmp_cpuid buf; + + // The new algorithm has known topology layers act as highest unknown topology + // layers when unknown topology layers exist. + // e.g., Suppose layers were SMT CORE <Y> <Z> PACKAGE + // Then CORE will take the characteristics (nitems and mask width) of <Z>. + // In developing the id mask for each layer, this eliminates unknown portions + // of the topology while still keeping the correct underlying structure. + level = levels_index = 0; + do { + __kmp_x86_cpuid(leaf, level, &buf); + level_type = __kmp_extract_bits<8, 15>(buf.ecx); + mask_width = __kmp_extract_bits<0, 4>(buf.eax); + nitems = __kmp_extract_bits<0, 15>(buf.ebx); + if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) + return 0; + + if (level_type < INTEL_LEVEL_TYPE_LAST && (known_levels & (1ull << level_type))) { + // Add a new level to the topology + KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); + levels[levels_index].level_type = level_type; + levels[levels_index].mask_width = mask_width; + levels[levels_index].nitems = nitems; + levels_index++; + } else { + // If it is an unknown level, then logically move the previous layer up + if (levels_index > 0) { + levels[levels_index - 1].mask_width = mask_width; + levels[levels_index - 1].nitems = nitems; + } + } + level++; + } while (level_type != INTEL_LEVEL_TYPE_INVALID); + + // Set the masks to & with apicid (unsigned shifts: shifting -1 is undefined) + for (unsigned i = 0; i < levels_index; ++i) { + if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { + levels[i].mask = ~(~0u << levels[i].mask_width); + levels[i].cache_mask = ~0u << levels[i].mask_width; + for (unsigned j = 0; j < i; ++j) + levels[i].mask ^= levels[j].mask; + } else { + KMP_DEBUG_ASSERT(levels_index > 0); + levels[i].mask = (i > 0) ? (~0u << levels[i - 1].mask_width) : ~0u; + levels[i].cache_mask = 0; + 
} + } + return levels_index; +} + static int __kmp_cpuid_mask_width(int count) { int r = 0; @@ -1573,123 +1691,102 @@ // Intel(R) microarchitecture code name Nehalem, Dunnington and later // architectures support a newer interface for specifying the x2APIC Ids, -// based on cpuid leaf 11. +// based on CPUID.B or CPUID.1F static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, kmp_i18n_id_t *const msg_id) { + + cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; + int ratio[KMP_HW_LAST]; + int count[KMP_HW_LAST]; + kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; + unsigned levels_index; kmp_cpuid buf; - *address2os = NULL; - *msg_id = kmp_i18n_null; + kmp_uint64 known_levels; + int topology_leaf, highest_leaf, apic_id; + int num_leaves; + static int leaves[] = {0, 0}; - // Check to see if cpuid leaf 11 is supported. - __kmp_x86_cpuid(0, 0, &buf); - if (buf.eax < 11) { - *msg_id = kmp_i18n_str_NoLeaf11Support; - return -1; - } - __kmp_x86_cpuid(11, 0, &buf); - if (buf.ebx == 0) { - *msg_id = kmp_i18n_str_NoLeaf11Support; - return -1; - } + kmp_i18n_id_t leaf_message_id; - // Find the number of levels in the machine topology. While we're at it, get - // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to - // get more accurate values later by explicitly counting them, but get - // reasonable defaults now, in case we return early. - int level; - int threadLevel = -1; - int coreLevel = -1; - int pkgLevel = -1; - __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; + KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); - for (level = 0;; level++) { - if (level > 31) { - // FIXME: Hack for DPD200163180 - // - // If level is big then something went wrong -> exiting - // - // There could actually be 32 valid levels in the machine topology, but so - // far, the only machine we have seen which does not exit this loop before - // iteration 32 has fubar x2APIC settings. 
- // - // For now, just reject this case based upon loop trip count. - *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; - } - __kmp_x86_cpuid(11, level, &buf); - if (buf.ebx == 0) { - if (pkgLevel < 0) { - // Will infer nPackages from __kmp_xproc - pkgLevel = level; - level++; - } - break; - } - int kind = (buf.ecx >> 8) & 0xff; - if (kind == 1) { - // SMT level - threadLevel = level; - coreLevel = -1; - pkgLevel = -1; - __kmp_nThreadsPerCore = buf.ebx & 0xffff; - if (__kmp_nThreadsPerCore == 0) { - *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; - } - } else if (kind == 2) { - // core level - coreLevel = level; - pkgLevel = -1; - nCoresPerPkg = buf.ebx & 0xffff; - if (nCoresPerPkg == 0) { - *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; - } - } else { - if (level <= 0) { - *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; - } - if (pkgLevel >= 0) { - continue; - } - pkgLevel = level; - nPackages = buf.ebx & 0xffff; - if (nPackages == 0) { - *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; - } + *msg_id = kmp_i18n_null; + + // Figure out the known topology levels + known_levels = 0ull; + for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { + if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { + known_levels |= (1ull << i); } } - int depth = level; - // In the above loop, "level" was counted from the finest level (usually - // thread) to the coarsest. The caller expects that we will place the labels - // in (*address2os)[].first.labels[] in the inverse order, so we need to - // invert the vars saying which level means what. 
- if (threadLevel >= 0) { - threadLevel = depth - threadLevel - 1; + // Get the highest cpuid leaf supported + __kmp_x86_cpuid(0, 0, &buf); + highest_leaf = buf.eax; + + // If a specific topology method was requested, only allow that specific leaf + // otherwise, try both leaves 31 and 11 in that order + num_leaves = 0; + if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { + num_leaves = 1; + leaves[0] = 11; + leaf_message_id = kmp_i18n_str_NoLeaf11Support; + } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { + num_leaves = 1; + leaves[0] = 31; + leaf_message_id = kmp_i18n_str_NoLeaf31Support; + } else { + num_leaves = 2; + leaves[0] = 31; + leaves[1] = 11; + leaf_message_id = kmp_i18n_str_NoLeaf11Support; } - if (coreLevel >= 0) { - coreLevel = depth - coreLevel - 1; + + // Check to see if cpuid leaf 31 or 11 is supported. + __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; + topology_leaf = -1; + for (int i = 0; i < num_leaves; ++i) { + int leaf = leaves[i]; + if (highest_leaf < leaf) + continue; + __kmp_x86_cpuid(leaf, 0, &buf); + if (buf.ebx == 0) + continue; + topology_leaf = leaf; + levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); + if (levels_index == 0) + continue; + break; + } + if (topology_leaf == -1 || levels_index == 0) { + *msg_id = leaf_message_id; + return -1; } - KMP_DEBUG_ASSERT(pkgLevel >= 0); - pkgLevel = depth - pkgLevel - 1; + KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); // The algorithm used starts by setting the affinity to each available thread // and retrieving info from the cpuid instruction, so if we are not capable of - // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we - // need to do something else - use the defaults that we calculated from + // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then + // we need to do something else - use the defaults that we calculated from // issuing cpuid without binding to each proc. 
if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc. KMP_ASSERT(__kmp_affinity_type == affinity_none); + for (unsigned i = 0; i < levels_index; ++i) { + if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { + __kmp_nThreadsPerCore = levels[i].nitems; + } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { + nCoresPerPkg = levels[i].nitems; + } else if (levels[i].level_type == INTEL_LEVEL_TYPE_DIE) { + nDiesPerPkg = levels[i].nitems; + } + } __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); + KMP_INFORM(AffNotCapableUseLocCpuidL, "KMP_AFFINITY", topology_leaf); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (__kmp_affinity_uniform_topology()) { KMP_INFORM(Uniform, "KMP_AFFINITY"); @@ -1712,6 +1809,9 @@ __kmp_get_system_affinity(oldMask, TRUE); // Allocate the data structure to be returned. + int depth = levels_index; + for (int i = depth - 1, j = 0; i >= 0; --i, ++j) + types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); @@ -1720,6 +1820,9 @@ unsigned int proc; int nApics = 0; KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { + cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; + unsigned my_levels_index; + // Skip this proc if it is not included in the machine model. if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { continue; @@ -1728,36 +1831,24 @@ __kmp_affinity_dispatch->bind_thread(proc); - // Extract labels for each level in the machine topology map from Apic ID. 
+ // New algorithm + __kmp_x86_cpuid(topology_leaf, 0, &buf); + apic_id = buf.edx; Address addr(depth); - int prev_shift = 0; - - for (level = 0; level < depth; level++) { - __kmp_x86_cpuid(11, level, &buf); - unsigned apicId = buf.edx; - if (buf.ebx == 0) { - if (level != depth - 1) { - KMP_CPU_FREE(oldMask); - *msg_id = kmp_i18n_str_InconsistentCpuidInfo; - return -1; - } - addr.labels[depth - level - 1] = apicId >> prev_shift; - level++; - break; - } - int shift = buf.eax & 0x1f; - int mask = (1 << shift) - 1; - addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; - prev_shift = shift; - } - if (level != depth) { + my_levels_index = + __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); + if (my_levels_index == 0 || my_levels_index != levels_index) { KMP_CPU_FREE(oldMask); - *msg_id = kmp_i18n_str_InconsistentCpuidInfo; + *msg_id = kmp_i18n_str_InvalidCpuidInfo; return -1; } - - retval[nApics] = AddrUnsPair(addr, proc); - nApics++; + // Put in topology information + for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { + addr.labels[idx] = apic_id & my_levels[j].mask; + if (j > 0) + addr.labels[idx] >>= my_levels[j - 1].mask_width; + } + retval[nApics++] = AddrUnsPair(addr, proc); } // We've collected all the info we need. @@ -1767,10 +1858,11 @@ // If there's only one thread context to bind to, return now. 
KMP_ASSERT(nApics > 0); if (nApics == 1) { + int pkg_level; __kmp_ncores = nPackages = 1; __kmp_nThreadsPerCore = nCoresPerPkg = 1; if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); + KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, @@ -1783,9 +1875,15 @@ return 0; } + pkg_level = 0; + for (int i = 0; i < depth; ++i) + if (types[i] == KMP_HW_SOCKET) { + pkg_level = i; + break; + } // Form an Address object which only includes the package level. Address addr(1); - addr.labels[0] = retval[0].first.labels[pkgLevel]; + addr.labels[0] = retval[0].first.labels[pkg_level]; retval[0].first = addr; if (__kmp_affinity_gran_levels < 0) { @@ -1804,89 +1902,51 @@ // Sort the table by physical Id. qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); - // Find the radix at each of the levels. - unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - for (level = 0; level < depth; level++) { - totals[level] = 1; - maxCt[level] = 1; - counts[level] = 1; - last[level] = retval[0].first.labels[level]; - } - - // From here on, the iteration variable "level" runs from the finest level to - // the coarsest, i.e. we iterate forward through - // (*address2os)[].first.labels[] - in the previous loops, we iterated - // backwards. 
- for (proc = 1; (int)proc < nApics; proc++) { - int level; - for (level = 0; level < depth; level++) { - if (retval[proc].first.labels[level] != last[level]) { - int j; - for (j = level + 1; j < depth; j++) { - totals[j]++; - counts[j] = 1; - // The line below causes printing incorrect topology information in - // case the max value for some level (maxCt[level]) is encountered - // earlier than some less value while going through the array. For - // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then - // maxCt[1] == 2 - // whereas it must be 4. - // TODO!!! Check if it can be commented safely - // maxCt[j] = 1; - last[j] = retval[proc].first.labels[j]; - } - totals[level]++; - counts[level]++; - if (counts[level] > maxCt[level]) { - maxCt[level] = counts[level]; - } - last[level] = retval[proc].first.labels[level]; - break; - } else if (level == depth - 1) { - __kmp_free(last); - __kmp_free(maxCt); - __kmp_free(counts); - __kmp_free(totals); - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; - return -1; - } - } - } + __kmp_affinity_gather_enumeration_information(retval, nApics, depth, types, + ratio, count); // When affinity is off, this routine will still be called to set // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. // Make sure all these vars are set correctly, and return if affinity is not // enabled. 
- if (threadLevel >= 0) { - __kmp_nThreadsPerCore = maxCt[threadLevel]; - } else { - __kmp_nThreadsPerCore = 1; + int thread_level, core_level, socket_level, die_level; + thread_level = core_level = die_level = socket_level = -1; + for (int level = 0; level < depth; ++level) { + if (types[level] == KMP_HW_THREAD) + thread_level = level; + else if (types[level] == KMP_HW_CORE) + core_level = level; + else if (types[level] == KMP_HW_DIE) + die_level = level; + else if (types[level] == KMP_HW_SOCKET) + socket_level = level; } - nPackages = totals[pkgLevel]; - - if (coreLevel >= 0) { - __kmp_ncores = totals[coreLevel]; - nCoresPerPkg = maxCt[coreLevel]; + __kmp_nThreadsPerCore = + __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); + if (die_level > 0) { + nDiesPerPkg = + __kmp_affinity_calculate_ratio(ratio, die_level, socket_level); + nCoresPerPkg = __kmp_affinity_calculate_ratio(ratio, core_level, die_level); } else { - __kmp_ncores = nPackages; - nCoresPerPkg = 1; + nCoresPerPkg = + __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); } + if (socket_level >= 0) + nPackages = count[socket_level]; + else + nPackages = 1; + if (core_level >= 0) + __kmp_ncores = count[core_level]; + else + __kmp_ncores = 1; // Check to see if the machine topology is uniform - unsigned prod = maxCt[0]; - for (level = 1; level < depth; level++) { - prod *= maxCt[level]; - } - bool uniform = (prod == totals[level - 1]); + unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count); // Print the machine topology summary. 
if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); + kmp_hw_t numerator_type, denominator_type; + KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); @@ -1897,15 +1957,31 @@ kmp_str_buf_t buf; __kmp_str_buf_init(&buf); - __kmp_str_buf_print(&buf, "%d", totals[0]); - for (level = 1; level <= pkgLevel; level++) { - __kmp_str_buf_print(&buf, " x %d", maxCt[level]); - } - KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); + if (core_level < 0) + core_level = depth - 1; + int ncores = count[core_level]; + denominator_type = KMP_HW_UNKNOWN; + for (int level = 0; level < depth; ++level) { + int c; + bool plural; + numerator_type = types[level]; + c = ratio[level]; + plural = (c > 1); + if (level == 0) { + __kmp_str_buf_print(&buf, "%d %s", c, __kmp_hw_get_catalog_string( + numerator_type, plural)); + } else { + __kmp_str_buf_print(&buf, " x %d %s/%s", c, + __kmp_hw_get_catalog_string(numerator_type, plural), + __kmp_hw_get_catalog_string(denominator_type)); + } + denominator_type = numerator_type; + } + KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores); __kmp_str_buf_free(&buf); } + KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); @@ -1913,10 +1989,6 @@ __kmp_pu_os_idx[proc] = retval[proc].second; } if (__kmp_affinity_type == affinity_none) { - __kmp_free(last); - __kmp_free(maxCt); - __kmp_free(counts); - __kmp_free(totals); __kmp_free(retval); KMP_CPU_FREE(oldMask); return 0; @@ -1924,64 +1996,30 @@ // Find any levels with radix 1, and remove them from the map // (except for the package level). 
- int new_depth = 0; - for (level = 0; level < depth; level++) { - if ((maxCt[level] == 1) && (level != pkgLevel)) { - continue; - } - new_depth++; - } - - // If we are removing any levels, allocate a new vector to return, - // and copy the relevant information to it. - if (new_depth != depth) { - AddrUnsPair *new_retval = - (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); - for (proc = 0; (int)proc < nApics; proc++) { - Address addr(new_depth); - new_retval[proc] = AddrUnsPair(addr, retval[proc].second); - } - int new_level = 0; - int newPkgLevel = -1; - int newCoreLevel = -1; - int newThreadLevel = -1; - for (level = 0; level < depth; level++) { - if ((maxCt[level] == 1) && (level != pkgLevel)) { - // Remove this level. Never remove the package level - continue; - } - if (level == pkgLevel) { - newPkgLevel = new_level; - } - if (level == coreLevel) { - newCoreLevel = new_level; - } - if (level == threadLevel) { - newThreadLevel = new_level; - } - for (proc = 0; (int)proc < nApics; proc++) { - new_retval[proc].first.labels[new_level] = - retval[proc].first.labels[level]; - } - new_level++; - } - - __kmp_free(retval); - retval = new_retval; - depth = new_depth; - pkgLevel = newPkgLevel; - coreLevel = newCoreLevel; - threadLevel = newThreadLevel; + depth = __kmp_affinity_remove_radix_one_levels(retval, nApics, depth, types); + thread_level = core_level = die_level = socket_level = -1; + for (int level = 0; level < depth; ++level) { + if (types[level] == KMP_HW_THREAD) + thread_level = level; + else if (types[level] == KMP_HW_CORE) + core_level = level; + else if (types[level] == KMP_HW_DIE) + die_level = level; + else if (types[level] == KMP_HW_SOCKET) + socket_level = level; } if (__kmp_affinity_gran_levels < 0) { // Set the granularity level based on what levels are modeled // in the machine topology map. 
__kmp_affinity_gran_levels = 0; - if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { + if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { __kmp_affinity_gran_levels++; } - if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { + if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { + __kmp_affinity_gran_levels++; + } + if ((die_level >= 0) && (__kmp_affinity_gran > affinity_gran_die)) { __kmp_affinity_gran_levels++; } if (__kmp_affinity_gran > affinity_gran_package) { @@ -1990,14 +2028,9 @@ } if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, - threadLevel); + __kmp_affinity_print_topology(retval, nApics, depth, types); } - __kmp_free(last); - __kmp_free(maxCt); - __kmp_free(counts); - __kmp_free(totals); KMP_CPU_FREE(oldMask); *address2os = retval; return depth; @@ -3951,6 +3984,8 @@ } if (__kmp_hws_socket.num == 0) __kmp_hws_socket.num = nPackages; // use all available sockets + if (__kmp_hws_die.num == 0) + __kmp_hws_die.num = nDiesPerPkg; // use all available dies if (__kmp_hws_core.num == 0) __kmp_hws_core.num = nCoresPerPkg; // use all available cores if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) @@ -3959,7 +3994,7 @@ KMP_WARNING(AffHWSubsetNonUniform); goto _exit; // don't support non-uniform topology } - if (depth > 3) { + if (depth > 4) { KMP_WARNING(AffHWSubsetNonThreeLevel); goto _exit; // don't support not-3-level topology } @@ -3967,6 +4002,10 @@ KMP_WARNING(AffHWSubsetManySockets); goto _exit; } + if (depth == 4 && __kmp_hws_die.offset + __kmp_hws_die.num > nDiesPerPkg) { + KMP_WARNING(AffHWSubsetManyDies); + goto _exit; + } if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { KMP_WARNING(AffHWSubsetManyCores); goto _exit; @@ -3974,62 +4013,84 @@ // Form the requested subset if (pAddr) // pAddr is NULL in case of affinity_none newAddr = (AddrUnsPair *)__kmp_allocate( - 
sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * - __kmp_hws_proc.num); + sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_die.num * + __kmp_hws_core.num * __kmp_hws_proc.num); for (int i = 0; i < nPackages; ++i) { if (i < __kmp_hws_socket.offset || i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { // skip not-requested socket - n_old += nCoresPerPkg * __kmp_nThreadsPerCore; + n_old += nDiesPerPkg * nCoresPerPkg * __kmp_nThreadsPerCore; if (__kmp_pu_os_idx != NULL) { // walk through skipped socket - for (int j = 0; j < nCoresPerPkg; ++j) { - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; + for (int l = 0; l < nDiesPerPkg; ++l) { + for (int j = 0; j < nCoresPerPkg; ++j) { + for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { + KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); + ++proc_num; + } } } } } else { // walk through requested socket - for (int j = 0; j < nCoresPerPkg; ++j) { - if (j < __kmp_hws_core.offset || - j >= __kmp_hws_core.offset + - __kmp_hws_core.num) { // skip not-requested core - n_old += __kmp_nThreadsPerCore; + for (int l = 0; l < nDiesPerPkg; ++l) { + // skip unwanted die: step over every thread of every core in it + if (l < __kmp_hws_die.offset || + l >= __kmp_hws_die.offset + __kmp_hws_die.num) { + n_old += nCoresPerPkg * __kmp_nThreadsPerCore; if (__kmp_pu_os_idx != NULL) { - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { + for (int k = 0; k < nCoresPerPkg * __kmp_nThreadsPerCore; ++k) { KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); ++proc_num; } } } else { - // walk through requested core - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - if (k < __kmp_hws_proc.num) { - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - n_new++; + for (int j = 0; j < nCoresPerPkg; ++j) { + if (j < __kmp_hws_core.offset || + j >= __kmp_hws_core.offset + + __kmp_hws_core.num) { // skip not-requested core + n_old += __kmp_nThreadsPerCore; + if (__kmp_pu_os_idx != NULL) { + for
(int k = 0; k < __kmp_nThreadsPerCore; ++k) { + KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], + __kmp_affin_fullMask); + ++proc_num; + } + } } else { - if (__kmp_pu_os_idx != NULL) - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); + // walk through requested core + for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { + if (k < __kmp_hws_proc.num) { + if (pAddr) // collect requested thread's data + newAddr[n_new] = (*pAddr)[n_old]; + n_new++; + } else { + if (__kmp_pu_os_idx != NULL) + KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], + __kmp_affin_fullMask); + } + n_old++; + ++proc_num; + } } - n_old++; - ++proc_num; } } } } } - KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); + KMP_DEBUG_ASSERT(n_old == + nPackages * nDiesPerPkg * nCoresPerPkg * + __kmp_nThreadsPerCore); KMP_DEBUG_ASSERT(n_new == - __kmp_hws_socket.num * __kmp_hws_core.num * - __kmp_hws_proc.num); + __kmp_hws_socket.num * __kmp_hws_die.num * + __kmp_hws_core.num * __kmp_hws_proc.num); nPackages = __kmp_hws_socket.num; // correct nPackages nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg + nDiesPerPkg = __kmp_hws_die.num; // correct nDiesPerPkg __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore __kmp_avail_proc = n_new; // correct avail_proc - __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores + __kmp_ncores = + nPackages * nDiesPerPkg * __kmp_hws_core.num; // correct ncores } // non-hwloc topology method if (pAddr) { __kmp_free(*pAddr); @@ -4395,7 +4456,8 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { + else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || + __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { if (__kmp_affinity_verbose) { KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); } diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp --- a/openmp/runtime/src/kmp_global.cpp +++ 
b/openmp/runtime/src/kmp_global.cpp @@ -284,6 +284,7 @@ char *__kmp_affinity_format = NULL; kmp_hws_item_t __kmp_hws_socket = {0, 0}; +kmp_hws_item_t __kmp_hws_die = {0, 0}; kmp_hws_item_t __kmp_hws_node = {0, 0}; kmp_hws_item_t __kmp_hws_tile = {0, 0}; kmp_hws_item_t __kmp_hws_core = {0, 0}; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -2216,6 +2216,9 @@ set_gran(affinity_gran_tile, -1); buf = next; #endif + } else if (__kmp_match_str("die", buf, CCAST(const char **, &next))) { + set_gran(affinity_gran_die, -1); + buf = next; } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) { set_gran(affinity_gran_package, -1); buf = next; @@ -2856,6 +2859,13 @@ __kmp_affinity_dups = FALSE; kind = "\"tiles\""; #endif + } else if (__kmp_match_str("dice", scan, &next) || + __kmp_match_str("dies", scan, &next)) { + scan = next; + __kmp_affinity_type = affinity_compact; + __kmp_affinity_gran = affinity_gran_die; + __kmp_affinity_dups = FALSE; + kind = "\"dice\""; } else if (__kmp_match_str("sockets", scan, &next)) { scan = next; __kmp_affinity_type = affinity_compact; @@ -2986,28 +2996,38 @@ } #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - else if (__kmp_str_match("x2apic id", 9, value) || - __kmp_str_match("x2apic_id", 9, value) || - __kmp_str_match("x2apic-id", 9, value) || - __kmp_str_match("x2apicid", 8, value) || - __kmp_str_match("cpuid leaf 11", 13, value) || - __kmp_str_match("cpuid_leaf_11", 13, value) || - __kmp_str_match("cpuid-leaf-11", 13, value) || - __kmp_str_match("cpuid leaf11", 12, value) || - __kmp_str_match("cpuid_leaf11", 12, value) || - __kmp_str_match("cpuid-leaf11", 12, value) || - __kmp_str_match("cpuidleaf 11", 12, value) || - __kmp_str_match("cpuidleaf_11", 12, value) || - __kmp_str_match("cpuidleaf-11", 12, value) || - __kmp_str_match("cpuidleaf11", 11, value) || - __kmp_str_match("cpuid 11", 8, value) || - 
__kmp_str_match("cpuid_11", 8, value) || - __kmp_str_match("cpuid-11", 8, value) || - __kmp_str_match("cpuid11", 7, value) || - __kmp_str_match("leaf 11", 7, value) || - __kmp_str_match("leaf_11", 7, value) || - __kmp_str_match("leaf-11", 7, value) || - __kmp_str_match("leaf11", 6, value)) { + else if (__kmp_str_match("cpuid_leaf31", 12, value) || + __kmp_str_match("cpuid 1f", 8, value) || + __kmp_str_match("cpuid 31", 8, value) || + __kmp_str_match("cpuid1f", 7, value) || + __kmp_str_match("cpuid31", 7, value) || + __kmp_str_match("leaf 1f", 7, value) || + __kmp_str_match("leaf 31", 7, value) || + __kmp_str_match("leaf1f", 6, value) || + __kmp_str_match("leaf31", 6, value)) { + __kmp_affinity_top_method = affinity_top_method_x2apicid_1f; + } else if (__kmp_str_match("x2apic id", 9, value) || + __kmp_str_match("x2apic_id", 9, value) || + __kmp_str_match("x2apic-id", 9, value) || + __kmp_str_match("x2apicid", 8, value) || + __kmp_str_match("cpuid leaf 11", 13, value) || + __kmp_str_match("cpuid_leaf_11", 13, value) || + __kmp_str_match("cpuid-leaf-11", 13, value) || + __kmp_str_match("cpuid leaf11", 12, value) || + __kmp_str_match("cpuid_leaf11", 12, value) || + __kmp_str_match("cpuid-leaf11", 12, value) || + __kmp_str_match("cpuidleaf 11", 12, value) || + __kmp_str_match("cpuidleaf_11", 12, value) || + __kmp_str_match("cpuidleaf-11", 12, value) || + __kmp_str_match("cpuidleaf11", 11, value) || + __kmp_str_match("cpuid 11", 8, value) || + __kmp_str_match("cpuid_11", 8, value) || + __kmp_str_match("cpuid-11", 8, value) || + __kmp_str_match("cpuid11", 7, value) || + __kmp_str_match("leaf 11", 7, value) || + __kmp_str_match("leaf_11", 7, value) || + __kmp_str_match("leaf-11", 7, value) || + __kmp_str_match("leaf11", 6, value)) { __kmp_affinity_top_method = affinity_top_method_x2apicid; } else if (__kmp_str_match("apic id", 7, value) || __kmp_str_match("apic_id", 7, value) || @@ -4738,6 +4758,12 @@ __kmp_hws_node.num = num; __kmp_hws_node.offset = offset; break; + case 
'D': // Die + if (__kmp_hws_die.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_die.num = num; + __kmp_hws_die.offset = offset; + break; case 'L': // Cache if (*(pos + 1) == '2') { // L2 - Tile if (__kmp_hws_tile.num > 0) @@ -4745,7 +4771,7 @@ __kmp_hws_tile.num = num; __kmp_hws_tile.offset = offset; } else if (*(pos + 1) == '3') { // L3 - Socket - if (__kmp_hws_socket.num > 0) + if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0) goto err; // duplicate is not allowed __kmp_hws_socket.num = num; __kmp_hws_socket.offset = offset; @@ -4770,7 +4796,7 @@ __kmp_hws_tile.num = num; __kmp_hws_tile.offset = offset; } else if (*d == '3') { // L3 - Socket - if (__kmp_hws_socket.num > 0) + if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0) goto err; // duplicate is not allowed __kmp_hws_socket.num = num; __kmp_hws_socket.offset = offset; @@ -4817,6 +4843,12 @@ __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset); comma = 1; } + if (__kmp_hws_die.num) { + __kmp_str_buf_print(&buf, "%s%dd", comma ? "," : "", __kmp_hws_die.num); + if (__kmp_hws_die.offset) + __kmp_str_buf_print(&buf, "@%d", __kmp_hws_die.offset); + comma = 1; + } if (__kmp_hws_node.num) { __kmp_str_buf_print(&buf, "%s%dn", comma ? "," : "", __kmp_hws_node.num); if (__kmp_hws_node.offset)