diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -461,13 +461,14 @@ units), cores per socket, and the threads per core, to use with an OpenMP application, as an alternative to writing complicated explicit affinity settings or a limiting process affinity mask. You can also specify an offset value to set -which resources to use. +which resources to use. When available, you can specify attributes to select +different subsets of resources. An extended syntax is available when ``KMP_TOPOLOGY_METHOD=hwloc``. Depending on what resources are detected, you may be able to specify additional resources, such as NUMA domains and groups of hardware resources that share certain cache levels. -**Basic syntax:** ``num_unitsID[@offset] [,num_unitsID[@offset]...]`` +**Basic syntax:** ``num_unitsID[@offset][:attribute] [,num_unitsID[@offset][:attribute]...]`` Supported unit IDs are not case-insensitive. @@ -485,6 +486,14 @@ ``offset`` - (Optional) The number of units to skip. +``attribute`` - (Optional) An attribute differentiating resources at a particular level. The attributes available to users are: + +* **Core type** - On Intel architectures, this can be ``intel_atom`` or ``intel_core`` +* **Core efficiency** - This is specified as ``eff``:emphasis:`num` where :emphasis:`num` is a number from 0 + to the number of core efficiencies detected in the machine topology minus one. + E.g., ``eff0``. The greater the efficiency number the more performant the core. There may be + more core efficiencies than core types and can be viewed by setting ``KMP_AFFINITY=verbose`` + .. note:: The hardware cache can be specified as a unit, e.g. L2 for L2 cache, or LL for last level cache. @@ -513,7 +522,10 @@ * a resource is specified, but detection of that resource is not supported by the chosen topology detection method and/or -* a resource is specified twice. +* a resource is specified twice. 
An exception to this condition is if attributes + differentiate the resource. +* attributes are used when not detected in the machine topology or conflict with + each other. This variable does not work if ``KMP_AFFINITY=disabled``. @@ -532,6 +544,10 @@ * ``1T``: Use all cores on all sockets, 1 thread per core. * ``1s, 1d, 1n, 1c, 1t``: Use 1 socket, 1 die, 1 NUMA node, 1 core, 1 thread - use HW thread as a result. +* ``4c:intel_atom,5c:intel_core``: Use all available sockets and use 4 + Intel Atom(R) processor cores and 5 Intel(R) Core(TM) processor cores per socket. +* ``2c:eff0@1,3c:eff1``: Use all available sockets, skip the first core with efficiency 0 + and use the next 2 cores with efficiency 0 and 3 cores with efficiency 1 per socket. * ``1s, 1c, 1t``: Use 1 socket, 1 core, 1 thread. This may result in using single thread on a 3-layer topology architecture, or multiple threads on 4-layer or 5-layer architecture. Result may even be different on the same diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -361,6 +361,7 @@ TopologyGeneric "%1$s: %2$s (%3$d total cores)" AffGranularityBad "%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead." TopologyHybrid "%1$s: hybrid core type detected: %2$d %3$s cores." +TopologyHybridCoreEff "%1$s: %2$d with core efficiency %3$d." # --- OpenMP errors detected at runtime --- # @@ -472,6 +473,12 @@ AffGranTooCoarseProcGroup "%1$s: granularity=%2$s is too coarse, setting granularity=group." StgDeprecatedValue "%1$s: \"%2$s\" value is deprecated. Please use \"%3$s\" instead." NumTeamsNotPositive "num_teams value must be positive, it is %1$d, using %2$d instead." +AffHWSubsetIncompat "KMP_HW_SUBSET ignored: %1$s, %2$s: attributes are ambiguous, please only specify one." +AffHWSubsetAttrRepeat "KMP_HW_SUBSET ignored: %1$s: attribute specified more than once." 
+AffHWSubsetAttrInvalid "KMP_HW_SUBSET ignored: %1$s: attribute value %2$s is invalid." +AffHWSubsetAllFiltered "KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter." +AffHWSubsetAttrsNonHybrid "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecture." +AffHWSubsetIgnoringAttr "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecture." # -------------------------------------------------------------------------------------------------- -*- HINTS -*- @@ -530,6 +537,7 @@ "Check whether \"%1$s\" is a file for %2$s architecture." SystemLimitOnThreads "System-related limit on the number of threads." SetNewBound "Try setting new bounds (preferably less than or equal to %1$d) for num_teams clause." +ValidValuesRange "Valid values are from %1$d to %2$d." # -------------------------------------------------------------------------------------------------- diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -618,6 +618,19 @@ KMP_HW_LAST }; +typedef enum kmp_hw_core_type_t { + KMP_HW_CORE_TYPE_UNKNOWN = 0x0, +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + KMP_HW_CORE_TYPE_ATOM = 0x20, + KMP_HW_CORE_TYPE_CORE = 0x40, + KMP_HW_MAX_NUM_CORE_TYPES = 3, +#else + KMP_HW_MAX_NUM_CORE_TYPES = 1, +#endif +} kmp_hw_core_type_t; + +#define KMP_HW_MAX_NUM_CORE_EFFS 8 + #define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \ KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) #define KMP_ASSERT_VALID_HW_TYPE(type) \ @@ -629,6 +642,7 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false); const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false); +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type); /* Only Linux* OS and Windows* OS support thread affinity. 
*/ #if KMP_AFFINITY_SUPPORTED diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -598,16 +598,62 @@ #endif /* KMP_OS_WINDOWS */ #endif /* KMP_AFFINITY_SUPPORTED */ -typedef enum kmp_hw_core_type_t { - KMP_HW_CORE_TYPE_UNKNOWN = 0x0, -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 - KMP_HW_CORE_TYPE_ATOM = 0x20, - KMP_HW_CORE_TYPE_CORE = 0x40, - KMP_HW_MAX_NUM_CORE_TYPES = 3, -#else - KMP_HW_MAX_NUM_CORE_TYPES = 1, -#endif -} kmp_hw_core_type_t; +// Describe an attribute for a level in the machine topology +struct kmp_hw_attr_t { + int core_type : 8; + int core_eff : 8; + unsigned valid : 1; + unsigned reserved : 15; + + static const int UNKNOWN_CORE_EFF = -1; + + kmp_hw_attr_t() + : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF), + valid(0), reserved(0) {} + void set_core_type(kmp_hw_core_type_t type) { + valid = 1; + core_type = type; + } + void set_core_eff(int eff) { + valid = 1; + core_eff = eff; + } + kmp_hw_core_type_t get_core_type() const { + return (kmp_hw_core_type_t)core_type; + } + int get_core_eff() const { return core_eff; } + bool is_core_type_valid() const { + return core_type != KMP_HW_CORE_TYPE_UNKNOWN; + } + bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; } + operator bool() const { return valid; } + void clear() { + core_type = KMP_HW_CORE_TYPE_UNKNOWN; + core_eff = UNKNOWN_CORE_EFF; + valid = 0; + } + bool contains(const kmp_hw_attr_t &other) const { + if (!valid && !other.valid) + return true; + if (valid && other.valid) { + if (other.is_core_type_valid()) { + if (!is_core_type_valid() || (get_core_type() != other.get_core_type())) + return false; + } + if (other.is_core_eff_valid()) { + if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff())) + return false; + } + return true; + } + return false; + } + bool operator==(const kmp_hw_attr_t &rhs) const { + return (rhs.valid == valid && 
rhs.core_eff == core_eff && + rhs.core_type == core_type); + } + bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); } +}; class kmp_hw_thread_t { public: @@ -618,14 +664,14 @@ int sub_ids[KMP_HW_LAST]; bool leader; int os_id; - kmp_hw_core_type_t core_type; + kmp_hw_attr_t attrs; void print() const; void clear() { for (int i = 0; i < (int)KMP_HW_LAST; ++i) ids[i] = UNKNOWN_ID; leader = false; - core_type = KMP_HW_CORE_TYPE_UNKNOWN; + attrs.clear(); } }; @@ -653,10 +699,11 @@ // Storage containing the absolute number of each topology layer int *count; - // Storage containing the core types and the number of - // each core type for hybrid processors + // The number of core efficiencies. This is only useful for hybrid + // topologies. Core efficiencies will range from 0 to num efficiencies - 1 + int num_core_efficiencies; + int num_core_types; kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES]; - int core_types_count[KMP_HW_MAX_NUM_CORE_TYPES]; // The hardware threads array // hw_threads is num_hw_threads long @@ -704,19 +751,11 @@ // Set the last level cache equivalent type void _set_last_level_cache(); - // Increments the number of cores of type 'type' - void _increment_core_type(kmp_hw_core_type_t type) { - for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) { - if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN) { - core_types[i] = type; - core_types_count[i] = 1; - break; - } else if (core_types[i] == type) { - core_types_count[i]++; - break; - } - } - } + // Return the number of cores with a particular attribute, 'attr'. 
+ // If 'find_all' is true, then find all cores on the machine, otherwise find + // all cores per the layer 'above' + int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above, + bool find_all = false) const; public: // Force use of allocate()/deallocate() @@ -807,6 +846,16 @@ KMP_DEBUG_ASSERT(level >= 0 && level < depth); return count[level]; } + // Return the total number of cores with attribute 'attr' + int get_ncores_with_attr(const kmp_hw_attr_t &attr) const { + return _get_ncores_with_attr(attr, -1, true); + } + // Return the number of cores with attribute + // 'attr' per topology level 'above' + int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const { + return _get_ncores_with_attr(attr, above, false); + } + #if KMP_AFFINITY_SUPPORTED void sort_compact() { qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), @@ -819,11 +868,16 @@ extern kmp_topology_t *__kmp_topology; class kmp_hw_subset_t { + const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS; + public: + // Describe a machine topology item in KMP_HW_SUBSET struct item_t { - int num; kmp_hw_t type; - int offset; + int num_attrs; + int num[MAX_ATTRS]; + int offset[MAX_ATTRS]; + kmp_hw_attr_t attr[MAX_ATTRS]; }; private: @@ -869,7 +923,20 @@ } void set_absolute() { absolute = true; } bool is_absolute() const { return absolute; } - void push_back(int num, kmp_hw_t type, int offset) { + void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) { + for (int i = 0; i < depth; ++i) { + // Found an existing item for this layer type + // Add the num, offset, and attr to this item + if (items[i].type == type) { + int idx = items[i].num_attrs++; + if ((size_t)idx >= MAX_ATTRS) + return; + items[i].num[idx] = num; + items[i].offset[idx] = offset; + items[i].attr[idx] = attr; + return; + } + } if (depth == capacity - 1) { capacity *= 2; item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); @@ -878,9 +945,11 @@ __kmp_free(items); items = new_items; } - 
items[depth].num = num; + items[depth].num_attrs = 1; items[depth].type = type; - items[depth].offset = offset; + items[depth].num[0] = num; + items[depth].offset[0] = offset; + items[depth].attr[0] = attr; depth++; set |= (1ull << type); } @@ -912,8 +981,19 @@ printf("* depth: %d\n", depth); printf("* items:\n"); for (int i = 0; i < depth; ++i) { - printf("num: %d, type: %s, offset: %d\n", items[i].num, - __kmp_hw_get_keyword(items[i].type), items[i].offset); + printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type)); + for (int j = 0; j < items[i].num_attrs; ++j) { + printf(" num: %d, offset: %d, attr: ", items[i].num[j], + items[i].offset[j]); + if (!items[i].attr[j]) { + printf(" (none)\n"); + } else { + printf( + " core_type = %s, core_eff = %d\n", + __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()), + items[i].attr[j].get_core_eff()); + } + } } printf("* set: 0x%llx\n", set); printf("* absolute: %d\n", absolute); diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -189,8 +189,11 @@ for (int i = 0; i < depth; ++i) { printf("%4d ", ids[i]); } - if (core_type != KMP_HW_CORE_TYPE_UNKNOWN) { - printf(" (%s)", __kmp_hw_get_core_type_string(core_type)); + if (attrs) { + if (attrs.is_core_type_valid()) + printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type())); + if (attrs.is_core_eff_valid()) + printf(" (eff=%d)", attrs.get_core_eff()); } printf("\n"); } @@ -391,12 +394,6 @@ count[i] = 0; ratio[i] = 0; } - if (__kmp_is_hybrid_cpu()) { - for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) { - core_types_count[i] = 0; - core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; - } - } int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; @@ -413,9 +410,29 @@ ratio[l] = max[l]; max[l] = 1; } - // Figure out the number of each core type for hybrid CPUs - if 
(__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) - _increment_core_type(hw_thread.core_type); + // Figure out the number of different core types + // and efficiencies for hybrid CPUs + if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) { + if (hw_thread.attrs.is_core_eff_valid() && + hw_thread.attrs.core_eff >= num_core_efficiencies) { + // Because efficiencies can range from 0 to max efficiency - 1, + // the number of efficiencies is max efficiency + 1 + num_core_efficiencies = hw_thread.attrs.core_eff + 1; + } + if (hw_thread.attrs.is_core_type_valid()) { + bool found = false; + for (int j = 0; j < num_core_types; ++j) { + if (hw_thread.attrs.get_core_type() == core_types[j]) { + found = true; + break; + } + } + if (!found) { + KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES); + core_types[num_core_types++] = hw_thread.attrs.get_core_type(); + } + } + } break; } } @@ -429,6 +446,42 @@ } } +int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr, + int above_level, + bool find_all) const { + int current, current_max; + int previous_id[KMP_HW_LAST]; + for (int i = 0; i < depth; ++i) + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; + int core_level = get_level(KMP_HW_CORE); + if (find_all) + above_level = -1; + KMP_ASSERT(above_level < core_level); + current_max = 0; + current = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) { + if (current > current_max) + current_max = current; + current = hw_thread.attrs.contains(attr); + } else { + for (int level = above_level + 1; level <= core_level; ++level) { + if (hw_thread.ids[level] != previous_id[level]) { + if (hw_thread.attrs.contains(attr)) + current++; + break; + } + } + } + for (int level = 0; level < depth; ++level) + previous_id[level] = hw_thread.ids[level]; + } + if (current > current_max) + current_max = current; + return current_max; +} + 
// Find out if the topology is uniform void kmp_topology_t::_discover_uniformity() { int num = 1; @@ -517,6 +570,10 @@ retval->types = (kmp_hw_t *)arr; retval->ratio = arr + (size_t)KMP_HW_LAST; retval->count = arr + 2 * (size_t)KMP_HW_LAST; + retval->num_core_efficiencies = 0; + retval->num_core_types = 0; + for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) + retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } for (int i = 0; i < ndepth; ++i) { retval->types[i] = types[i]; @@ -574,18 +631,12 @@ } printf("\n"); - printf("* core_types:\n"); - for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) { - if (core_types[i] != KMP_HW_CORE_TYPE_UNKNOWN) { - printf(" %d %s core%c\n", core_types_count[i], - __kmp_hw_get_core_type_string(core_types[i]), - ((core_types_count[i] > 1) ? 's' : ' ')); - } else { - if (i == 0) - printf("No hybrid information available\n"); - break; - } - } + printf("* num_core_eff: %d\n", num_core_efficiencies); + printf("* num_core_types: %d\n", num_core_types); + printf("* core_types: "); + for (int i = 0; i < num_core_types; ++i) + printf("%3d ", core_types[i]); + printf("\n"); printf("* equivalent map:\n"); KMP_FOREACH_HW_TYPE(i) { @@ -680,12 +731,26 @@ } KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + // Hybrid topology information if (__kmp_is_hybrid_cpu()) { - for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) { - if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN) - break; - KMP_INFORM(TopologyHybrid, env_var, core_types_count[i], - __kmp_hw_get_core_type_string(core_types[i])); + for (int i = 0; i < num_core_types; ++i) { + kmp_hw_core_type_t core_type = core_types[i]; + kmp_hw_attr_t attr; + attr.clear(); + attr.set_core_type(core_type); + int ncores = get_ncores_with_attr(attr); + if (ncores > 0) { + KMP_INFORM(TopologyHybrid, env_var, ncores, + __kmp_hw_get_core_type_string(core_type)); + KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS) + for (int 
eff = 0; eff < num_core_efficiencies; ++eff) { + attr.set_core_eff(eff); + int ncores_with_eff = get_ncores_with_attr(attr); + if (ncores_with_eff > 0) { + KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff); + } + } + } } } @@ -705,7 +770,8 @@ } if (__kmp_is_hybrid_cpu()) __kmp_str_buf_print( - &buf, "(%s)", __kmp_hw_get_core_type_string(hw_threads[i].core_type)); + &buf, "(%s)", + __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type())); KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); } @@ -816,6 +882,56 @@ _discover_uniformity(); } +// Represents running sub IDs for a single core attribute where +// attribute values have SIZE possibilities. +template struct kmp_sub_ids_t { + int last_level; // last level in topology to consider for sub_ids + int sub_id[SIZE]; // The sub ID for a given attribute value + int prev_sub_id[KMP_HW_LAST]; + IndexFunc indexer; + +public: + kmp_sub_ids_t(int last_level) : last_level(last_level) { + KMP_ASSERT(last_level < KMP_HW_LAST); + for (size_t i = 0; i < SIZE; ++i) + sub_id[i] = -1; + for (size_t i = 0; i < KMP_HW_LAST; ++i) + prev_sub_id[i] = -1; + } + void update(const kmp_hw_thread_t &hw_thread) { + int idx = indexer(hw_thread); + KMP_ASSERT(idx < (int)SIZE); + for (int level = 0; level <= last_level; ++level) { + if (hw_thread.sub_ids[level] != prev_sub_id[level]) { + if (level < last_level) + sub_id[idx] = -1; + sub_id[idx]++; + break; + } + } + for (int level = 0; level <= last_level; ++level) + prev_sub_id[level] = hw_thread.sub_ids[level]; + } + int get_sub_id(const kmp_hw_thread_t &hw_thread) const { + return sub_id[indexer(hw_thread)]; + } +}; + +static kmp_str_buf_t * +__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, + bool plural) { + __kmp_str_buf_init(buf); + if (attr.is_core_type_valid()) + __kmp_str_buf_print(buf, "%s %s", + __kmp_hw_get_core_type_string(attr.get_core_type()), + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural)); + else + 
__kmp_str_buf_print(buf, "%s eff=%d", + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural), + attr.get_core_eff()); + return buf; +} + // Apply the KMP_HW_SUBSET envirable to the topology // Returns true if KMP_HW_SUBSET filtered any processors // otherwise, returns false @@ -828,17 +944,23 @@ __kmp_hw_subset->sort(); // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + bool using_core_types = false; + bool using_core_effs = false; int hw_subset_depth = __kmp_hw_subset->get_depth(); kmp_hw_t specified[KMP_HW_LAST]; + int topology_levels[hw_subset_depth]; KMP_ASSERT(hw_subset_depth > 0); KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + int core_level = get_level(KMP_HW_CORE); for (int i = 0; i < hw_subset_depth; ++i) { int max_count; - int num = __kmp_hw_subset->at(i).num; - int offset = __kmp_hw_subset->at(i).offset; - kmp_hw_t type = __kmp_hw_subset->at(i).type; + const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i); + int num = item.num[0]; + int offset = item.offset[0]; + kmp_hw_t type = item.type; kmp_hw_t equivalent_type = equivalent[type]; int level = get_level(type); + topology_levels[i] = level; // Check to see if current layer is in detected machine topology if (equivalent_type != KMP_HW_UNKNOWN) { @@ -849,8 +971,8 @@ return false; } - // Check to see if current layer has already been specified - // either directly or through an equivalent type + // Check to see if current layer has already been + // specified either directly or through an equivalent type if (specified[equivalent_type] != KMP_HW_UNKNOWN) { KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), __kmp_hw_get_catalog_string(specified[equivalent_type])); @@ -866,41 +988,233 @@ __kmp_hw_get_catalog_string(type, plural)); return false; } + + // Check to see if core attributes are consistent + if (core_level == level) { + // Determine which core attributes are specified + for (int j = 0; j < item.num_attrs; ++j) { + if 
(item.attr[j].is_core_type_valid()) + using_core_types = true; + if (item.attr[j].is_core_eff_valid()) + using_core_effs = true; + } + + // Check if using a single core attribute on non-hybrid arch. + // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute. + // + // Check if using multiple core attributes on non-hybrid arch. + // Ignore all of KMP_HW_SUBSET if this is the case. + if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { + if (item.num_attrs == 1) { + if (using_core_effs) { + KMP_WARNING(AffHWSubsetIgnoringAttr, "efficiency"); + } else { + KMP_WARNING(AffHWSubsetIgnoringAttr, "core_type"); + } + using_core_effs = false; + using_core_types = false; + } else { + KMP_WARNING(AffHWSubsetAttrsNonHybrid); + return false; + } + } + + // Check if using both core types and core efficiencies together + if (using_core_types && using_core_effs) { + KMP_WARNING(AffHWSubsetIncompat, "core_type", "efficiency"); + return false; + } + + // Check that core efficiency values are valid + if (using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_eff_valid()) { + int core_eff = item.attr[j].get_core_eff(); + if (core_eff < 0 || core_eff >= num_core_efficiencies) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff()); + __kmp_msg(kmp_ms_warning, + KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str), + KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1), + __kmp_msg_null); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + // Check that the number of requested cores with attributes is valid + if (using_core_types || using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + int num = item.num[j]; + int offset = item.offset[j]; + int level_above = core_level - 1; + if (level_above >= 0) { + max_count = get_ncores_with_attr_per(item.attr[j], level_above); + if (max_count <= 0 || num + offset > max_count) { + kmp_str_buf_t buf; + 
__kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); + KMP_WARNING(AffHWSubsetManyGeneric, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + if ((using_core_types || using_core_effs) && item.num_attrs > 1) { + for (int j = 0; j < item.num_attrs; ++j) { + // Ambiguous use of specific core attribute + generic core + // e.g., 4c & 3c:intel_core or 4c & 3c:eff1 + if (!item.attr[j]) { + kmp_hw_attr_t other_attr; + for (int k = 0; k < item.num_attrs; ++k) { + if (item.attr[k] != item.attr[j]) { + other_attr = item.attr[k]; + break; + } + } + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); + KMP_WARNING(AffHWSubsetIncompat, + __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); + __kmp_str_buf_free(&buf); + return false; + } + // Allow specifying a specific core type or core eff exactly once + for (int k = 0; k < j; ++k) { + if (!item.attr[j] || !item.attr[k]) + continue; + if (item.attr[k] == item.attr[j]) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, + item.num[j] > 0); + KMP_WARNING(AffHWSubsetAttrRepeat, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + } } - // Apply the filtered hardware subset - int new_index = 0; + struct core_type_indexer { + int operator()(const kmp_hw_thread_t &t) const { + switch (t.attrs.get_core_type()) { + case KMP_HW_CORE_TYPE_ATOM: + return 1; + case KMP_HW_CORE_TYPE_CORE: + return 2; + case KMP_HW_CORE_TYPE_UNKNOWN: + return 0; + } + KMP_ASSERT(0); + return 0; + } + }; + struct core_eff_indexer { + int operator()(const kmp_hw_thread_t &t) const { + return t.attrs.get_core_eff(); + } + }; + + kmp_sub_ids_t core_type_sub_ids( + core_level); + kmp_sub_ids_t core_eff_sub_ids( + core_level); + + // Determine which hardware threads should be filtered. 
+ int num_filtered = 0; + bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Update type_sub_id + if (using_core_types) + core_type_sub_ids.update(hw_thread); + if (using_core_effs) + core_eff_sub_ids.update(hw_thread); + // Check to see if this hardware thread should be filtered bool should_be_filtered = false; - for (int level = 0, hw_subset_index = 0; - level < depth && hw_subset_index < hw_subset_depth; ++level) { - kmp_hw_t topology_type = types[level]; - auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); - kmp_hw_t hw_subset_type = hw_subset_item.type; - if (topology_type != hw_subset_type) + for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth; + ++hw_subset_index) { + const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + int level = topology_levels[hw_subset_index]; + if (level == -1) continue; - int num = hw_subset_item.num; - int offset = hw_subset_item.offset; - hw_subset_index++; - if (hw_thread.sub_ids[level] < offset || - hw_thread.sub_ids[level] >= offset + num) { - should_be_filtered = true; - break; + if ((using_core_effs || using_core_types) && level == core_level) { + // Look for the core attribute in KMP_HW_SUBSET which corresponds + // to this hardware thread's core attribute. Use this num,offset plus + // the running sub_id for the particular core attribute of this hardware + // thread to determine if the hardware thread should be filtered or not. 
+ int attr_idx; + kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type(); + int core_eff = hw_thread.attrs.get_core_eff(); + for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) { + if (using_core_types && + hw_subset_item.attr[attr_idx].get_core_type() == core_type) + break; + if (using_core_effs && + hw_subset_item.attr[attr_idx].get_core_eff() == core_eff) + break; + } + // This core attribute isn't in the KMP_HW_SUBSET so always filter it. + if (attr_idx == hw_subset_item.num_attrs) { + should_be_filtered = true; + break; + } + int sub_id; + int num = hw_subset_item.num[attr_idx]; + int offset = hw_subset_item.offset[attr_idx]; + if (using_core_types) + sub_id = core_type_sub_ids.get_sub_id(hw_thread); + else + sub_id = core_eff_sub_ids.get_sub_id(hw_thread); + if (sub_id < offset || sub_id >= offset + num) { + should_be_filtered = true; + break; + } + } else { + int num = hw_subset_item.num[0]; + int offset = hw_subset_item.offset[0]; + if (hw_thread.sub_ids[level] < offset || + hw_thread.sub_ids[level] >= offset + num) { + should_be_filtered = true; + break; + } } } - if (!should_be_filtered) { + // Collect filtering information + filtered[i] = should_be_filtered; + if (should_be_filtered) + num_filtered++; + } + + // One last check that we shouldn't allow filtering entire machine + if (num_filtered == num_hw_threads) { + KMP_WARNING(AffHWSubsetAllFiltered); + __kmp_free(filtered); + return false; + } + + // Apply the filter + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + if (!filtered[i]) { if (i != new_index) - hw_threads[new_index] = hw_thread; + hw_threads[new_index] = hw_threads[i]; new_index++; } else { #if KMP_AFFINITY_SUPPORTED - KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); + KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask); #endif __kmp_avail_proc--; } } + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); num_hw_threads = new_index; @@ -909,6 +1223,7 @@ _discover_uniformity(); _set_globals(); 
_set_last_level_cache(); + __kmp_free(filtered); return true; } @@ -1461,8 +1776,10 @@ break; } } - if (cpukind_index >= 0) - hw_thread.core_type = cpukinds[cpukind_index].core_type; + if (cpukind_index >= 0) { + hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type); + hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); + } } index--; } @@ -2040,11 +2357,21 @@ // Hybrid cpu detection using CPUID.1A // Thread should be pinned to processor already -static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, +static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency, unsigned *native_model_id) { kmp_cpuid buf; __kmp_x86_cpuid(0x1a, 0, &buf); *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax); + switch (*type) { + case KMP_HW_CORE_TYPE_ATOM: + *efficiency = 0; + break; + case KMP_HW_CORE_TYPE_CORE: + *efficiency = 1; + break; + default: + *efficiency = 0; + } *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); } @@ -2321,8 +2648,10 @@ if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { kmp_hw_core_type_t type; unsigned native_model_id; - __kmp_get_hybrid_info(&type, &native_model_id); - hw_thread.core_type = type; + int efficiency; + __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); + hw_thread.attrs.set_core_type(type); + hw_thread.attrs.set_core_eff(efficiency); } hw_thread_index++; } diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -4961,28 +4961,76 @@ // Check each component for (int i = 0; i < level; ++i) { - int offset = 0; - int num = atoi(components[i]); // each component should start with a number - if (num <= 0) { - goto err; // only positive integers are valid for count - } - if ((pos = strchr(components[i], '@'))) { - offset = atoi(pos + 1); // save offset - *pos = '\0'; // cut the offset from the component - } - pos = components[i] + strspn(components[i], digits); 
- if (pos == components[i]) { - goto err; - } - // detect the component type - kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); - if (type == KMP_HW_UNKNOWN) { - goto err; - } - if (__kmp_hw_subset->specified(type)) { - goto err; + int core_level = 0; + char *core_components[MAX_T_LEVEL]; + // Split possible core components by '&' delimiter + pos = components[i]; + core_components[core_level++] = pos; + while ((pos = strchr(pos, '&'))) { + if (core_level >= MAX_T_LEVEL) + goto err; // too many different core types + *pos = '\0'; // modify input and avoid more copying + core_components[core_level++] = ++pos; // expect something after '&' + } + + for (int j = 0; j < core_level; ++j) { + char *offset_ptr; + char *attr_ptr; + int offset = 0; + kmp_hw_attr_t attr; + int num = + atoi(core_components[j]); // each component should start with a number + if (num <= 0) { + goto err; // only positive integers are valid for count + } + + offset_ptr = strchr(core_components[j], '@'); + attr_ptr = strchr(core_components[j], ':'); + + if (offset_ptr) { + offset = atoi(offset_ptr + 1); // save offset + *offset_ptr = '\0'; // cut the offset from the component + } + if (attr_ptr) { + attr.clear(); + // save the attribute + if (__kmp_str_match("intel_core", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_CORE); + } else if (__kmp_str_match("intel_atom", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_ATOM); + } else if (__kmp_str_match("eff", 3, attr_ptr + 1)) { + const char *number = attr_ptr + 1; + // skip the eff[iciency] token + while (isalpha(*number)) + number++; + if (!isdigit(*number)) { + goto err; + } + int efficiency = atoi(number); + attr.set_core_eff(efficiency); + } else { + goto err; + } + *attr_ptr = '\0'; // cut the attribute from the component + } + pos = core_components[j] + strspn(core_components[j], digits); + if (pos == core_components[j]) { + goto err; + } + // detect the component type + kmp_hw_t type = 
__kmp_stg_parse_hw_subset_name(pos); + if (type == KMP_HW_UNKNOWN) { + goto err; + } + // Only the core type can have attributes + if (attr && type != KMP_HW_CORE) + goto err; + // Must allow core to be specified more than once + if (type != KMP_HW_CORE && __kmp_hw_subset->specified(type)) { + goto err; + } + __kmp_hw_subset->push_back(num, type, offset, attr); } - __kmp_hw_subset->push_back(num, type, offset); } return; err: @@ -4994,6 +5042,21 @@ return; } +static inline const char * +__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "intel_atom"; + case KMP_HW_CORE_TYPE_CORE: + return "intel_core"; +#endif + } + return "unknown"; +} + static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, void *data) { kmp_str_buf_t buf; @@ -5009,10 +5072,20 @@ depth = __kmp_hw_subset->get_depth(); for (int i = 0; i < depth; ++i) { const auto &item = __kmp_hw_subset->at(i); - __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? "," : ""), item.num, - __kmp_hw_get_keyword(item.type)); - if (item.offset) - __kmp_str_buf_print(&buf, "@%d", item.offset); + if (i > 0) + __kmp_str_buf_print(&buf, "%c", ','); + for (int j = 0; j < item.num_attrs; ++j) { + __kmp_str_buf_print(&buf, "%s%d%s", (j > 0 ? "&" : ""), item.num[j], + __kmp_hw_get_keyword(item.type)); + if (item.attr[j].is_core_type_valid()) + __kmp_str_buf_print( + &buf, ":%s", + __kmp_hw_get_core_type_keyword(item.attr[j].get_core_type())); + if (item.attr[j].is_core_eff_valid()) + __kmp_str_buf_print(&buf, ":eff%d", item.attr[j].get_core_eff()); + if (item.offset[j]) + __kmp_str_buf_print(&buf, "@%d", item.offset[j]); + } } __kmp_str_buf_print(buffer, "%s'\n", buf.str); __kmp_str_buf_free(&buf);