Index: openmp/trunk/runtime/src/dllexports =================================================================== --- openmp/trunk/runtime/src/dllexports +++ openmp/trunk/runtime/src/dllexports @@ -351,7 +351,7 @@ %ifdef OMP_30 __kmpc_omp_taskyield 235 %endif # OMP_30 - __kmpc_place_threads 236 +# __kmpc_place_threads 236 %endif # OpenMP 4.0 entry points Index: openmp/trunk/runtime/src/i18n/en_US.txt =================================================================== --- openmp/trunk/runtime/src/i18n/en_US.txt +++ openmp/trunk/runtime/src/i18n/en_US.txt @@ -38,7 +38,7 @@ Country "USA" LangId "1033" Version "2" -Revision "20160714" +Revision "20161216" @@ -388,8 +388,8 @@ EnvLockWarn "%1$s must be set prior to first OMP lock call or critical section; ignored." FutexNotSupported "futex system call not supported; %1$s=%2$s ignored." AffGranUsing "%1$s: granularity=%2$s will be used." -AffHWSubsetInvalid "%1$s: invalid value \"%2$s\", valid format is \"Ns[@N],Nc[@N],Nt " - "(nSockets@offset, nCores@offset, nTthreads per core)\"." +AffHWSubsetInvalid "%1$s: invalid value \"%2$s\", valid format is \"N[@N][,...][,Nt] " + "( can be S, N, L2, C, T for Socket, NUMA Node, L2 Cache, Core, Thread)\"." AffHWSubsetUnsupported "KMP_HW_SUBSET ignored: unsupported architecture." AffHWSubsetManyCores "KMP_HW_SUBSET ignored: too many cores requested." SyntaxErrorUsing "%1$s: syntax error, using %2$s." @@ -411,6 +411,10 @@ EnvSerialWarn "%1$s must be set prior to OpenMP runtime library initialization; ignored." EnvVarDeprecated "%1$s variable deprecated, please use %2$s instead." RedMethodNotSupported "KMP_FORCE_REDUCTION: %1$s method is not supported; using critical." +AffHWSubsetNoHWLOC "KMP_HW_SUBSET ignored: unsupported item requested for non-HWLOC topology method (KMP_TOPOLOGY_METHOD)" +AffHWSubsetManyNodes "KMP_HW_SUBSET ignored: too many NUMA Nodes requested." +AffHWSubsetManyTiles "KMP_HW_SUBSET ignored: too many L2 Caches requested." +AffHWSubsetManyProcs "KMP_HW_SUBSET ignored: too many Procs requested." # -------------------------------------------------------------------------------------------------- Index: openmp/trunk/runtime/src/kmp.h =================================================================== --- openmp/trunk/runtime/src/kmp.h +++ openmp/trunk/runtime/src/kmp.h @@ -774,11 +774,19 @@ } kmp_cancel_kind_t; #endif // OMP_40_ENABLED -extern int __kmp_place_num_sockets; -extern int __kmp_place_socket_offset; -extern int __kmp_place_num_cores; -extern int __kmp_place_core_offset; -extern int __kmp_place_num_threads_per_core; +// KMP_HW_SUBSET support: +typedef struct kmp_hws_item { + int num; + int offset; +} kmp_hws_item_t; + +extern kmp_hws_item_t __kmp_hws_socket; +extern kmp_hws_item_t __kmp_hws_node; +extern kmp_hws_item_t __kmp_hws_tile; +extern kmp_hws_item_t __kmp_hws_core; +extern kmp_hws_item_t __kmp_hws_proc; +extern int __kmp_hws_requested; +extern int __kmp_hws_abs_flag; // absolute or per-item number requested /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ @@ -3494,9 +3502,6 @@ KMP_EXPORT kmp_uint64 __kmpc_get_taskid(); KMP_EXPORT kmp_uint64 __kmpc_get_parent_taskid(); -// this function exported for testing of KMP_PLACE_THREADS functionality -KMP_EXPORT void __kmpc_place_threads(int,int,int,int,int); - /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ Index: openmp/trunk/runtime/src/kmp_affinity.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_affinity.cpp +++ openmp/trunk/runtime/src/kmp_affinity.cpp @@ -3405,102 +3405,665 @@ #undef ADD_MASK #undef ADD_MASK_OSID +#if KMP_USE_HWLOC +static int +__kmp_hwloc_count_children_by_type( + hwloc_topology_t t, hwloc_obj_t o, hwloc_obj_type_t type, hwloc_obj_t* f) +{ + if (!hwloc_compare_types(o->type, type)) { + if (*f == NULL) + *f = o; // output first descendant found + return 1; + } + int sum = 0; + for (unsigned i = 0; i < o->arity; i++) + sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); + return sum; // will be 0 if no one found (as PU arity is 0) +} + +static int +__kmp_hwloc_count_children_by_depth( + hwloc_topology_t t, hwloc_obj_t o, unsigned depth, hwloc_obj_t* f) +{ + if (o->depth == depth) { + if (*f == NULL) + *f = o; // output first descendant found + return 1; + } + int sum = 0; + for (unsigned i = 0; i < o->arity; i++) + sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); + return sum; // will be 0 if no one found (as PU arity is 0) +} + +static int +__kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) +{ // skip PUs descendants of the object o + int skipped = 0; + hwloc_obj_t hT = NULL; + int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); + for (int i = 0; i < N; ++i) { + KMP_DEBUG_ASSERT(hT); + unsigned idx = hT->os_index; + if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { + KMP_CPU_CLR(idx, __kmp_affin_fullMask); + KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); + ++skipped; + } + hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); + } + return skipped; // count number of skipped units +} + +static int +__kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) +{ // check if obj has PUs present in fullMask + hwloc_obj_t hT = NULL; + int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); + for (int i = 0; i < N; ++i) { + KMP_DEBUG_ASSERT(hT); + unsigned idx = hT->os_index; + if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) + return 1; // found PU + hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); + } + return 0; // no PUs found +} +#endif // KMP_USE_HWLOC + static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { - int i, j, k, n_old = 0, n_new = 0, proc_num = 0; - if (__kmp_place_num_sockets == 0 && - __kmp_place_num_cores == 0 && - __kmp_place_num_threads_per_core == 0 ) - goto _exit; // no topology limiting actions requested, exit - if (__kmp_place_num_sockets == 0) - __kmp_place_num_sockets = nPackages; // use all available sockets - if (__kmp_place_num_cores == 0) - __kmp_place_num_cores = nCoresPerPkg; // use all available cores - if (__kmp_place_num_threads_per_core == 0 || - __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore) - __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts + AddrUnsPair *newAddr; + if (__kmp_hws_requested == 0) + goto _exit; // no topology limiting actions requested, exit +#if KMP_USE_HWLOC + if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { + // Number of subobjects calculated dynamically, this works fine for + // any non-uniform topology. + // L2 cache objects are determined by depth, other objects - by type. + hwloc_topology_t tp = __kmp_hwloc_topology; + int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped + int nCr=0, nTr=0; // number of requested units + int nPkg=0, nCo=0, n_new=0, n_old = 0, nCpP=0, nTpC=0; // counters + hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) + int L2depth, idx; + + // check support of extensions ---------------------------------- + int numa_support = 0, tile_support = 0; + if (__kmp_pu_os_idx) + hT = hwloc_get_pu_obj_by_os_index( + tp, __kmp_pu_os_idx[__kmp_avail_proc - 1]); + else + hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); + if (hT == NULL) { // something's gone wrong + KMP_WARNING(AffHWSubsetUnsupported); + goto _exit; + } + // check NUMA node + hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); + hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); + if (hN != NULL && hN->depth > hS->depth) { + numa_support = 1; // 1 in case socket includes node(s) + } else if (__kmp_hws_node.num > 0) { + // don't support sockets inside NUMA node (no such HW found for testing) + KMP_WARNING(AffHWSubsetUnsupported); + goto _exit; + } + // check L2 cahce, get object by depth because of multiple caches + L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); + hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); + if (hL != NULL && __kmp_hwloc_count_children_by_type( + tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { + tile_support = 1; // no sense to count L2 if it includes single core + } else if (__kmp_hws_tile.num > 0) { + if (__kmp_hws_core.num == 0) { + __kmp_hws_core = __kmp_hws_tile; // replace L2 with core + __kmp_hws_tile.num = 0; + } else { + // L2 and core are both requested, but represent same object + KMP_WARNING(AffHWSubsetInvalid); + goto _exit; + } + } + // end of check of extensions ----------------------------------- + + // fill in unset items, validate settings ----------------------- + if (__kmp_hws_socket.num == 0) + __kmp_hws_socket.num = nPackages; // use all available sockets + if (__kmp_hws_socket.offset >= nPackages) { + KMP_WARNING(AffHWSubsetManySockets); + goto _exit; + } + if (numa_support) { + int NN = __kmp_hwloc_count_children_by_type( + tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in socket + if (__kmp_hws_node.num == 0) + __kmp_hws_node.num = NN; // use all available nodes + if (__kmp_hws_node.offset >= NN) { + KMP_WARNING(AffHWSubsetManyNodes); + goto _exit; + } + if (tile_support) { + // get num tiles in node + int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); + if (__kmp_hws_tile.num == 0) { + __kmp_hws_tile.num = NL + 1; + } // use all available tiles, some node may have more tiles, thus +1 + if (__kmp_hws_tile.offset >= NL) { + KMP_WARNING(AffHWSubsetManyTiles); + goto _exit; + } + int NC = __kmp_hwloc_count_children_by_type( + tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile + if (__kmp_hws_core.num == 0) + __kmp_hws_core.num = NC; // use all available cores + if (__kmp_hws_core.offset >= NC) { + KMP_WARNING(AffHWSubsetManyCores); + goto _exit; + } + } else { // tile_support + int NC = __kmp_hwloc_count_children_by_type( + tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in node + if (__kmp_hws_core.num == 0) + __kmp_hws_core.num = NC; // use all available cores + if (__kmp_hws_core.offset >= NC) { + KMP_WARNING(AffHWSubsetManyCores); + goto _exit; + } + } // tile_support + } else { // numa_support + if (tile_support) { + // get num tiles in socket + int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); + if (__kmp_hws_tile.num == 0) + __kmp_hws_tile.num = NL; // use all available tiles + if (__kmp_hws_tile.offset >= NL) { + KMP_WARNING(AffHWSubsetManyTiles); + goto _exit; + } + int NC = __kmp_hwloc_count_children_by_type( + tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile + if (__kmp_hws_core.num == 0) + __kmp_hws_core.num = NC; // use all available cores + if (__kmp_hws_core.offset >= NC) { + KMP_WARNING(AffHWSubsetManyCores); + goto _exit; + } + } else { // tile_support + int NC = __kmp_hwloc_count_children_by_type( + tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket + if (__kmp_hws_core.num == 0) + __kmp_hws_core.num = NC; // use all available cores + if (__kmp_hws_core.offset >= NC) { + KMP_WARNING(AffHWSubsetManyCores); + goto _exit; + } + } // tile_support + } + if (__kmp_hws_proc.num == 0) + __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs + if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { + KMP_WARNING(AffHWSubsetManyProcs); + goto _exit; + } + // end of validation -------------------------------------------- - if ( !__kmp_affinity_uniform_topology() ) { + if (pAddr) // pAddr is NULL in case of affinity_none + newAddr = (AddrUnsPair *)__kmp_allocate( + sizeof(AddrUnsPair) * __kmp_avail_proc); // max size + // main loop to form HW subset ---------------------------------- + hS = NULL; + int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); + for (int s = 0; s < NP; ++s) { + // Check Socket ----------------------------------------------- + hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); + if (!__kmp_hwloc_obj_has_PUs(tp, hS)) + continue; // skip socket if all PUs are out of fullMask + ++nS; // only count objects those have PUs in affinity mask + if (nS <= __kmp_hws_socket.offset || + nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { + n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket + continue; // move to next socket + } + nCr = 0; // count number of cores per socket + // socket requested, go down the topology tree + // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) + if (numa_support) { + nN = 0; + hN = NULL; + int NN = __kmp_hwloc_count_children_by_type( + tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in current socket + for (int n = 0; n < NN; ++n) { + // Check NUMA Node ---------------------------------------- + if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { + hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); + continue; // skip node if all PUs are out of fullMask + } + ++nN; + if (nN <= __kmp_hws_node.offset || + nN > __kmp_hws_node.num + __kmp_hws_node.offset) { + // skip node as not requested + n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node + hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); + continue; // move to next node + } + // node requested, go down the topology tree + if (tile_support) { + nL = 0; + hL = NULL; + int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); + for (int l = 0; l < NL; ++l) { + // Check L2 (tile) ------------------------------------ + if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { + hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); + continue; // skip tile if all PUs are out of fullMask + } + ++nL; + if (nL <= __kmp_hws_tile.offset || + nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { + // skip tile as not requested + n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile + hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); + continue; // move to next tile + } + // tile requested, go down the topology tree + nC = 0; + hC = NULL; + int NC = __kmp_hwloc_count_children_by_type( + tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in current tile + for (int c = 0; c < NC; ++c) { + // Check Core --------------------------------------- + if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // skip core if all PUs are out of fullMask + } + ++nC; + if (nC <= __kmp_hws_core.offset || + nC > __kmp_hws_core.num + __kmp_hws_core.offset) { + // skip node as not requested + n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // move to next node + } + // core requested, go down to PUs + nT = 0; + nTr = 0; + hT = NULL; + int NT = __kmp_hwloc_count_children_by_type( + tp, hC, HWLOC_OBJ_PU, &hT); // num procs in current core + for (int t = 0; t < NT; ++t) { + // Check PU --------------------------------------- + idx = hT->os_index; + if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // skip PU if not in fullMask + } + ++nT; + if (nT <= __kmp_hws_proc.offset || + nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { + // skip PU + KMP_CPU_CLR(idx, __kmp_affin_fullMask); + ++n_old; + KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // move to next node + } + ++nTr; + if (pAddr) // collect requested thread's data + newAddr[n_new] = (*pAddr)[n_old]; + ++n_new; + ++n_old; + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + } // threads loop + if (nTr > 0) { + ++nCr; // num cores per socket + ++nCo; // total num cores + if (nTr > nTpC) + nTpC = nTr; // calc max threads per core + } + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + } // cores loop + hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); + } // tiles loop + } else { // tile_support + // no tiles, check cores + nC = 0; + hC = NULL; + int NC = __kmp_hwloc_count_children_by_type( + tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in current node + for (int c = 0; c < NC; ++c) { + // Check Core --------------------------------------- + if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // skip core if all PUs are out of fullMask + } + ++nC; + if (nC <= __kmp_hws_core.offset || + nC > __kmp_hws_core.num + __kmp_hws_core.offset) { + // skip node as not requested + n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // move to next node + } + // core requested, go down to PUs + nT = 0; + nTr = 0; + hT = NULL; + int NT = __kmp_hwloc_count_children_by_type( + tp, hC, HWLOC_OBJ_PU, &hT); + for (int t = 0; t < NT; ++t) { + // Check PU --------------------------------------- + idx = hT->os_index; + if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // skip PU if not in fullMask + } + ++nT; + if (nT <= __kmp_hws_proc.offset || + nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { + // skip PU + KMP_CPU_CLR(idx, __kmp_affin_fullMask); + ++n_old; + KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // move to next node + } + ++nTr; + if (pAddr) // collect requested thread's data + newAddr[n_new] = (*pAddr)[n_old]; + ++n_new; + ++n_old; + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + } // threads loop + if (nTr > 0) { + ++nCr; // num cores per socket + ++nCo; // total num cores + if (nTr > nTpC) + nTpC = nTr; // calc max threads per core + } + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + } // cores loop + } // tiles support + hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); + } // nodes loop + } else { // numa_support + // no NUMA support + if (tile_support) { + nL = 0; + hL = NULL; + int NL = __kmp_hwloc_count_children_by_depth( + tp, hS, L2depth, &hL); // num tiles in current socket + for (int l = 0; l < NL; ++l) { + // Check L2 (tile) ------------------------------------ + if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { + hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); + continue; // skip tile if all PUs are out of fullMask + } + ++nL; + if (nL <= __kmp_hws_tile.offset || + nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { + // skip tile as not requested + n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile + hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); + continue; // move to next tile + } + // tile requested, go down the topology tree + nC = 0; + hC = NULL; + int NC = __kmp_hwloc_count_children_by_type( + tp, hL, HWLOC_OBJ_CORE, &hC); // num cores per tile + for (int c = 0; c < NC; ++c) { + // Check Core --------------------------------------- + if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // skip core if all PUs are out of fullMask + } + ++nC; + if (nC <= __kmp_hws_core.offset || + nC > __kmp_hws_core.num + __kmp_hws_core.offset) { + // skip node as not requested + n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // move to next node + } + // core requested, go down to PUs + nT = 0; + nTr = 0; + hT = NULL; + int NT = __kmp_hwloc_count_children_by_type( + tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core + for (int t = 0; t < NT; ++t) { + // Check PU --------------------------------------- + idx = hT->os_index; + if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // skip PU if not in fullMask + } + ++nT; + if (nT <= __kmp_hws_proc.offset || + nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { + // skip PU + KMP_CPU_CLR(idx, __kmp_affin_fullMask); + ++n_old; + KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // move to next node + } + ++nTr; + if (pAddr) // collect requested thread's data + newAddr[n_new] = (*pAddr)[n_old]; + ++n_new; + ++n_old; + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + } // threads loop + if (nTr > 0) { + ++nCr; // num cores per socket + ++nCo; // total num cores + if (nTr > nTpC) + nTpC = nTr; // calc max threads per core + } + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + } // cores loop + hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); + } // tiles loop + } else { // tile_support + // no tiles, check cores + nC = 0; + hC = NULL; + int NC = __kmp_hwloc_count_children_by_type( + tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket + for (int c = 0; c < NC; ++c) { + // Check Core ------------------------------------------- + if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // skip core if all PUs are out of fullMask + } + ++nC; + if (nC <= __kmp_hws_core.offset || + nC > __kmp_hws_core.num + __kmp_hws_core.offset) { + // skip node as not requested + n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + continue; // move to next node + } + // core requested, go down to PUs + nT = 0; + nTr = 0; + hT = NULL; + int NT = __kmp_hwloc_count_children_by_type( + tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core + for (int t = 0; t < NT; ++t) { + // Check PU --------------------------------------- + idx = hT->os_index; + if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // skip PU if not in fullMask + } + ++nT; + if (nT <= __kmp_hws_proc.offset || + nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { + // skip PU + KMP_CPU_CLR(idx, __kmp_affin_fullMask); + ++n_old; + KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + continue; // move to next node + } + ++nTr; + if (pAddr) // collect requested thread's data + newAddr[n_new] = (*pAddr)[n_old]; + ++n_new; + ++n_old; + hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); + } // threads loop + if (nTr > 0) { + ++nCr; // num cores per socket + ++nCo; // total num cores + if (nTr > nTpC) + nTpC = nTr; // calc max threads per core + } + hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); + } // cores loop + } // tiles support + } // numa_support + if (nCr > 0) { // found cores? + ++nPkg; // num sockets + if (nCr > nCpP) + nCpP = nCr; // calc max cores per socket + } + } // sockets loop + + // check the subset is valid + KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); + KMP_DEBUG_ASSERT(nPkg > 0); + KMP_DEBUG_ASSERT(nCpP > 0); + KMP_DEBUG_ASSERT(nTpC > 0); + KMP_DEBUG_ASSERT(nCo > 0); + KMP_DEBUG_ASSERT(nPkg <= nPackages); + KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); + KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); + KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); + + nPackages = nPkg; // correct num sockets + nCoresPerPkg = nCpP; // correct num cores per socket + __kmp_nThreadsPerCore = nTpC; // correct num threads per core + __kmp_avail_proc = n_new; // correct num procs + __kmp_ncores = nCo; // correct num cores + // hwloc topology method end + } else +#endif // KMP_USE_HWLOC + { + int n_old = 0, n_new = 0, proc_num = 0; + if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { + KMP_WARNING(AffHWSubsetNoHWLOC); + goto _exit; + } + if (__kmp_hws_socket.num == 0) + __kmp_hws_socket.num = nPackages; // use all available sockets + if (__kmp_hws_core.num == 0) + __kmp_hws_core.num = nCoresPerPkg; // use all available cores + if (__kmp_hws_proc.num == 0 || + __kmp_hws_proc.num > __kmp_nThreadsPerCore) + __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts + if ( !__kmp_affinity_uniform_topology() ) { KMP_WARNING( AffHWSubsetNonUniform ); goto _exit; // don't support non-uniform topology - } - if ( depth > 3 ) { + } + if ( depth > 3 ) { KMP_WARNING( AffHWSubsetNonThreeLevel ); goto _exit; // don't support not-3-level topology - } - if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) { + } + if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { KMP_WARNING(AffHWSubsetManySockets); goto _exit; - } - if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { + } + if ( __kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg ) { KMP_WARNING( AffHWSubsetManyCores ); goto _exit; - } - - AddrUnsPair *newAddr; - if (pAddr) // pAddr is NULL in case of affinity_none + } + // Form the requested subset + if (pAddr) // pAddr is NULL in case of affinity_none newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * - __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core); - - for (i = 0; i < nPackages; ++i) { - if (i < __kmp_place_socket_offset || - i >= __kmp_place_socket_offset + __kmp_place_num_sockets) { - n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket - if (__kmp_pu_os_idx != NULL) { - for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket - for (k = 0; k < __kmp_nThreadsPerCore; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; - } - } + __kmp_hws_socket.num * __kmp_hws_core.num * __kmp_hws_proc.num); + for (int i = 0; i < nPackages; ++i) { + if (i < __kmp_hws_socket.offset || + i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { + // skip not-requested socket + n_old += nCoresPerPkg * __kmp_nThreadsPerCore; + if (__kmp_pu_os_idx != NULL) { + // walk through skipped socket + for (int j = 0; j < nCoresPerPkg; ++j) { + for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { + KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); + ++proc_num; + } } + } } else { - for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket - if (j < __kmp_place_core_offset || - j >= __kmp_place_core_offset + __kmp_place_num_cores) { - n_old += __kmp_nThreadsPerCore; // skip not-requested core - if (__kmp_pu_os_idx != NULL) { - for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; - } - } + // walk through requested socket + for (int j = 0; j < nCoresPerPkg; ++j) { + if (j < __kmp_hws_core.offset || + j >= __kmp_hws_core.offset + __kmp_hws_core.num) + { // skip not-requested core + n_old += __kmp_nThreadsPerCore; + if (__kmp_pu_os_idx != NULL) { + for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { + KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); + ++proc_num; + } + } + } else { + // walk through requested core + for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { + if (k < __kmp_hws_proc.num) { + if (pAddr) // collect requested thread's data + newAddr[n_new] = (*pAddr)[n_old]; + n_new++; } else { - for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core - if (k < __kmp_place_num_threads_per_core) { - if (pAddr) - newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data - n_new++; - } else { - if (__kmp_pu_os_idx != NULL) - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - } - n_old++; - ++proc_num; - } + if (__kmp_pu_os_idx != NULL) + KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); } - } - } - } - KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); - KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores * - __kmp_place_num_threads_per_core); - - nPackages = __kmp_place_num_sockets; // correct nPackages - nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg - __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore - __kmp_avail_proc = n_new; // correct avail_proc - __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores - + n_old++; + ++proc_num; + } + } + } + } + } + KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); + KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num * + __kmp_hws_proc.num); + nPackages = __kmp_hws_socket.num; // correct nPackages + nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg + __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore + __kmp_avail_proc = n_new; // correct avail_proc + __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores + } // non-hwloc topology method if (pAddr) { - __kmp_free( *pAddr ); - *pAddr = newAddr; // replace old topology with new one + __kmp_free( *pAddr ); + *pAddr = newAddr; // replace old topology with new one + } + if (__kmp_affinity_verbose) { + char m[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(m,KMP_AFFIN_MASK_PRINT_LEN,__kmp_affin_fullMask); + if (__kmp_affinity_respect_mask) { + KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); + } else { + KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); + } + KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%d", nPackages); + KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, + __kmp_nThreadsPerCore, __kmp_ncores); + __kmp_str_buf_free(&buf); } _exit: if (__kmp_pu_os_idx != NULL) { - __kmp_free(__kmp_pu_os_idx); - __kmp_pu_os_idx = NULL; + __kmp_free(__kmp_pu_os_idx); + __kmp_pu_os_idx = NULL; } } Index: openmp/trunk/runtime/src/kmp_csupport.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_csupport.cpp +++ openmp/trunk/runtime/src/kmp_csupport.cpp @@ -3038,18 +3038,6 @@ } // __kmpc_get_parent_taskid -void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT) -{ - if ( ! __kmp_init_serial ) { - __kmp_serial_initialize(); - } - __kmp_place_num_sockets = nS; - __kmp_place_socket_offset = sO; - __kmp_place_num_cores = nC; - __kmp_place_core_offset = cO; - __kmp_place_num_threads_per_core = nT; -} - #if OMP_45_ENABLED /*! @ingroup WORK_SHARING Index: openmp/trunk/runtime/src/kmp_global.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_global.cpp +++ openmp/trunk/runtime/src/kmp_global.cpp @@ -264,11 +264,13 @@ int __kmp_affinity_num_places = 0; #endif -int __kmp_place_num_sockets = 0; -int __kmp_place_socket_offset = 0; -int __kmp_place_num_cores = 0; -int __kmp_place_core_offset = 0; -int __kmp_place_num_threads_per_core = 0; +kmp_hws_item_t __kmp_hws_socket = {0, 0}; +kmp_hws_item_t __kmp_hws_node = {0, 0}; +kmp_hws_item_t __kmp_hws_tile = {0, 0}; +kmp_hws_item_t __kmp_hws_core = {0, 0}; +kmp_hws_item_t __kmp_hws_proc = {0, 0}; +int __kmp_hws_requested = 0; +int __kmp_hws_abs_flag = 0; // absolute or per-item number requested #if OMP_40_ENABLED kmp_int32 __kmp_default_device = 0; Index: openmp/trunk/runtime/src/kmp_settings.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_settings.cpp +++ openmp/trunk/runtime/src/kmp_settings.cpp @@ -24,6 +24,7 @@ #include "kmp_lock.h" #include "kmp_io.h" #include "kmp_affinity.h" +#include // toupper() static int __kmp_env_toPrint( char const * name, int flag ); @@ -3108,6 +3109,12 @@ break; # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ +# if KMP_USE_HWLOC + case affinity_top_method_hwloc: + value = "hwloc"; + break; +# endif + case affinity_top_method_cpuinfo: value = "cpuinfo"; break; @@ -4297,275 +4304,152 @@ // KMP_HW_SUBSET (was KMP_PLACE_THREADS) // ------------------------------------------------------------------------------------------------- +// The longest observable sequense of items is +// Socket-Node-Tile-Core-Thread +// So, let's limit to 5 levels for now +// The input string is usually short enough, let's use 512 limit for now +#define MAX_T_LEVEL 5 +#define MAX_STR_LEN 512 static void __kmp_stg_parse_hw_subset( char const * name, char const * value, void * data ) { - // Value example: 5Cx2Tx15O - // Which means "use 5 cores with offset 15, 2 threads per core" - // AC: extended to sockets level, examples of - // "use 2 sockets with offset 6, 2 cores with offset 2 per socket, 2 threads per core": - // 2s,6o,2c,2o,2t; 2s,6o,2c,2t,2o; 2s@6,2c@2,2t - // To not break legacy code core-offset can be last; - // postfix "o" or prefix @ can be offset designator. - // Note: not all syntax errors are analyzed, some may be skipped. -#define CHECK_DELIM(_x) (*(_x) == ',' || *(_x) == 'x') - static int parsed = 0; - int num; - int single_warning = 0; - int flagS = 0, flagC = 0, flagT = 0, flagSO = 0, flagCO = 0; - const char *next = value; - const char *prev; - - if( strcmp(name, "KMP_PLACE_THREADS") == 0 ) { - KMP_INFORM(EnvVarDeprecated,name,"KMP_HW_SUBSET"); - if( parsed == 1 ) { - return; // already parsed KMP_HW_SUBSET - } - } - parsed = 1; - - SKIP_WS(next); // skip white spaces - if (*next == '\0') - return; // no data provided, retain default values - if( strcmp(name, "KMP_PLACE_THREADS") == 0 ) { - KMP_INFORM(EnvVarDeprecated,name,"KMP_HW_SUBSET"); - if( parsed == 1 ) { - return; // already parsed KMP_HW_SUBSET - } - } - parsed = 1; - - SKIP_WS(next); // skip white spaces - if (*next == '\0') - return; // no data provided, retain default values - // Get num_sockets first (or whatever specified) - if (*next >= '0' && *next <= '9') { - prev = next; - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - SKIP_WS(next); - if (*next == 's' || *next == 'S') { // e.g. "2s" - __kmp_place_num_sockets = num; - flagS = 1; // got num sockets - next++; - if (*next == '@') { // socket offset, e.g. "2s@4" - flagSO = 1; - prev = ++next; // don't allow spaces for simplicity - if (!(*next >= '0' && *next <= '9')) { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - __kmp_place_socket_offset = num; - } - } else if (*next == 'c' || *next == 'C') { - __kmp_place_num_cores = num; - flagS = flagC = 1; // sockets were not specified - use default - next++; - if (*next == '@') { // core offset, e.g. "2c@6" - flagCO = 1; - prev = ++next; // don't allow spaces for simplicity - if (!(*next >= '0' && *next <= '9')) { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - __kmp_place_core_offset = num; - } - } else if (CHECK_DELIM(next)) { - __kmp_place_num_cores = num; // no letter-designator - num cores - flagS = flagC = 1; // sockets were not specified - use default - next++; - } else if (*next == 't' || *next == 'T') { - __kmp_place_num_threads_per_core = num; - // sockets, cores were not specified - use default - return; // we ignore offset value in case all cores are used - } else if (*next == '\0') { - __kmp_place_num_cores = num; - return; // the only value provided - set num cores - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - KMP_DEBUG_ASSERT(flagS); // num sockets should already be set here - SKIP_WS(next); - if (*next == '\0') - return; // " n " - something like this - if (CHECK_DELIM(next)) { - next++; // skip delimiter - SKIP_WS(next); - } - - // Get second value (could be offset, num_cores, num_threads) - if (*next >= '0' && *next <= '9') { - prev = next; - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - SKIP_WS(next); - if (*next == 'c' || *next == 'C') { - KMP_DEBUG_ASSERT(flagC == 0); - __kmp_place_num_cores = num; - flagC = 1; - next++; - if (*next == '@') { // core offset, e.g. "2c@6" - flagCO = 1; - prev = ++next; // don't allow spaces for simplicity - if (!(*next >= '0' && *next <= '9')) { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - __kmp_place_core_offset = num; - } - } else if (*next == 'o' || *next == 'O') { // offset specified - KMP_WARNING(AffHWSubsetDeprecated); - single_warning = 1; - if (flagC) { // whether num_cores already specified (sockets skipped) - KMP_DEBUG_ASSERT(!flagCO); // either "o" or @, not both - __kmp_place_core_offset = num; - } else { - KMP_DEBUG_ASSERT(!flagSO); // either "o" or @, not both - __kmp_place_socket_offset = num; - } - next++; - } else if (*next == 't' || *next == 'T') { - KMP_DEBUG_ASSERT(flagT == 0); - __kmp_place_num_threads_per_core = num; - flagC = 1; // num_cores could be skipped ? - flagT = 1; - next++; // can have core-offset specified after num threads - } else if (*next == '\0') { - KMP_DEBUG_ASSERT(flagC); // 4x2 means 4 cores 2 threads per core - __kmp_place_num_threads_per_core = num; - return; // two values provided without letter-designator - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - SKIP_WS(next); - if (*next == '\0') - return; // " Ns,Nc " - something like this - if (CHECK_DELIM(next)) { - next++; // skip delimiter - SKIP_WS(next); - } - - // Get third value (could be core-offset, num_cores, num_threads) - if (*next >= '0' && *next <= '9') { - prev = next; - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - SKIP_WS(next); - if (*next == 't' || *next == 'T') { - KMP_DEBUG_ASSERT(flagT == 0); - __kmp_place_num_threads_per_core = num; - if (flagC == 0) - return; // num_cores could be skipped (e.g. 2s,4o,2t) - flagT = 1; - next++; // can have core-offset specified later (e.g. 2s,1c,2t,3o) - } else if (*next == 'c' || *next == 'C') { - KMP_DEBUG_ASSERT(flagC == 0); - __kmp_place_num_cores = num; - flagC = 1; - next++; - //KMP_DEBUG_ASSERT(*next != '@'); // socket offset used "o" designator - } else if (*next == 'o' || *next == 'O') { - KMP_WARNING(AffHWSubsetDeprecated); - single_warning = 1; - KMP_DEBUG_ASSERT(flagC); - //KMP_DEBUG_ASSERT(!flagSO); // socket offset couldn't use @ designator - __kmp_place_core_offset = num; - next++; + // Value example: 1s,5c@3,2T + // Which means "use 1 socket, 5 cores with offset 3, 2 threads per core" + static int parsed = 0; + if( strcmp(name, "KMP_PLACE_THREADS") == 0 ) { + KMP_INFORM(EnvVarDeprecated,name,"KMP_HW_SUBSET"); + if( parsed == 1 ) { + return; // already parsed KMP_HW_SUBSET + } + } + parsed = 1; + + char *components[MAX_T_LEVEL]; + char const *digits = "0123456789"; + char input[MAX_STR_LEN]; + size_t len = 0, mlen = MAX_STR_LEN; + int level = 0; + // Canonize the string (remove spaces, unify delimiters, etc.) + char *pos = (char *)value; + while (*pos && mlen) { + if (*pos != ' ') { // skip spaces + if (len == 0 && *pos == ':') { + __kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it + } else { + input[len] = toupper(*pos); + if (input[len] == 'X') + input[len] = ','; // unify delimiters of levels + if (input[len] == 'O' && strchr(digits, *(pos + 1))) + input[len] = '@'; // unify delimiters of offset + len++; + } + } + mlen--; + pos++; + } + if (len == 0 || mlen == 0) + goto err; // contents is either empty or too long + input[len] = '\0'; + __kmp_hws_requested = 1; // mark that subset requested + // Split by delimiter + pos = input; + components[level++] = pos; + while (pos = strchr(pos, ',')) { + *pos = '\0'; // modify input and avoid more copying + components[level++] = ++pos; // expect something after "," + if (level > MAX_T_LEVEL) + goto err; // too many components provided + } + // Check each component + for (int i = 0; i < level; ++i) { + int offset = 0; + int num = atoi(components[i]); // each component should start with a number + if ((pos = strchr(components[i], '@'))) { + offset = atoi(pos + 1); // save offset + *pos = '\0'; // cut the offset from the component + } + pos = components[i] + strspn(components[i], digits); + if (pos == components[i]) + goto err; + // detect the component type + switch (*pos) { + case 'S': // Socket + if (__kmp_hws_socket.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_socket.num = num; + __kmp_hws_socket.offset = offset; + break; + case 'N': // NUMA Node + if (__kmp_hws_node.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_node.num = num; + __kmp_hws_node.offset = offset; + break; + case 'L': // Cache + if (*(pos + 1) == '2') { // L2 - Tile + if (__kmp_hws_tile.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_tile.num = num; + __kmp_hws_tile.offset = offset; + } else if (*(pos + 1) == '3') { // L3 - Socket + if (__kmp_hws_socket.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_socket.num = num; + __kmp_hws_socket.offset = offset; + } else if (*(pos + 1) == '1') { // L1 - Core + if (__kmp_hws_core.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_core.num = num; + __kmp_hws_core.offset = offset; + } + break; + case 'C': // Core (or Cache?) + if (*(pos + 1) != 'A') { + if (__kmp_hws_core.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_core.num = num; + __kmp_hws_core.offset = offset; + } else { // Cache + char *d = pos + strcspn(pos, digits); // find digit + if (*d == '2') { // L2 - Tile + if (__kmp_hws_tile.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_tile.num = num; + __kmp_hws_tile.offset = offset; + } else if (*d == '3') { // L3 - Socket + if (__kmp_hws_socket.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_socket.num = num; + __kmp_hws_socket.offset = offset; + } else if (*d == '1') { // L1 - Core + if (__kmp_hws_core.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_core.num = num; + __kmp_hws_core.offset = offset; } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; + goto err; } - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - KMP_DEBUG_ASSERT(flagC); - SKIP_WS(next); - if ( *next == '\0' ) - return; - if (CHECK_DELIM(next)) { - next++; // skip delimiter - SKIP_WS(next); - } - - // Get 4-th value (could be core-offset, num_threads) - if (*next >= '0' && *next <= '9') { - prev = next; - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - SKIP_WS(next); - if (*next == 'o' || *next == 'O') { - if (!single_warning) { // warn once - KMP_WARNING(AffHWSubsetDeprecated); - } - KMP_DEBUG_ASSERT(!flagSO); // socket offset couldn't use @ designator - __kmp_place_core_offset = num; - next++; - } else if (*next == 't' || *next == 'T') { - KMP_DEBUG_ASSERT(flagT == 0); - __kmp_place_num_threads_per_core = num; - flagT = 1; - next++; // can have core-offset specified after num threads - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - return; - } - SKIP_WS(next); - if ( *next == '\0' ) - return; - if (CHECK_DELIM(next)) { - next++; // skip delimiter - SKIP_WS(next); - } - - // Get 5-th value (could be core-offset, num_threads) - if (*next >= '0' && *next <= '9') { - prev = next; - SKIP_DIGITS(next); - num = __kmp_str_to_int(prev, *next); - SKIP_WS(next); - if (*next == 'o' || *next == 'O') { - if (!single_warning) { // warn once - KMP_WARNING(AffHWSubsetDeprecated); - } - KMP_DEBUG_ASSERT(flagT); - KMP_DEBUG_ASSERT(!flagSO); // socket offset couldn't use @ designator - __kmp_place_core_offset = num; - } else if (*next == 't' || *next == 'T') { - KMP_DEBUG_ASSERT(flagT == 0); - __kmp_place_num_threads_per_core = num; - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); - } - } else { - KMP_WARNING(AffHWSubsetInvalid, name, value); + } + break; + case 'T': // Thread + if (__kmp_hws_proc.num > 0) + goto err; // duplicate is not allowed + __kmp_hws_proc.num = num; + __kmp_hws_proc.offset = offset; + break; + default: + goto err; } - return; -#undef CHECK_DELIM + } + return; +err: + KMP_WARNING(AffHWSubsetInvalid, name, value); + __kmp_hws_requested = 0; // mark that subset not requested + return; } static void __kmp_stg_print_hw_subset( kmp_str_buf_t * buffer, char const * name, void * data ) { - if (__kmp_place_num_sockets + __kmp_place_num_cores + __kmp_place_num_threads_per_core) { + if (__kmp_hws_requested) { int comma = 0; kmp_str_buf_t buf; __kmp_str_buf_init(&buf); @@ -4573,26 +4457,34 @@ KMP_STR_BUF_PRINT_NAME_EX(name); else __kmp_str_buf_print(buffer, " %s='", name); - if (__kmp_place_num_sockets) { - __kmp_str_buf_print(&buf, "%ds", __kmp_place_num_sockets); - if (__kmp_place_socket_offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_place_socket_offset); + if (__kmp_hws_socket.num) { + __kmp_str_buf_print(&buf, "%ds", __kmp_hws_socket.num); + if (__kmp_hws_socket.offset) + __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset); comma = 1; } - if (__kmp_place_num_cores) { - __kmp_str_buf_print(&buf, "%s%dc", comma?",":"", __kmp_place_num_cores); - if (__kmp_place_core_offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_place_core_offset); + if (__kmp_hws_node.num) { + __kmp_str_buf_print(&buf, "%s%dn", comma?",":"", __kmp_hws_node.num); + if (__kmp_hws_node.offset) + __kmp_str_buf_print(&buf, "@%d", __kmp_hws_node.offset); comma = 1; } - if (__kmp_place_num_threads_per_core) - __kmp_str_buf_print(&buf, "%s%dt", comma?",":"", __kmp_place_num_threads_per_core); + if (__kmp_hws_tile.num) { + __kmp_str_buf_print(&buf, "%s%dL2", comma?",":"", __kmp_hws_tile.num); + if (__kmp_hws_tile.offset) + __kmp_str_buf_print(&buf, "@%d", __kmp_hws_tile.offset); + comma = 1; + } + if (__kmp_hws_core.num) { + __kmp_str_buf_print(&buf, "%s%dc", comma?",":"", __kmp_hws_core.num); + if (__kmp_hws_core.offset) + __kmp_str_buf_print(&buf, "@%d", __kmp_hws_core.offset); + comma = 1; + } + if (__kmp_hws_proc.num) + __kmp_str_buf_print(&buf, "%s%dt", comma?",":"", __kmp_hws_proc.num); __kmp_str_buf_print(buffer, "%s'\n", buf.str ); __kmp_str_buf_free(&buf); -/* - } else { - __kmp_str_buf_print( buffer, " %s: %s \n", name, KMP_I18N_STR( NotDefined ) ); -*/ } }