Index: openmp/trunk/runtime/Build_With_CMake.txt =================================================================== --- openmp/trunk/runtime/Build_With_CMake.txt +++ openmp/trunk/runtime/Build_With_CMake.txt @@ -159,6 +159,18 @@ -DLIBOMP_USE_DEBUGGER=off|on Should the friendly debugger interface be included in the build? +-DLIBOMP_USE_HWLOC=off|on +Should the Hwloc library be used for affinity? +This option is not supported on Windows. +http://www.open-mpi.org/projects/hwloc + +-DLIBOMP_HWLOC_INSTALL_DIR=/path/to/hwloc/install/dir +Default: /usr/local +This option is only used if LIBOMP_USE_HWLOC is on. +Specifies install location of Hwloc. The configuration system will look for +hwloc.h in ${LIBOMP_HWLOC_INSTALL_DIR}/include and the library in +${LIBOMP_HWLOC_INSTALL_DIR}/lib. + ================================ How to append flags to the build ================================ Index: openmp/trunk/runtime/CMakeLists.txt =================================================================== --- openmp/trunk/runtime/CMakeLists.txt +++ openmp/trunk/runtime/CMakeLists.txt @@ -135,6 +135,12 @@ set(LIBOMP_COPY_EXPORTS TRUE CACHE STRING "Should exports be copied into source exports/ directory?") +# HWLOC-support +set(LIBOMP_USE_HWLOC FALSE CACHE BOOL + "Use Hwloc (http://www.open-mpi.org/projects/hwloc/) library for affinity?") +set(LIBOMP_HWLOC_INSTALL_DIR /usr/local CACHE PATH + "Install path for hwloc library") + # Get the build number from kmp_version.c libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" LIBOMP_VERSION_BUILD) math(EXPR LIBOMP_VERSION_BUILD_YEAR "${LIBOMP_VERSION_BUILD}/10000") @@ -285,6 +291,11 @@ libomp_error_say("OpenMP Tools Interface requested but not available") endif() +# Error check hwloc support after config-ix has run +if(LIBOMP_USE_HWLOC AND (NOT LIBOMP_HAVE_HWLOC)) + libomp_error_say("Hwloc requested but not available") +endif() + # Setting final library name set(LIBOMP_DEFAULT_LIB_NAME libomp) if(${PROFILE_LIBRARY}) @@ -323,6 +334,7 @@ endif() libomp_say("Use Adaptive locks -- ${LIBOMP_USE_ADAPTIVE_LOCKS}") libomp_say("Use quad precision -- ${LIBOMP_USE_QUAD_PRECISION}") + libomp_say("Use Hwloc library -- ${LIBOMP_USE_HWLOC}") endif() add_subdirectory(src) Index: openmp/trunk/runtime/cmake/LibompHandleFlags.cmake =================================================================== --- openmp/trunk/runtime/cmake/LibompHandleFlags.cmake +++ openmp/trunk/runtime/cmake/LibompHandleFlags.cmake @@ -151,6 +151,7 @@ function(libomp_get_libflags libflags) set(libflags_local) libomp_append(libflags_local "${CMAKE_THREAD_LIBS_INIT}") + libomp_append(libflags_local "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC) if(${IA32}) libomp_append(libflags_local -lirc_pic LIBOMP_HAVE_IRC_PIC_LIBRARY) endif() Index: openmp/trunk/runtime/cmake/LibompMicroTests.cmake =================================================================== --- openmp/trunk/runtime/cmake/LibompMicroTests.cmake +++ openmp/trunk/runtime/cmake/LibompMicroTests.cmake @@ -82,10 +82,13 @@ libomp_append(libomp_test_touch_cflags -m32 LIBOMP_HAVE_M32_FLAG) endif() libomp_append(libomp_test_touch_libs ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}) + libomp_append(libomp_test_touch_libs "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC) if(APPLE) set(libomp_test_touch_env "DYLD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{DYLD_LIBRARY_PATH}") + libomp_append(libomp_test_touch_ldflags "-Wl,-rpath,${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC) else() set(libomp_test_touch_env "LD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{LD_LIBRARY_PATH}") + libomp_append(libomp_test_touch_ldflags "-Wl,-rpath=${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC) endif() endif() macro(libomp_test_touch_recipe test_touch_dir) @@ -169,8 +172,10 @@ set(libomp_expected_library_deps) if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") set(libomp_expected_library_deps libc.so.7 libthr.so.3) + libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC) elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") set(libomp_expected_library_deps libc.so.12 libpthread.so.1 libm.so.0) + libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC) elseif(APPLE) set(libomp_expected_library_deps /usr/lib/libSystem.B.dylib) elseif(WIN32) @@ -203,6 +208,7 @@ libomp_append(libomp_expected_library_deps ld64.so.1) endif() libomp_append(libomp_expected_library_deps libpthread.so.0 IF_FALSE STUBS_LIBRARY) + libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC) endif() libomp_append(libomp_expected_library_deps libstdc++.so.6 LIBOMP_USE_STDCPPLIB) endif() Index: openmp/trunk/runtime/cmake/config-ix.cmake =================================================================== --- openmp/trunk/runtime/cmake/config-ix.cmake +++ openmp/trunk/runtime/cmake/config-ix.cmake @@ -12,6 +12,7 @@ include(CheckCCompilerFlag) include(CheckCSourceCompiles) include(CheckCXXCompilerFlag) +include(CheckIncludeFile) include(CheckLibraryExists) include(CheckIncludeFiles) include(LibompCheckLinkerFlag) @@ -211,3 +212,25 @@ endif() endif() +# Check if HWLOC support is available +if(${LIBOMP_USE_HWLOC}) + if(WIN32) + set(LIBOMP_HAVE_HWLOC FALSE) + libomp_say("Using hwloc not supported on Windows yet") + else() + set(CMAKE_REQUIRED_INCLUDES ${LIBOMP_HWLOC_INSTALL_DIR}/include) + check_include_file(hwloc.h LIBOMP_HAVE_HWLOC_H) + set(CMAKE_REQUIRED_INCLUDES) + check_library_exists(hwloc hwloc_topology_init + ${LIBOMP_HWLOC_INSTALL_DIR}/lib LIBOMP_HAVE_LIBHWLOC) + find_library(LIBOMP_HWLOC_LIBRARY hwloc ${LIBOMP_HWLOC_INSTALL_DIR}/lib) + get_filename_component(LIBOMP_HWLOC_LIBRARY_DIR ${LIBOMP_HWLOC_LIBRARY} PATH) + if(LIBOMP_HAVE_HWLOC_H AND LIBOMP_HAVE_LIBHWLOC AND LIBOMP_HWLOC_LIBRARY) + set(LIBOMP_HAVE_HWLOC TRUE) + else() + set(LIBOMP_HAVE_HWLOC FALSE) + libomp_say("Could not find hwloc") + endif() + endif() +endif() + Index: openmp/trunk/runtime/src/CMakeLists.txt =================================================================== --- openmp/trunk/runtime/src/CMakeLists.txt +++ openmp/trunk/runtime/src/CMakeLists.txt @@ -42,6 +42,9 @@ ${LIBOMP_INC_DIR} ${LIBOMP_SRC_DIR}/thirdparty/ittnotify ) +if(${LIBOMP_USE_HWLOC}) + include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include) +endif() # Getting correct source files to build library set(LIBOMP_CFILES) Index: openmp/trunk/runtime/src/i18n/en_US.txt =================================================================== --- openmp/trunk/runtime/src/i18n/en_US.txt +++ openmp/trunk/runtime/src/i18n/en_US.txt @@ -405,6 +405,9 @@ AffGranGroupType "%1$s: granularity=group is not supported with KMP_AFFINITY=%2$s. Using \"granularity=core\"." AffThrPlaceManySockets "KMP_PLACE_THREADS ignored: too many sockets requested." AffThrPlaceDeprecated "KMP_PLACE_THREADS \"o\" offset designator deprecated, please use @ prefix for offset value." +AffUsingHwloc "%1$s: Affinity capable, using hwloc." +AffIgnoringHwloc "%1$s: Ignoring hwloc mechanism." +AffHwlocErrorOccurred "%1$s: Hwloc failed in %2$s. Relying on internal affinity mechanisms." # -------------------------------------------------------------------------------------------------- Index: openmp/trunk/runtime/src/kmp.h =================================================================== --- openmp/trunk/runtime/src/kmp.h +++ openmp/trunk/runtime/src/kmp.h @@ -77,10 +77,18 @@ #include "kmp_os.h" +#include "kmp_safe_c_api.h" + #if KMP_STATS_ENABLED class kmp_stats_list; #endif +#if KMP_USE_HWLOC +#include "hwloc.h" +extern hwloc_topology_t __kmp_hwloc_topology; +extern int __kmp_hwloc_error; +#endif + #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #include #endif @@ -488,6 +496,78 @@ # define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size) # define KMP_CPU_SETSIZE (__kmp_affin_mask_size * CHAR_BIT) +#if KMP_USE_HWLOC + +typedef hwloc_cpuset_t kmp_affin_mask_t; +# define KMP_CPU_SET(i,mask) hwloc_bitmap_set((hwloc_cpuset_t)mask, (unsigned)i) +# define KMP_CPU_ISSET(i,mask) hwloc_bitmap_isset((hwloc_cpuset_t)mask, (unsigned)i) +# define KMP_CPU_CLR(i,mask) hwloc_bitmap_clr((hwloc_cpuset_t)mask, (unsigned)i) +# define KMP_CPU_ZERO(mask) hwloc_bitmap_zero((hwloc_cpuset_t)mask) +# define KMP_CPU_COPY(dest, src) hwloc_bitmap_copy((hwloc_cpuset_t)dest, (hwloc_cpuset_t)src) +# define KMP_CPU_COMPLEMENT(max_bit_number, mask) \ + { \ + unsigned i; \ + for(i=0;i<(unsigned)max_bit_number+1;i++) { \ + if(hwloc_bitmap_isset((hwloc_cpuset_t)mask, i)) { \ + hwloc_bitmap_clr((hwloc_cpuset_t)mask, i); \ + } else { \ + hwloc_bitmap_set((hwloc_cpuset_t)mask, i); \ + } \ + } \ + } \ + +# define KMP_CPU_UNION(dest, src) hwloc_bitmap_or((hwloc_cpuset_t)dest, (hwloc_cpuset_t)dest, (hwloc_cpuset_t)src) +# define KMP_CPU_SET_ITERATE(i,mask) \ + for(i = hwloc_bitmap_first((hwloc_cpuset_t)mask); (int)i != -1; i = hwloc_bitmap_next((hwloc_cpuset_t)mask, i)) + +# define KMP_CPU_ALLOC(ptr) ptr = (kmp_affin_mask_t*)hwloc_bitmap_alloc() +# define KMP_CPU_FREE(ptr) hwloc_bitmap_free((hwloc_bitmap_t)ptr); +# define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr) +# define KMP_CPU_FREE_FROM_STACK(ptr) KMP_CPU_FREE(ptr) +# define KMP_CPU_INTERNAL_ALLOC(ptr) KMP_CPU_ALLOC(ptr) +# define KMP_CPU_INTERNAL_FREE(ptr) KMP_CPU_FREE(ptr) + +// +// The following macro should be used to index an array of masks. +// The array should be declared as "kmp_affinity_t *" and allocated with +// size "__kmp_affinity_mask_size * len". The macro takes care of the fact +// that on Windows* OS, sizeof(kmp_affin_t) is really the size of the mask, but +// on Linux* OS, sizeof(kmp_affin_t) is 1. +// +# define KMP_CPU_INDEX(array,i) ((kmp_affin_mask_t*)(array[i])) +# define KMP_CPU_ALLOC_ARRAY(arr, n) { \ + arr = (kmp_affin_mask_t *)__kmp_allocate(n*sizeof(kmp_affin_mask_t)); \ + unsigned i; \ + for(i=0;i<(unsigned)n;i++) { \ + arr[i] = hwloc_bitmap_alloc(); \ + } \ + } +# define KMP_CPU_FREE_ARRAY(arr, n) { \ + unsigned i; \ + for(i=0;i<(unsigned)n;i++) { \ + hwloc_bitmap_free(arr[i]); \ + } \ + __kmp_free(arr); \ + } +# define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) { \ + arr = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(n*sizeof(kmp_affin_mask_t)); \ + unsigned i; \ + for(i=0;i<(unsigned)n;i++) { \ + arr[i] = hwloc_bitmap_alloc(); \ + } \ + } +# define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) { \ + unsigned i; \ + for(i=0;i<(unsigned)n;i++) { \ + hwloc_bitmap_free(arr[i]); \ + } \ + KMP_INTERNAL_FREE(arr); \ + } + +#else /* KMP_USE_HWLOC */ +# define KMP_CPU_SET_ITERATE(i,mask) \ + for(i = 0; (size_t)i < KMP_CPU_SETSIZE; ++i) + # if KMP_OS_LINUX // // On Linux* OS, the mask is actually a vector of length __kmp_affin_mask_size @@ -526,7 +606,7 @@ } \ } -# define KMP_CPU_COMPLEMENT(mask) \ +# define KMP_CPU_COMPLEMENT(max_bit_number, mask) \ { \ size_t __i; \ for (__i = 0; __i < __kmp_affin_mask_size; __i++) { \ @@ -605,7 +685,7 @@ } \ } -# define KMP_CPU_COMPLEMENT(mask) \ +# define KMP_CPU_COMPLEMENT(max_bit_number, mask) \ { \ int __i; \ for (__i = 0; __i < __kmp_num_proc_groups; __i++) { \ @@ -637,7 +717,7 @@ extern int __kmp_get_proc_group(kmp_affin_mask_t const *mask); -# else +# else /* KMP_GROUP_AFFINITY */ typedef DWORD kmp_affin_mask_t; /* for compatibility with older winbase.h */ @@ -646,7 +726,7 @@ # define KMP_CPU_CLR(i,mask) (*(mask) &= ~(((kmp_affin_mask_t)1) << (i))) # define KMP_CPU_ZERO(mask) (*(mask) = 0) # define KMP_CPU_COPY(dest, src) (*(dest) = *(src)) -# define KMP_CPU_COMPLEMENT(mask) (*(mask) = ~*(mask)) +# define KMP_CPU_COMPLEMENT(max_bit_number, mask) (*(mask) = ~*(mask)) # define KMP_CPU_UNION(dest, src) (*(dest) |= *(src)) # endif /* KMP_GROUP_AFFINITY */ @@ -660,6 +740,10 @@ # define KMP_CPU_ALLOC(ptr) \ (ptr = ((kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size))) # define KMP_CPU_FREE(ptr) __kmp_free(ptr) +# define KMP_CPU_ALLOC_ON_STACK(ptr) (ptr = ((kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size))) +# define KMP_CPU_FREE_FROM_STACK(ptr) /* Nothing */ +# define KMP_CPU_INTERNAL_ALLOC(ptr) (ptr = ((kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(__kmp_affin_mask_size))) +# define KMP_CPU_INTERNAL_FREE(ptr) KMP_INTERNAL_FREE(ptr) // // The following macro should be used to index an array of masks. @@ -670,6 +754,12 @@ // # define KMP_CPU_INDEX(array,i) \ ((kmp_affin_mask_t *)(((char *)(array)) + (i) * __kmp_affin_mask_size)) +# define KMP_CPU_ALLOC_ARRAY(arr, n) arr = (kmp_affin_mask_t *)__kmp_allocate(n * __kmp_affin_mask_size) +# define KMP_CPU_FREE_ARRAY(arr, n) __kmp_free(arr); +# define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) arr = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(n * __kmp_affin_mask_size) +# define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) KMP_INTERNAL_FREE(arr); + +#endif /* KMP_USE_HWLOC */ // // Declare local char buffers with this size for printing debug and info @@ -716,6 +806,9 @@ affinity_top_method_group, #endif /* KMP_GROUP_AFFINITY */ affinity_top_method_flat, +#if KMP_USE_HWLOC + affinity_top_method_hwloc, +#endif affinity_top_method_default }; Index: openmp/trunk/runtime/src/kmp_affinity.h =================================================================== --- openmp/trunk/runtime/src/kmp_affinity.h +++ openmp/trunk/runtime/src/kmp_affinity.h @@ -57,6 +57,13 @@ bool operator!=(const Address &b) const { return !operator==(b); } + void print() const { + unsigned i; + printf("Depth: %u --- ", depth); + for(i=0;i= 40); + + // bufsize of 0 just retrieves the needed buffer size. + num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask); + + // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes + // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not + // take into account the '\0' character. + if(hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) { + KMP_SNPRINTF(buf, buf_len, "{}"); + } else if(num_chars_to_write < buf_len - 3) { + // no problem fitting the mask into buf_len number of characters + buf[0] = '{'; + // use buf_len-3 because we have the three characters: '{' '}' '\0' to add to the buffer + num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask); + buf[num_chars_written+1] = '}'; + buf[num_chars_written+2] = '\0'; + } else { + // Need to truncate the affinity mask string and add ellipsis. + // To do this, we first write out the '{' + str(mask) + buf[0] = '{'; + hwloc_bitmap_list_snprintf(buf+1, buf_len-7, (hwloc_bitmap_t)mask); + // then, what we do here is go to the 7th to last character, then go backwards until we are NOT + // on a digit then write "...}\0". This way it is a clean ellipsis addition and we don't + // overwrite part of an affinity number. i.e., we avoid something like { 45, 67, 8...} and get + // { 45, 67,...} instead. + scan = buf + buf_len - 7; + while(*scan >= '0' && *scan <= '9' && scan >= buf) + scan--; + *(scan+1) = '.'; + *(scan+2) = '.'; + *(scan+3) = '.'; + *(scan+4) = '}'; + *(scan+5) = '\0'; + } + return buf; +} +#else char * __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) { @@ -102,6 +146,7 @@ KMP_ASSERT(scan <= end); return buf; } +#endif // KMP_USE_HWLOC void @@ -263,6 +308,291 @@ } } +#if KMP_USE_HWLOC +static int +__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, + kmp_i18n_id_t *const msg_id) +{ + *address2os = NULL; + *msg_id = kmp_i18n_null; + + // + // Save the affinity mask for the current thread. + // + kmp_affin_mask_t *oldMask; + KMP_CPU_ALLOC(oldMask); + __kmp_get_system_affinity(oldMask, TRUE); + + unsigned depth = hwloc_topology_get_depth(__kmp_hwloc_topology); + int threadLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_PU); + int coreLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_CORE); + int pkgLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET); + __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 0; + + // + // This makes an assumption about the topology being four levels: + // machines -> packages -> cores -> hardware threads + // + hwloc_obj_t current_level_iterator = hwloc_get_root_obj(__kmp_hwloc_topology); + hwloc_obj_t child_iterator; + for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL); + child_iterator != NULL; + child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator)) + { + nPackages++; + } + current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, pkgLevel, 0); + for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL); + child_iterator != NULL; + child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator)) + { + nCoresPerPkg++; + } + current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, coreLevel, 0); + for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL); + child_iterator != NULL; + child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator)) + { + __kmp_nThreadsPerCore++; + } + + if (! KMP_AFFINITY_CAPABLE()) + { + // + // Hack to try and infer the machine topology using only the data + // available from cpuid on the current thread, and __kmp_xproc. + // + KMP_ASSERT(__kmp_affinity_type == affinity_none); + + __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; + nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; + if (__kmp_affinity_verbose) { + KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); + KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); + if (__kmp_affinity_uniform_topology()) { + KMP_INFORM(Uniform, "KMP_AFFINITY"); + } else { + KMP_INFORM(NonUniform, "KMP_AFFINITY"); + } + KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, + __kmp_nThreadsPerCore, __kmp_ncores); + } + return 0; + } + + // + // Allocate the data structure to be returned. + // + AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); + + unsigned num_hardware_threads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel); + unsigned i; + hwloc_obj_t hardware_thread_iterator; + int nActiveThreads = 0; + for(i=0;iparent->parent->logical_index; + addr.labels[1] = hardware_thread_iterator->parent->logical_index % nCoresPerPkg; + addr.labels[2] = hardware_thread_iterator->logical_index % __kmp_nThreadsPerCore; + retval[nActiveThreads] = AddrUnsPair(addr, hardware_thread_iterator->os_index); + nActiveThreads++; + } + + // + // If there's only one thread context to bind to, return now. + // + KMP_ASSERT(nActiveThreads > 0); + if (nActiveThreads == 1) { + __kmp_ncores = nPackages = 1; + __kmp_nThreadsPerCore = nCoresPerPkg = 1; + if (__kmp_affinity_verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); + + KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); + if (__kmp_affinity_respect_mask) { + KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); + } else { + KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); + } + KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); + KMP_INFORM(Uniform, "KMP_AFFINITY"); + KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, + __kmp_nThreadsPerCore, __kmp_ncores); + } + + if (__kmp_affinity_type == affinity_none) { + __kmp_free(retval); + KMP_CPU_FREE(oldMask); + return 0; + } + + // + // Form an Address object which only includes the package level. + // + Address addr(1); + addr.labels[0] = retval[0].first.labels[pkgLevel-1]; + retval[0].first = addr; + + if (__kmp_affinity_gran_levels < 0) { + __kmp_affinity_gran_levels = 0; + } + + if (__kmp_affinity_verbose) { + __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); + } + + *address2os = retval; + KMP_CPU_FREE(oldMask); + return 1; + } + + // + // Sort the table by physical Id. + // + qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels); + + // + // When affinity is off, this routine will still be called to set + // __kmp_ncores, as well as __kmp_nThreadsPerCore, + // nCoresPerPkg, & nPackages. Make sure all these vars are set + // correctly, and return if affinity is not enabled. + // + __kmp_ncores = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, coreLevel); + + // + // Check to see if the machine topology is uniform + // + unsigned npackages = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, pkgLevel); + unsigned ncores = __kmp_ncores; + unsigned nthreads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel); + unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads); + + // + // Print the machine topology summary. + // + if (__kmp_affinity_verbose) { + char mask[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); + + KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); + if (__kmp_affinity_respect_mask) { + KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); + } else { + KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); + } + KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); + if (uniform) { + KMP_INFORM(Uniform, "KMP_AFFINITY"); + } else { + KMP_INFORM(NonUniform, "KMP_AFFINITY"); + } + + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + + __kmp_str_buf_print(&buf, "%d", npackages); + //for (level = 1; level <= pkgLevel; level++) { + // __kmp_str_buf_print(&buf, " x %d", maxCt[level]); + // } + KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, + __kmp_nThreadsPerCore, __kmp_ncores); + + __kmp_str_buf_free(&buf); + } + + if (__kmp_affinity_type == affinity_none) { + KMP_CPU_FREE(oldMask); + return 0; + } + + // + // Find any levels with radiix 1, and remove them from the map + // (except for the package level). + // + int new_depth = 0; + int level; + unsigned proc; + for (level = 1; level < (int)depth; level++) { + if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) { + continue; + } + new_depth++; + } + + // + // If we are removing any levels, allocate a new vector to return, + // and copy the relevant information to it. + // + if (new_depth != depth-1) { + AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( + sizeof(AddrUnsPair) * nActiveThreads); + for (proc = 0; (int)proc < nActiveThreads; proc++) { + Address addr(new_depth); + new_retval[proc] = AddrUnsPair(addr, retval[proc].second); + } + int new_level = 0; + for (level = 1; level < (int)depth; level++) { + if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) { + if (level == threadLevel) { + threadLevel = -1; + } + else if ((threadLevel >= 0) && (level < threadLevel)) { + threadLevel--; + } + if (level == coreLevel) { + coreLevel = -1; + } + else if ((coreLevel >= 0) && (level < coreLevel)) { + coreLevel--; + } + if (level < pkgLevel) { + pkgLevel--; + } + continue; + } + for (proc = 0; (int)proc < nActiveThreads; proc++) { + new_retval[proc].first.labels[new_level] + = retval[proc].first.labels[level]; + } + new_level++; + } + + __kmp_free(retval); + retval = new_retval; + depth = new_depth; + } + + if (__kmp_affinity_gran_levels < 0) { + // + // Set the granularity level based on what levels are modeled + // in the machine topology map. + // + __kmp_affinity_gran_levels = 0; + if ((threadLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { + __kmp_affinity_gran_levels++; + } + if ((coreLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { + __kmp_affinity_gran_levels++; + } + if (__kmp_affinity_gran > affinity_gran_package) { + __kmp_affinity_gran_levels++; + } + } + + if (__kmp_affinity_verbose) { + __kmp_affinity_print_topology(retval, nActiveThreads, depth-1, pkgLevel-1, + coreLevel-1, threadLevel-1); + } + + KMP_CPU_FREE(oldMask); + *address2os = retval; + if(depth == 0) return 0; + else return depth-1; +} +#endif // KMP_USE_HWLOC // // If we don't know how to retrieve the machine's processor topology, or @@ -329,7 +659,7 @@ __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); int avail_ct = 0; unsigned int i; - for (i = 0; i < KMP_CPU_SETSIZE; ++i) { + KMP_CPU_SET_ITERATE(i, fullMask) { // // Skip this proc if it is not included in the machine model. // @@ -394,7 +724,7 @@ __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); int avail_ct = 0; int i; - for (i = 0; i < KMP_CPU_SETSIZE; ++i) { + KMP_CPU_SET_ITERATE(i, fullMask) { // // Skip this proc if it is not included in the machine model. // @@ -656,7 +986,7 @@ apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( __kmp_avail_proc * sizeof(apicThreadInfo)); unsigned nApics = 0; - for (i = 0; i < KMP_CPU_SETSIZE; ++i) { + KMP_CPU_SET_ITERATE(i, fullMask) { // // Skip this proc if it is not included in the machine model. // @@ -1167,7 +1497,7 @@ // unsigned int proc; int nApics = 0; - for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) { + KMP_CPU_SET_ITERATE(proc, fullMask) { // // Skip this proc if it is not included in the machine model. // @@ -2282,8 +2612,8 @@ maxOsId = osId; } } - kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( - (maxOsId + 1) * __kmp_affin_mask_size); + kmp_affin_mask_t *osId2Mask; + KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1)); // // Sort the address2os table according to physical order. Doing so @@ -2314,8 +2644,8 @@ unsigned j = 0; // index of 1st thread on core unsigned leader = 0; Address *leaderAddr = &(address2os[0].first); - kmp_affin_mask_t *sum - = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); + kmp_affin_mask_t *sum; + KMP_CPU_ALLOC_ON_STACK(sum); KMP_CPU_ZERO(sum); KMP_CPU_SET(address2os[0].second, sum); for (i = 1; i < numAddrs; i++) { @@ -2365,6 +2695,7 @@ address2os[j].first.leader = (j == leader); } unique++; + KMP_CPU_FREE_FROM_STACK(sum); *maxIndex = maxOsId; *numUnique = unique; @@ -2384,9 +2715,17 @@ #define ADD_MASK(_mask) \ { \ if (nextNewMask >= numNewMasks) { \ + int i; \ numNewMasks *= 2; \ - newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ - numNewMasks * __kmp_affin_mask_size); \ + kmp_affin_mask_t* temp; \ + KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ + for(i=0;i= '0') && (**scan <= '9')) { next = *scan; @@ -2866,17 +3208,23 @@ unsigned int *out_numMasks, const char *placelist, kmp_affin_mask_t *osId2Mask, int maxOsId) { + int i,j,count,stride,sign; const char *scan = placelist; const char *next = placelist; numNewMasks = 2; - newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks - * __kmp_affin_mask_size); + KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); nextNewMask = 0; - kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( - __kmp_affin_mask_size); + // tempMask is modified based on the previous or initial + // place to form the current place + // previousMask contains the previous place + kmp_affin_mask_t *tempMask; + kmp_affin_mask_t *previousMask; + KMP_CPU_ALLOC(tempMask); KMP_CPU_ZERO(tempMask); + KMP_CPU_ALLOC(previousMask); + KMP_CPU_ZERO(previousMask); int setSize = 0; for (;;) { @@ -2910,7 +3258,7 @@ "bad explicit places list"); next = scan; SKIP_DIGITS(next); - int count = __kmp_str_to_int(scan, *next); + count = __kmp_str_to_int(scan, *next); KMP_ASSERT(count >= 0); scan = next; @@ -2918,7 +3266,6 @@ // valid follow sets are ',' ':' and EOL // SKIP_WS(scan); - int stride; if (*scan == '\0' || *scan == ',') { stride = +1; } @@ -2929,7 +3276,7 @@ // // Read stride parameter // - int sign = +1; + sign = +1; for (;;) { SKIP_WS(scan); if (*scan == '+') { @@ -2954,66 +3301,30 @@ stride *= sign; } - if (stride > 0) { - int i; - for (i = 0; i < count; i++) { - int j; - if (setSize == 0) { - break; - } - ADD_MASK(tempMask); - setSize = 0; - for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { - if (! KMP_CPU_ISSET(j - stride, tempMask)) { - KMP_CPU_CLR(j, tempMask); - } - else if ((j > maxOsId) || - (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { - if ((__kmp_affinity_verbose || (__kmp_affinity_warnings - && (__kmp_affinity_type != affinity_none))) && i < count - 1) { - KMP_WARNING(AffIgnoreInvalidProcID, j); - } - KMP_CPU_CLR(j, tempMask); - } - else { - KMP_CPU_SET(j, tempMask); - setSize++; - } - } - for (; j >= 0; j--) { - KMP_CPU_CLR(j, tempMask); - } + // Add places determined by initial_place : count : stride + for (i = 0; i < count; i++) { + if (setSize == 0) { + break; } - } - else { - int i; - for (i = 0; i < count; i++) { - int j; - if (setSize == 0) { - break; + // Add the current place, then build the next place (tempMask) from that + KMP_CPU_COPY(previousMask, tempMask); + ADD_MASK(previousMask); + KMP_CPU_ZERO(tempMask); + setSize = 0; + KMP_CPU_SET_ITERATE(j, previousMask) { + if (! KMP_CPU_ISSET(j, previousMask)) { + continue; } - ADD_MASK(tempMask); - setSize = 0; - for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; - j++) { - if (! KMP_CPU_ISSET(j - stride, tempMask)) { - KMP_CPU_CLR(j, tempMask); - } - else if ((j > maxOsId) || - (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { - if ((__kmp_affinity_verbose || (__kmp_affinity_warnings - && (__kmp_affinity_type != affinity_none))) && i < count - 1) { - KMP_WARNING(AffIgnoreInvalidProcID, j); - } - KMP_CPU_CLR(j, tempMask); - } - else { - KMP_CPU_SET(j, tempMask); - setSize++; + else if ((j+stride > maxOsId) || (j+stride < 0) || + (! KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) { + if ((__kmp_affinity_verbose || (__kmp_affinity_warnings + && (__kmp_affinity_type != affinity_none))) && i < count - 1) { + KMP_WARNING(AffIgnoreInvalidProcID, j+stride); } } - for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { - KMP_CPU_CLR(j, tempMask); + else { + KMP_CPU_SET(j+stride, tempMask); + setSize++; } } } @@ -3038,14 +3349,18 @@ *out_numMasks = nextNewMask; if (nextNewMask == 0) { *out_masks = NULL; - KMP_INTERNAL_FREE(newMasks); + KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); return; } - *out_masks - = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); - KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); - __kmp_free(tempMask); - KMP_INTERNAL_FREE(newMasks); + KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); + KMP_CPU_FREE(tempMask); + KMP_CPU_FREE(previousMask); + for(i = 0; i < nextNewMask; i++) { + kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); + kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); + KMP_CPU_COPY(dest, src); + } + KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); } # endif /* OMP_40_ENABLED */ @@ -3140,7 +3455,7 @@ // processors that we know about on the machine. // if (fullMask == NULL) { - fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); + KMP_CPU_ALLOC(fullMask); } if (KMP_AFFINITY_CAPABLE()) { if (__kmp_affinity_respect_mask) { @@ -3151,7 +3466,7 @@ // unsigned i; __kmp_avail_proc = 0; - for (i = 0; i < KMP_CPU_SETSIZE; ++i) { + KMP_CPU_SET_ITERATE(i, fullMask) { if (! KMP_CPU_ISSET(i, fullMask)) { continue; } @@ -3193,39 +3508,60 @@ // const char *file_name = NULL; int line = 0; - -# if KMP_ARCH_X86 || KMP_ARCH_X86_64 - - if (__kmp_affinity_verbose) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); +# if KMP_USE_HWLOC + if (depth < 0) { + if (__kmp_affinity_verbose) { + KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); + } + if(!__kmp_hwloc_error) { + depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); + if (depth == 0) { + KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(address2os == NULL); + return; + } else if(depth < 0 && __kmp_affinity_verbose) { + KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); + } + } else if(__kmp_affinity_verbose) { + KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); + } } +# endif - file_name = NULL; - depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_ASSERT(__kmp_affinity_type == affinity_none); - KMP_ASSERT(address2os == NULL); - return; - } +# if KMP_ARCH_X86 || KMP_ARCH_X86_64 if (depth < 0) { if (__kmp_affinity_verbose) { - if (msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), - KMP_I18N_STR(DecodingLegacyAPIC)); - } - else { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); - } + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); } file_name = NULL; - depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); + depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); if (depth == 0) { KMP_ASSERT(__kmp_affinity_type == affinity_none); KMP_ASSERT(address2os == NULL); return; } + + if (depth < 0) { + if (__kmp_affinity_verbose) { + if (msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), + KMP_I18N_STR(DecodingLegacyAPIC)); + } + else { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); + } + } + + file_name = NULL; + depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); + if (depth == 0) { + KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(address2os == NULL); + return; + } + } } # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ @@ -3430,6 +3766,50 @@ KMP_ASSERT(address2os != NULL); } +# if KMP_USE_HWLOC + else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { + if (__kmp_affinity_verbose) { + KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); + } + depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); + if (depth == 0) { + KMP_ASSERT(__kmp_affinity_type == affinity_none); + KMP_ASSERT(address2os == NULL); + return; + } +# if KMP_DEBUG + AddrUnsPair *otheraddress2os = NULL; + int otherdepth = -1; +# if KMP_MIC + otherdepth = __kmp_affinity_create_apicid_map(&otheraddress2os, &msg_id); +# else + otherdepth = __kmp_affinity_create_x2apicid_map(&otheraddress2os, &msg_id); +# endif + if(otheraddress2os != NULL && address2os != NULL) { + int i; + unsigned arent_equal_flag = 0; + for(i=0;i<__kmp_avail_proc;i++) { + if(otheraddress2os[i] != address2os[i]) arent_equal_flag = 1; + } + if(arent_equal_flag) { + KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are different from APICID\n")); + KA_TRACE(10, ("__kmp_aux_affinity_initialize: APICID Table:\n")); + for(i=0;i<__kmp_avail_proc;i++) { + otheraddress2os[i].print(); __kmp_printf("\n"); + } + KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc Table:\n")); + for(i=0;i<__kmp_avail_proc;i++) { + address2os[i].print(); __kmp_printf("\n"); + } + } + else { + KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are same as APICID\n")); + } + } +# endif // KMP_DEBUG + } +# endif // KMP_USE_HWLOC + if (address2os == NULL) { if (KMP_AFFINITY_CAPABLE() && (__kmp_affinity_verbose || (__kmp_affinity_warnings @@ -3608,8 +3988,7 @@ } # endif - __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( - __kmp_affinity_num_masks * __kmp_affin_mask_size); + KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); // // Sort the address2os table according to the current setting of @@ -3679,7 +4058,7 @@ __kmp_affinity_uninitialize(void) { if (__kmp_affinity_masks != NULL) { - __kmp_free(__kmp_affinity_masks); + KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); __kmp_affinity_masks = NULL; } if (fullMask != NULL) { @@ -3909,7 +4288,7 @@ unsigned proc; int num_procs = 0; - for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) { + KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) { if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { continue; } @@ -4027,7 +4406,11 @@ } } - if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { + if ((proc < 0) +# if !KMP_USE_HWLOC + || ((unsigned)proc >= KMP_CPU_SETSIZE) +# endif + ) { return -1; } if (! KMP_CPU_ISSET(proc, fullMask)) { @@ -4063,7 +4446,11 @@ } } - if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { + if ((proc < 0) +# if !KMP_USE_HWLOC + || ((unsigned)proc >= KMP_CPU_SETSIZE) +# endif + ) { return -1; } if (! KMP_CPU_ISSET(proc, fullMask)) { @@ -4099,8 +4486,12 @@ } } - if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { - return 0; + if ((proc < 0) +# if !KMP_USE_HWLOC + || ((unsigned)proc >= KMP_CPU_SETSIZE) +# endif + ) { + return -1; } if (! KMP_CPU_ISSET(proc, fullMask)) { return 0; @@ -4137,7 +4528,8 @@ KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); - kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); + kmp_affin_mask_t *mask; + KMP_CPU_ALLOC_ON_STACK(mask); KMP_CPU_ZERO(mask); // Granularity == thread @@ -4158,9 +4550,11 @@ tid, buf); } __kmp_set_system_affinity( mask, TRUE ); + KMP_CPU_FREE_FROM_STACK(mask); } else { // Non-uniform topology - kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); + kmp_affin_mask_t *mask; + KMP_CPU_ALLOC_ON_STACK(mask); KMP_CPU_ZERO(mask); // Number of hyper threads per core in HT machine @@ -4334,6 +4728,7 @@ tid, buf); } __kmp_set_system_affinity( mask, TRUE ); + KMP_CPU_FREE_FROM_STACK(mask); } } Index: openmp/trunk/runtime/src/kmp_config.h.cmake =================================================================== --- openmp/trunk/runtime/src/kmp_config.h.cmake +++ openmp/trunk/runtime/src/kmp_config.h.cmake @@ -51,6 +51,8 @@ #cmakedefine01 LIBOMP_ENABLE_ASSERTIONS #define KMP_USE_ASSERT LIBOMP_ENABLE_ASSERTIONS #cmakedefine01 STUBS_LIBRARY +#cmakedefine01 LIBOMP_USE_HWLOC +#define KMP_USE_HWLOC LIBOMP_USE_HWLOC #define KMP_ARCH_STR "@LIBOMP_LEGAL_ARCH@" #define KMP_LIBRARY_FILE "@LIBOMP_LIB_FILE@" #define KMP_VERSION_MAJOR @LIBOMP_VERSION_MAJOR@ Index: openmp/trunk/runtime/src/kmp_ftn_entry.h =================================================================== --- openmp/trunk/runtime/src/kmp_ftn_entry.h +++ openmp/trunk/runtime/src/kmp_ftn_entry.h @@ -257,7 +257,7 @@ return 0; } - #if KMP_GROUP_AFFINITY + #if KMP_GROUP_AFFINITY && !KMP_USE_HWLOC if ( __kmp_num_proc_groups > 1 ) { return (int)KMP_CPU_SETSIZE; } @@ -278,7 +278,11 @@ if ( ! TCR_4(__kmp_init_middle) ) { __kmp_middle_initialize(); } + # if KMP_USE_HWLOC + *mask = (hwloc_cpuset_t)hwloc_bitmap_alloc(); + # else *mask = kmpc_malloc( __kmp_affin_mask_size ); + # endif KMP_CPU_ZERO( (kmp_affin_mask_t *)(*mask) ); #endif } @@ -300,7 +304,11 @@ KMP_FATAL( AffinityInvalidMask, "kmp_destroy_affinity_mask" ); } } + # if KMP_USE_HWLOC + hwloc_bitmap_free((hwloc_cpuset_t)(*mask)); + # else kmpc_free( *mask ); + # endif *mask = NULL; #endif } Index: openmp/trunk/runtime/src/kmp_global.c =================================================================== --- openmp/trunk/runtime/src/kmp_global.c +++ openmp/trunk/runtime/src/kmp_global.c @@ -33,6 +33,10 @@ // gives reference tick for all events (considered the 0 tick) tsc_tick_count __kmp_stats_start_time; #endif +#if KMP_USE_HWLOC +int __kmp_hwloc_error = FALSE; +hwloc_topology_t __kmp_hwloc_topology = NULL; +#endif /* ----------------------------------------------------- */ /* INITIALIZATION VARIABLES */ Index: openmp/trunk/runtime/src/kmp_settings.c =================================================================== --- openmp/trunk/runtime/src/kmp_settings.c +++ openmp/trunk/runtime/src/kmp_settings.c @@ -3009,6 +3009,11 @@ else if ( __kmp_str_match( "flat", 1, value ) ) { __kmp_affinity_top_method = affinity_top_method_flat; } +# if KMP_USE_HWLOC + else if ( __kmp_str_match( "hwloc", 1, value) ) { + __kmp_affinity_top_method = affinity_top_method_hwloc; + } +# endif else { KMP_WARNING( StgInvalidValue, name, value ); } @@ -5119,11 +5124,43 @@ // affinity. // const char *var = "KMP_AFFINITY"; +# if KMP_USE_HWLOC + if(hwloc_topology_init(&__kmp_hwloc_topology) < 0) { + __kmp_hwloc_error = TRUE; + if(__kmp_affinity_verbose) + KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()"); + } + hwloc_topology_ignore_type(__kmp_hwloc_topology, HWLOC_OBJ_CACHE); +# endif if ( __kmp_affinity_type == affinity_disabled ) { KMP_AFFINITY_DISABLE(); } else if ( ! KMP_AFFINITY_CAPABLE() ) { +# if KMP_USE_HWLOC + const hwloc_topology_support* topology_support = hwloc_topology_get_support(__kmp_hwloc_topology); + if(hwloc_topology_load(__kmp_hwloc_topology) < 0) { + __kmp_hwloc_error = TRUE; + if(__kmp_affinity_verbose) + KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()"); + } + // Is the system capable of setting/getting this thread's affinity? + // also, is topology discovery possible? (pu indicates ability to discover processing units) + // and finally, were there no errors when calling any hwloc_* API functions? + if(topology_support->cpubind->set_thisthread_cpubind && + topology_support->cpubind->get_thisthread_cpubind && + topology_support->discovery->pu && + !__kmp_hwloc_error) + { + // enables affinity according to KMP_AFFINITY_CAPABLE() macro + KMP_AFFINITY_ENABLE(TRUE); + } else { + // indicate that hwloc didn't work and disable affinity + __kmp_hwloc_error = TRUE; + KMP_AFFINITY_DISABLE(); + } +# else __kmp_affinity_determine_capable( var ); +# endif // KMP_USE_HWLOC if ( ! KMP_AFFINITY_CAPABLE() ) { if ( __kmp_affinity_verbose || ( __kmp_affinity_warnings && ( __kmp_affinity_type != affinity_default ) Index: openmp/trunk/runtime/src/z_Linux_util.c =================================================================== --- openmp/trunk/runtime/src/z_Linux_util.c +++ openmp/trunk/runtime/src/z_Linux_util.c @@ -175,8 +175,11 @@ { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); - +#if KMP_USE_HWLOC + int retval = hwloc_set_cpubind(__kmp_hwloc_topology, (hwloc_cpuset_t)mask, HWLOC_CPUBIND_THREAD); +#else int retval = syscall( __NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask ); +#endif if (retval >= 0) { return 0; } @@ -198,7 +201,11 @@ KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); +#if KMP_USE_HWLOC + int retval = hwloc_get_cpubind(__kmp_hwloc_topology, (hwloc_cpuset_t)mask, HWLOC_CPUBIND_THREAD); +#else int retval = syscall( __NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask ); +#endif if (retval >= 0) { return 0; } @@ -220,10 +227,12 @@ KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); - kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); + kmp_affin_mask_t *mask; + KMP_CPU_ALLOC_ON_STACK(mask); KMP_CPU_ZERO(mask); KMP_CPU_SET(which, mask); __kmp_set_system_affinity(mask, TRUE); + KMP_CPU_FREE_FROM_STACK(mask); } /* Index: openmp/trunk/runtime/test/CMakeLists.txt =================================================================== --- openmp/trunk/runtime/test/CMakeLists.txt +++ openmp/trunk/runtime/test/CMakeLists.txt @@ -1,12 +1,23 @@ # CMakeLists.txt file for unit testing OpenMP Library include(FindPythonInterp) include(CheckTypeSize) + if(NOT PYTHONINTERP_FOUND) libomp_warning_say("Could not find Python.") libomp_warning_say("The check-libomp target will not be available!") return() endif() +macro(pythonize_bool var) + if (${var}) + set(${var} True) + else() + set(${var} False) + endif() +endmacro() + +pythonize_bool(LIBOMP_USE_HWLOC) + set(LIBOMP_TEST_CFLAGS "" CACHE STRING "Extra compiler flags to send to the test compiler") Index: openmp/trunk/runtime/test/lit.cfg =================================================================== --- openmp/trunk/runtime/test/lit.cfg +++ openmp/trunk/runtime/test/lit.cfg @@ -9,11 +9,20 @@ config = object() lit_config = object() -def append_dynamic_library_path(name, value, sep): +def append_dynamic_library_path(path): + if config.operating_system == 'Windows': + name = 'PATH' + sep = ';' + elif config.operating_system == 'Darwin': + name = 'DYLD_LIBRARY_PATH' + sep = ':' + else: + name = 'LD_LIBRARY_PATH' + sep = ':' if name in config.environment: - config.environment[name] = value + sep + config.environment[name] + config.environment[name] = path + sep + config.environment[name] else: - config.environment[name] = value + config.environment[name] = path # name: The name of this test suite. config.name = 'libomp' @@ -38,13 +47,15 @@ " " + config.test_extra_cflags # Setup environment to find dynamic library at runtime -if config.operating_system == 'Windows': - append_dynamic_library_path('PATH', config.library_dir, ";") -elif config.operating_system == 'Darwin': - append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":") +append_dynamic_library_path(config.library_dir) +if config.using_hwloc: + append_dynamic_library_path(config.hwloc_library_dir) + +# Rpath modifications for Darwin +if config.operating_system == 'Darwin': config.test_cflags += " -Wl,-rpath," + config.library_dir -else: # Unices - append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":") + if config.using_hwloc: + config.test_cflags += " -Wl,-rpath," + config.hwloc_library_dir # substitutions config.substitutions.append(("%libomp-compile-and-run", \ Index: openmp/trunk/runtime/test/lit.site.cfg.in =================================================================== --- openmp/trunk/runtime/test/lit.site.cfg.in +++ openmp/trunk/runtime/test/lit.site.cfg.in @@ -7,6 +7,8 @@ config.library_dir = "@LIBOMP_LIBRARY_DIR@" config.omp_header_directory = "@LIBOMP_BINARY_DIR@/src" config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@" +config.using_hwloc = @LIBOMP_USE_HWLOC@ # Let the main config do the real work. lit_config.load_config(config, "@LIBOMP_BASE_DIR@/test/lit.cfg")