diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2874,6 +2874,9 @@
   kmp_lock_t r_begin_lock;
   volatile int r_begin;
   int r_blocktime; /* blocktime for this root and descendants */
+#if KMP_AFFINITY_SUPPORTED
+  int r_affinity_assigned;
+#endif // KMP_AFFINITY_SUPPORTED
 } kmp_base_root_t;
 
 typedef union KMP_ALIGN_CACHE kmp_root {
@@ -3495,6 +3498,19 @@
 #if KMP_OS_LINUX || KMP_OS_FREEBSD
 extern int kmp_set_thread_affinity_mask_initial(void);
 #endif
+// Lazily apply the initial affinity mask to a root thread: if the caller is
+// the uber thread of its root and r_affinity_assigned is still unset, call
+// __kmp_affinity_set_init_mask() and set the flag. Otherwise do nothing.
+static inline void __kmp_assign_root_init_mask() {
+  int gtid = __kmp_entry_gtid();
+  kmp_root_t *r = __kmp_threads[gtid]->th.th_root;
+  if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) {
+    __kmp_affinity_set_init_mask(gtid, TRUE);
+    r->r.r_affinity_assigned = TRUE;
+  }
+}
+#else /* KMP_AFFINITY_SUPPORTED */
+#define __kmp_assign_root_init_mask() /* Nothing */
 #endif /* KMP_AFFINITY_SUPPORTED */
 // No need for KMP_AFFINITY_SUPPORTED guard as only one field in the
 // format string is for affinity, so platforms that do not support
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -39,6 +39,7 @@
   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
       __kmp_str_match_true(env)) {
     __kmp_middle_initialize();
+    __kmp_assign_root_init_mask();
     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
   } else if (__kmp_ignore_mppbeg() == FALSE) {
     // By default __kmp_ignore_mppbeg() returns TRUE.
@@ -2023,6 +2024,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   gtid = __kmp_get_gtid();
   __kmp_aux_display_affinity(gtid, format);
 }
@@ -2035,6 +2037,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   gtid = __kmp_get_gtid();
   __kmp_str_buf_init(&capture_buf);
   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
@@ -2093,6 +2096,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_set_affinity_mask_proc(proc, mask);
 #endif
 }
@@ -2104,6 +2108,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
 #endif
 }
@@ -2115,6 +2120,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_get_affinity_mask_proc(proc, mask);
 #endif
 }
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -217,6 +217,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_set_affinity(mask);
 #endif
 }
@@ -228,6 +229,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_get_affinity(mask);
 #endif
 }
@@ -240,6 +242,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_get_affinity_max_proc();
 #endif
 }
@@ -253,6 +256,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   mask_internals = __kmp_affinity_dispatch->allocate_mask();
   KMP_CPU_ZERO(mask_internals);
   *mask = mask_internals;
@@ -268,6 +272,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   if (__kmp_env_consistency_check) {
     if (*mask == NULL) {
       KMP_FATAL(AffinityInvalidMask, "kmp_destroy_affinity_mask");
@@ -286,6 +291,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_set_affinity_mask_proc(KMP_DEREF proc, mask);
 #endif
 }
@@ -297,6 +303,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_unset_affinity_mask_proc(KMP_DEREF proc, mask);
 #endif
 }
@@ -308,6 +315,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_aux_get_affinity_mask_proc(KMP_DEREF proc, mask);
 #endif
 }
@@ -342,6 +350,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   gtid = __kmp_entry_gtid();
   thread = __kmp_threads[gtid];
   // return thread -> th.th_team -> t.t_current_task[
@@ -487,6 +496,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   gtid = __kmp_get_gtid();
   ConvertedString cformat(format, size);
   __kmp_aux_display_affinity(gtid, cformat.get());
@@ -514,6 +524,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   gtid = __kmp_get_gtid();
   __kmp_str_buf_init(&capture_buf);
   ConvertedString cformat(format, for_size);
@@ -590,6 +601,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   return __kmp_avail_proc;
 #endif
 }
@@ -779,6 +791,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   if (!KMP_AFFINITY_CAPABLE())
     return 0;
   return __kmp_affinity_num_masks;
@@ -794,6 +807,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   if (!KMP_AFFINITY_CAPABLE())
     return 0;
   if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
@@ -819,6 +833,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   if (!KMP_AFFINITY_CAPABLE())
     return;
   if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
@@ -844,6 +859,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   if (!KMP_AFFINITY_CAPABLE())
     return -1;
   gtid = __kmp_entry_gtid();
@@ -863,6 +879,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   if (!KMP_AFFINITY_CAPABLE())
     return 0;
   gtid = __kmp_entry_gtid();
@@ -889,6 +906,7 @@
   if (!TCR_4(__kmp_init_middle)) {
     __kmp_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   if (!KMP_AFFINITY_CAPABLE())
     return;
   gtid = __kmp_entry_gtid();
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -1412,6 +1412,9 @@
     }
 #endif
 
+    // Assign affinity to root thread if it hasn't happened yet
+    __kmp_assign_root_init_mask();
+
     // Nested level will be an index in the nested nthreads array
     level = parent_team->t.t_level;
     // used to launch non-serial teams even if nested is not allowed
@@ -3171,6 +3174,9 @@
   root->r.r_active = FALSE;
   root->r.r_in_parallel = 0;
   root->r.r_blocktime = __kmp_dflt_blocktime;
+#if KMP_AFFINITY_SUPPORTED
+  root->r.r_affinity_assigned = FALSE;
+#endif
 
   /* setup the root team for this task */
   /* allocate the root team structure */
@@ -3816,9 +3822,6 @@
   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
-  if (TCR_4(__kmp_init_middle)) {
-    __kmp_affinity_set_init_mask(gtid, TRUE);
-  }
 #endif /* KMP_AFFINITY_SUPPORTED */
   root_thread->th.th_def_allocator = __kmp_def_allocator;
   root_thread->th.th_prev_level = 0;
@@ -7037,13 +7040,6 @@
   // number of cores on the machine.
   __kmp_affinity_initialize();
 
-  // Run through the __kmp_threads array and set the affinity mask
-  // for each root thread that is currently registered with the RTL.
-  for (i = 0; i < __kmp_threads_capacity; i++) {
-    if (TCR_PTR(__kmp_threads[i]) != NULL) {
-      __kmp_affinity_set_init_mask(i, TRUE);
-    }
-  }
 #endif /* KMP_AFFINITY_SUPPORTED */
 
   KMP_ASSERT(__kmp_xproc > 0);
@@ -7165,6 +7161,7 @@
   if (!__kmp_init_middle) {
     __kmp_do_middle_initialize();
   }
+  __kmp_assign_root_init_mask();
   __kmp_resume_if_hard_paused();
 
   /* begin initialization */
@@ -7471,6 +7468,7 @@
   // Remember the number of threads for inner parallel regions
   if (!TCR_4(__kmp_init_middle))
     __kmp_middle_initialize(); // get internal globals calculated
+  __kmp_assign_root_init_mask();
   KMP_DEBUG_ASSERT(__kmp_avail_proc);
   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
 
diff --git a/openmp/runtime/test/affinity/root-threads-affinity.c b/openmp/runtime/test/affinity/root-threads-affinity.c
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/affinity/root-threads-affinity.c
@@ -0,0 +1,212 @@
+// RUN: %libomp-compile && env LIBOMP_NUM_HIDDEN_HELPER_THREADS=0 OMP_PROC_BIND=close OMP_PLACES=cores KMP_AFFINITY=verbose %libomp-run 8 1 4
+// REQUIRES: linux
+//
+// This test pthread_creates 8 root threads before any OpenMP
+// runtime entry is ever called. We have all the root threads
+// register with the runtime by calling omp_set_num_threads(),
+// but this does not initialize their affinity. Root thread 4 (the spawner)
+// then calls a parallel region and we make sure its affinity
+// is correct. We also make sure all the other root threads are
+// free-floating since they have not called into a parallel region.
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <omp.h>
+#include "libomp_test_affinity.h"
+
+volatile int entry_flag = 0;
+volatile int flag = 0;
+volatile int num_roots_arrived = 0;
+int num_roots;
+int spawner = 0;
+pthread_mutex_t lock;
+int register_workers = 0; // boolean
+affinity_mask_t *full_mask;
+
+int __kmpc_global_thread_num(void*);
+
+int get_os_thread_id() {
+  return (int)syscall(SYS_gettid);
+}
+
+// Returns 1 when the CPUs of the calling thread's OpenMP place are exactly
+// the CPUs set in its OS affinity mask, 0 otherwise.
+int place_and_affinity_match() {
+  int i, j, max_cpu;
+  int match = 1;
+  char buf[512];
+  affinity_mask_t *mask = affinity_mask_alloc();
+  int place = omp_get_place_num();
+  int num_procs = omp_get_place_num_procs(place);
+  int *ids = (int*)malloc(sizeof(int) * num_procs);
+  omp_get_place_proc_ids(place, ids);
+  get_thread_affinity(mask);
+  affinity_mask_snprintf(buf, sizeof(buf), mask);
+  printf("Primary Thread Place: %d\n", place);
+  printf("Primary Thread mask: %s\n", buf);
+
+  // Every CPU of the place must be set in the affinity mask
+  for (i = 0; match && i < num_procs; ++i) {
+    if (!affinity_mask_isset(mask, ids[i]))
+      match = 0;
+  }
+
+  // Every CPU set in the affinity mask must belong to the place
+  max_cpu = AFFINITY_MAX_CPUS;
+  for (i = 0; match && i < max_cpu; ++i) {
+    if (affinity_mask_isset(mask, i)) {
+      int found = 0;
+      for (j = 0; j < num_procs; ++j) {
+        if (ids[j] == i) {
+          found = 1;
+          break;
+        }
+      }
+      if (!found)
+        match = 0;
+    }
+  }
+
+  // Release the mask and id list on every path, including mismatches
+  affinity_mask_free(mask);
+  free(ids);
+  return match;
+}
+
+void* thread_func(void *arg) {
+  int root_id = *((int*)arg);
+  int pid = getpid();
+  int tid = get_os_thread_id();
+
+  // Order how the root threads are assigned a gtid in the runtime
+  // i.e., root_id = gtid
+  while (1) {
+    int v = entry_flag;
+    if (v == root_id)
+      break;
+  }
+
+  // If main root thread
+  if (root_id == spawner) {
+    printf("Initial application thread (pid=%d, tid=%d, spawner=%d) reached thread_func (will call OpenMP)\n", pid, tid, spawner);
+    omp_set_num_threads(4);
+    #pragma omp atomic
+    entry_flag++;
+    // Wait for the workers to signal their arrival before #pragma omp parallel
+    while (num_roots_arrived < num_roots - 1) {}
+    // This will trigger the output for KMP_AFFINITY in this case
+    #pragma omp parallel
+    {
+      int gtid = __kmpc_global_thread_num(NULL);
+      #pragma omp single
+      {
+        printf("Exactly %d threads in the #pragma omp parallel\n",
+               omp_get_num_threads());
+      }
+      #pragma omp critical
+      {
+        printf("OpenMP thread %d: gtid=%d\n", omp_get_thread_num(), gtid);
+      }
+    }
+    flag = 1;
+    if (!place_and_affinity_match()) {
+      fprintf(stderr, "error: place and affinity mask do not match for primary thread\n");
+      exit(EXIT_FAILURE);
+    }
+
+  } else { // If worker root thread
+    // Worker root threads, register with OpenMP through omp_set_num_threads()
+    // if designated to, signal their arrival and then wait for the main root
+    // thread to signal them to exit.
+    printf("New root pthread (pid=%d, tid=%d) reached thread_func\n", pid, tid);
+    if (register_workers)
+      omp_set_num_threads(4);
+    #pragma omp atomic
+    entry_flag++;
+
+    pthread_mutex_lock(&lock);
+    num_roots_arrived++;
+    pthread_mutex_unlock(&lock);
+    while (flag == 0) {}
+
+    // Main check whether root threads' mask is equal to the
+    // initial affinity mask
+    affinity_mask_t *mask = affinity_mask_alloc();
+    get_thread_affinity(mask);
+    if (!affinity_mask_equal(mask, full_mask)) {
+      char buf[1024];
+      affinity_mask_snprintf(buf, sizeof(buf), mask);
+      printf("root thread %d mask: %s\n", root_id, buf);
+      affinity_mask_snprintf(buf, sizeof(buf), full_mask);
+      printf("initial affinity mask: %s\n", buf);
+      fprintf(stderr, "error: root thread %d affinity mask not equal"
+                      " to initial full mask\n", root_id);
+      affinity_mask_free(mask);
+      exit(EXIT_FAILURE);
+    }
+    affinity_mask_free(mask);
+  }
+  return NULL;
+}
+
+int main(int argc, char** argv) {
+  int i;
+  if (argc != 3 && argc != 4) {
+    fprintf(stderr, "usage: %s <num_roots> <register_workers> [<spawner>]\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  // Initialize pthread mutex
+  pthread_mutex_init(&lock, NULL);
+
+  // Get initial full mask
+  full_mask = affinity_mask_alloc();
+  get_thread_affinity(full_mask);
+
+  // Get the number of root pthreads to create
+  num_roots = atoi(argv[1]);
+
+  // Get the flag indicating whether to have root pthreads call omp_set_num_threads() or not
+  register_workers = atoi(argv[2]);
+
+  if (argc == 4)
+    spawner = atoi(argv[3]);
+
+  // The spawner must be one of the roots; otherwise no root ever sets `flag`
+  // and every root spins forever
+  if (num_roots < 1 || spawner < 0 || spawner >= num_roots) {
+    fprintf(stderr, "error: spawner must be in [0, num_roots): num_roots=%d, "
+                    "spawner=%d\n", num_roots, spawner);
+    exit(EXIT_FAILURE);
+  }
+
+  // Allocate resources for the root pthreads
+  pthread_t *roots = (pthread_t*)malloc(sizeof(pthread_t) * num_roots);
+  int *root_ids = (int*)malloc(sizeof(int) * num_roots);
+
+  // Spawn worker root threads
+  for (i = 1; i < num_roots; ++i) {
+    *(root_ids + i) = i;
+    pthread_create(roots + i, NULL, thread_func, root_ids + i);
+  }
+  // Have main root thread (root 0) go into thread_func
+  *root_ids = 0;
+  thread_func(root_ids);
+
+  // Cleanup all resources
+  for (i = 1; i < num_roots; ++i) {
+    void *status;
+    pthread_join(roots[i], &status);
+  }
+  free(roots);
+  free(root_ids);
+  affinity_mask_free(full_mask);
+  pthread_mutex_destroy(&lock);
+  return EXIT_SUCCESS;
+}