Index: runtime/src/kmp_affinity.cpp =================================================================== --- runtime/src/kmp_affinity.cpp +++ runtime/src/kmp_affinity.cpp @@ -3964,7 +3964,7 @@ case affinity_balanced: // Balanced works only for the case of a single package - if( nPackages > 1 ) { + if( depth <= 1 ) { if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); } @@ -3984,7 +3984,12 @@ if( nth_per_core > 1 ) { core_level = depth - 2; } else { - core_level = depth - 1; + if(( nPackages > 1) && ( nCoresPerPkg > 1)) { + core_level = depth - 2; + nth_per_core = nCoresPerPkg; + } else { + core_level = depth - 1; + } } int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; int nproc = nth_per_core * ncores; @@ -4556,6 +4561,26 @@ // Dynamic affinity settings - Affinity balanced void __kmp_balanced_affinity( int tid, int nthreads ) { + bool fine_gran = true; + + switch (__kmp_affinity_gran) { + case affinity_gran_fine: + case affinity_gran_thread: + break; + case affinity_gran_core: + if( __kmp_nThreadsPerCore > 1) { + fine_gran = false; + } + break; + case affinity_gran_package: + if( nCoresPerPkg > 1) { + fine_gran = false; + } + break; + default: + fine_gran = false; + } + if( __kmp_affinity_uniform_topology() ) { int coreID; int threadID; @@ -4563,6 +4588,10 @@ int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; // Number of cores int ncores = __kmp_ncores; + if(( nPackages > 1) && ( __kmp_nth_per_core <= 1)) { + __kmp_nth_per_core = __kmp_avail_proc / nPackages; + ncores = nPackages; + } // How many threads will be bound to each core int chunk = nthreads / ncores; // How many cores will have an additional thread bound to it - "big cores" @@ -4584,11 +4613,10 @@ KMP_CPU_ALLOC_ON_STACK(mask); KMP_CPU_ZERO(mask); - // Granularity == thread - if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { + if( fine_gran) { int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second; KMP_CPU_SET( osID, mask); - } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core + } else { for( int i = 0; i < __kmp_nth_per_core; i++ ) { int osID; osID = address2os[ coreID * __kmp_nth_per_core + i ].second; @@ -4615,7 +4643,12 @@ if( nth_per_core > 1 ) { core_level = __kmp_aff_depth - 2; } else { - core_level = __kmp_aff_depth - 1; + if(( nPackages > 1) && ( nCoresPerPkg > 1)) { + nth_per_core = nCoresPerPkg; + core_level = __kmp_aff_depth - 2; + } else { + core_level = __kmp_aff_depth - 1; + } } // Number of cores - maximum value; it does not count trail cores with 0 processors @@ -4623,10 +4656,10 @@ // For performance gain consider the special case nthreads == __kmp_avail_proc if( nthreads == __kmp_avail_proc ) { - if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { + if( fine_gran) { int osID = address2os[ tid ].second; KMP_CPU_SET( osID, mask); - } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core + } else { int coreID = address2os[ tid ].first.labels[ core_level ]; // We'll count found osIDs for the current core; they can be not more than nth_per_core; // since the address2os is sortied we can break when cnt==nth_per_core @@ -4643,7 +4676,7 @@ } } } - } else if( nthreads <= __kmp_ncores ) { + } else if( nthreads <= ncores ) { int core = 0; for( int i = 0; i < ncores; i++ ) { @@ -4661,8 +4694,8 @@ int osID = procarr[ i * nth_per_core + j ]; if( osID != -1 ) { KMP_CPU_SET( osID, mask ); - // For granularity=thread it is enough to set the first available osID for this core - if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { + // For fine granularity it is enough to set the first available osID for this core + if( fine_gran) { break; } } @@ -4674,7 +4707,7 @@ } } - } else { // nthreads > __kmp_ncores + } else { // nthreads > ncores // Array to save the number of processors at each core int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores); @@ -4754,11 +4787,10 @@ for( int i = 0; i < nproc; i++ ) { sum += newarr[ i ]; if( sum > tid ) { - // Granularity == thread - if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { + if( fine_gran) { int osID = procarr[ i ]; KMP_CPU_SET( osID, mask); - } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core + } else { int coreID = i / nth_per_core; for( int ii = 0; ii < nth_per_core; ii++ ) { int osID = procarr[ coreID * nth_per_core + ii ];