Index: runtime/src/kmp_affinity.cpp
===================================================================
--- runtime/src/kmp_affinity.cpp
+++ runtime/src/kmp_affinity.cpp
@@ -3524,6 +3524,86 @@
     }
 }
 
+//
+// This function figures out the deepest level at which there is at least one cluster/core
+// with more than one processing unit bound to it.
+//
+static int
+__kmp_affinity_find_core_level(const AddrUnsPair *address2os, int nprocs, int bottom_level)
+{
+    int core_level = 0;
+
+    for( int i = 0; i < nprocs; i++ ) {
+        for( int j = bottom_level; j > 0; j-- ) {
+            if( address2os[i].first.labels[j] > 0 ) {
+                if( core_level < ( j - 1 ) ) {
+                    core_level = j - 1;
+                }
+            }
+        }
+    }
+    return core_level;
+}
+
+//
+// This function counts number of clusters/cores at given level.
+//
+static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
+{
+    int ncores = 0;
+    int i, j;
+
+    j = bottom_level;
+    for( i = 0; i < nprocs; i++ ) {
+        for ( j = bottom_level; j > core_level; j-- ) {
+            if( ( i + 1 ) < nprocs ) {
+                if( address2os[i + 1].first.labels[j] > 0 ) {
+                    break;
+                }
+            }
+        }
+        if( j == core_level ) {
+            ncores++;
+        }
+    }
+    if( j > core_level ) {
+        //
+        // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one core.
+        // May occur when called from __kmp_affinity_find_core().
+        //
+        ncores++;
+    }
+    return ncores;
+}
+
+//
+// This function finds to which cluster/core given processing unit is bound.
+//
+static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, int bottom_level, int core_level)
+{
+    return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, core_level) - 1;
+}
+
+//
+// This function finds maximal number of processing units bound to a cluster/core at given level.
+//
+static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level)
+{
+    int maxprocpercore = 0;
+
+    if( core_level < bottom_level ) {
+        for( int i = 0; i < nprocs; i++ ) {
+            int percore = address2os[i].first.labels[core_level + 1] + 1;
+
+            if( percore > maxprocpercore ) {
+                maxprocpercore = percore;
+            }
+       }
+    } else {
+        maxprocpercore = 1;
+    }
+    return maxprocpercore;
+}
 
 static AddrUnsPair *address2os = NULL;
 static int           * procarr = NULL;
@@ -3963,8 +4043,7 @@
         goto sortAddresses;
 
         case affinity_balanced:
-        // Balanced works only for the case of a single package
-        if( nPackages > 1 ) {
+        if( depth <= 1 ) {
             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
             }
@@ -3977,39 +4056,38 @@
             // Save the depth for further usage
             __kmp_aff_depth = depth;
 
-            // Number of hyper threads per core in HT machine
-            int nth_per_core = __kmp_nThreadsPerCore;
+            int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, depth - 1);
+            int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, depth - 1, core_level);
+            int maxprocpercore = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, depth - 1, core_level);
 
-            int core_level;
-            if( nth_per_core > 1 ) {
-                core_level = depth - 2;
-            } else {
-                core_level = depth - 1;
+            int nproc = ncores * maxprocpercore;
+            if( ( nproc < 2 ) || ( nproc < __kmp_avail_proc ) ) {
+                if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
+                    KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
+                }
+                __kmp_affinity_type = affinity_none;
+                return;
             }
-            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
-            int nproc = nth_per_core * ncores;
 
             procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
             for( int i = 0; i < nproc; i++ ) {
                 procarr[ i ] = -1;
             }
 
+            int lastcore = -1;
+            int inlastcore = 0;
             for( int i = 0; i < __kmp_avail_proc; i++ ) {
                 int proc = address2os[ i ].second;
-                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
-                // If there is only one thread per core then depth == 2: level 0 - package,
-                // level 1 - core.
-                int level = depth - 1;
-
-                // __kmp_nth_per_core == 1
-                int thread = 0;
-                int core = address2os[ i ].first.labels[ level ];
-                // If the thread level exists, that is we have more than one thread context per core
-                if( nth_per_core > 1 ) {
-                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
-                    core = address2os[ i ].first.labels[ level - 1 ];
+                int core = __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
+
+                if ( core == lastcore ) {
+                    inlastcore++;
+                } else {
+                    inlastcore = 0;
                 }
-                procarr[ core * nth_per_core + thread ] = proc;
+                lastcore = core;
+
+                procarr[ core * maxprocpercore + inlastcore ] = proc;
             }
 
             break;
@@ -4556,6 +4634,26 @@
 // Dynamic affinity settings - Affinity balanced
 void __kmp_balanced_affinity( int tid, int nthreads )
 {
+    bool fine_gran = true;
+
+    switch (__kmp_affinity_gran) {
+        case affinity_gran_fine:
+        case affinity_gran_thread:
+            break;
+        case affinity_gran_core:
+            if( __kmp_nThreadsPerCore > 1) {
+                fine_gran = false;
+            }
+            break;
+        case affinity_gran_package:
+            if( nCoresPerPkg > 1) {
+                fine_gran = false;
+            }
+            break;
+        default:
+            fine_gran = false;
+    }
+
     if( __kmp_affinity_uniform_topology() ) {
         int coreID;
         int threadID;
@@ -4563,6 +4661,10 @@
         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
         // Number of cores
         int ncores = __kmp_ncores;
+        if( ( nPackages > 1 ) && ( __kmp_nth_per_core <= 1 ) ) {
+            __kmp_nth_per_core = __kmp_avail_proc / nPackages;
+            ncores = nPackages;
+        }
         // How many threads will be bound to each core
         int chunk = nthreads / ncores;
         // How many cores will have an additional thread bound to it - "big cores"
@@ -4584,11 +4686,10 @@
         KMP_CPU_ALLOC_ON_STACK(mask);
         KMP_CPU_ZERO(mask);
 
-        // Granularity == thread
-        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+        if( fine_gran ) {
             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
             KMP_CPU_SET( osID, mask);
-        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+        } else {
             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                 int osID;
                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
@@ -4609,41 +4710,25 @@
         KMP_CPU_ALLOC_ON_STACK(mask);
         KMP_CPU_ZERO(mask);
 
-        // Number of hyper threads per core in HT machine
-        int nth_per_core = __kmp_nThreadsPerCore;
-        int core_level;
-        if( nth_per_core > 1 ) {
-            core_level = __kmp_aff_depth - 2;
-        } else {
-            core_level = __kmp_aff_depth - 1;
-        }
-
-        // Number of cores - maximum value; it does not count trail cores with 0 processors
-        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
+        int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
+        int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
+        int nth_per_core = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
 
         // For performance gain consider the special case nthreads == __kmp_avail_proc
         if( nthreads == __kmp_avail_proc ) {
-            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+            if( fine_gran ) {
                 int osID = address2os[ tid ].second;
                 KMP_CPU_SET( osID, mask);
-            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
-                int coreID = address2os[ tid ].first.labels[ core_level ];
-                // We'll count found osIDs for the current core; they can be not more than nth_per_core;
-                // since the address2os is sortied we can break when cnt==nth_per_core
-                int cnt = 0;
+            } else {
+                int core = __kmp_affinity_find_core(address2os, tid, __kmp_aff_depth - 1, core_level);
                 for( int i = 0; i < __kmp_avail_proc; i++ ) {
                     int osID = address2os[ i ].second;
-                    int core = address2os[ i ].first.labels[ core_level ];
-                    if( core == coreID ) {
+                    if( __kmp_affinity_find_core(address2os, i,  __kmp_aff_depth - 1, core_level) == core ) {
                         KMP_CPU_SET( osID, mask);
-                        cnt++;
-                        if( cnt == nth_per_core ) {
-                            break;
-                        }
                     }
                 }
             }
-        } else if( nthreads <= __kmp_ncores ) {
+        } else if( nthreads <= ncores ) {
 
             int core = 0;
             for( int i = 0; i < ncores; i++ ) {
@@ -4661,8 +4746,8 @@
                             int osID = procarr[ i * nth_per_core + j ];
                             if( osID != -1 ) {
                                 KMP_CPU_SET( osID, mask );
-                                // For granularity=thread it is enough to set the first available osID for this core
-                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+                                // For fine granularity it is enough to set the first available osID for this core
+                                if( fine_gran) {
                                     break;
                                 }
                             }
@@ -4674,7 +4759,7 @@
                 }
             }
 
-        } else { // nthreads > __kmp_ncores
+        } else { // nthreads > ncores
 
             // Array to save the number of processors at each core
             int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
@@ -4754,11 +4839,10 @@
             for( int i = 0; i < nproc; i++ ) {
                 sum += newarr[ i ];
                 if( sum > tid ) {
-                    // Granularity == thread
-                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+                    if( fine_gran) {
                         int osID = procarr[ i ];
                         KMP_CPU_SET( osID, mask);
-                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+                    } else {
                         int coreID = i / nth_per_core;
                         for( int ii = 0; ii < nth_per_core; ii++ ) {
                             int osID = procarr[ coreID * nth_per_core + ii ];