Index: runtime/src/kmp_affinity.cpp
===================================================================
--- runtime/src/kmp_affinity.cpp
+++ runtime/src/kmp_affinity.cpp
@@ -3964,7 +3964,7 @@
 
         case affinity_balanced:
         // Balanced works only for the case of a single package
-        if( nPackages > 1 ) {
+        if( depth <= 1 ) {
             if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
             }
@@ -3984,7 +3984,12 @@
             if( nth_per_core > 1 ) {
                 core_level = depth - 2;
             } else {
-                core_level = depth - 1;
+                if(( nPackages > 1) && ( nCoresPerPkg > 1)) {
+                  core_level = depth - 2;
+                  nth_per_core = nCoresPerPkg;
+                } else {
+                  core_level = depth - 1;
+                }
             }
             int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
             int nproc = nth_per_core * ncores;
@@ -4556,6 +4561,26 @@
 // Dynamic affinity settings - Affinity balanced
 void __kmp_balanced_affinity( int tid, int nthreads )
 {
+    bool fine_gran = true;
+
+    switch (__kmp_affinity_gran) {
+        case affinity_gran_fine:
+        case affinity_gran_thread:
+            break;
+        case affinity_gran_core:
+            if( __kmp_nThreadsPerCore > 1) {
+                fine_gran = false;
+            }
+            break;
+        case affinity_gran_package:
+            if( nCoresPerPkg > 1) {
+                fine_gran = false;
+            }
+            break;
+        default:
+            fine_gran = false;
+    }
+
     if( __kmp_affinity_uniform_topology() ) {
         int coreID;
         int threadID;
@@ -4563,6 +4588,10 @@
         int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
         // Number of cores
         int ncores = __kmp_ncores;
+        if(( nPackages > 1) && ( __kmp_nth_per_core <= 1)) {
+            __kmp_nth_per_core = __kmp_avail_proc / nPackages;
+            ncores = nPackages;
+        }
         // How many threads will be bound to each core
         int chunk = nthreads / ncores;
         // How many cores will have an additional thread bound to it - "big cores"
@@ -4584,11 +4613,10 @@
         KMP_CPU_ALLOC_ON_STACK(mask);
         KMP_CPU_ZERO(mask);
 
-        // Granularity == thread
-        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+        if( fine_gran) {
             int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
             KMP_CPU_SET( osID, mask);
-        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+        } else {
             for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                 int osID;
                 osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
@@ -4615,7 +4643,12 @@
         if( nth_per_core > 1 ) {
             core_level = __kmp_aff_depth - 2;
         } else {
-            core_level = __kmp_aff_depth - 1;
+            if(( nPackages > 1) && ( nCoresPerPkg > 1)) {
+              nth_per_core = nCoresPerPkg;
+              core_level = __kmp_aff_depth - 2;
+            } else {
+              core_level = __kmp_aff_depth - 1;
+            }
         }
 
         // Number of cores - maximum value; it does not count trail cores with 0 processors
@@ -4623,10 +4656,10 @@
 
         // For performance gain consider the special case nthreads == __kmp_avail_proc
         if( nthreads == __kmp_avail_proc ) {
-            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+            if( fine_gran) {
                 int osID = address2os[ tid ].second;
                 KMP_CPU_SET( osID, mask);
-            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+            } else {
                 int coreID = address2os[ tid ].first.labels[ core_level ];
                 // We'll count found osIDs for the current core; they can be not more than nth_per_core;
                 // since the address2os is sortied we can break when cnt==nth_per_core
@@ -4643,7 +4676,7 @@
                     }
                 }
             }
-        } else if( nthreads <= __kmp_ncores ) {
+        } else if( nthreads <= ncores ) {
 
             int core = 0;
             for( int i = 0; i < ncores; i++ ) {
@@ -4661,8 +4694,8 @@
                             int osID = procarr[ i * nth_per_core + j ];
                             if( osID != -1 ) {
                                 KMP_CPU_SET( osID, mask );
-                                // For granularity=thread it is enough to set the first available osID for this core
-                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+                                // For fine granularity it is enough to set the first available osID for this core
+                                if( fine_gran) {
                                     break;
                                 }
                             }
@@ -4674,7 +4707,7 @@
                 }
             }
 
-        } else { // nthreads > __kmp_ncores
+        } else { // nthreads > ncores
 
             // Array to save the number of processors at each core
             int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
@@ -4754,11 +4787,10 @@
             for( int i = 0; i < nproc; i++ ) {
                 sum += newarr[ i ];
                 if( sum > tid ) {
-                    // Granularity == thread
-                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+                    if( fine_gran) {
                         int osID = procarr[ i ];
                         KMP_CPU_SET( osID, mask);
-                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+                    } else {
                         int coreID = i / nth_per_core;
                         for( int ii = 0; ii < nth_per_core; ii++ ) {
                             int osID = procarr[ coreID * nth_per_core + ii ];