diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -124,6 +124,9 @@ ProcGroups "processor groups" Unknown "unknown" NoLeaf31Support "cpuid leaf 31 not supported" +HwlocFailed "Hwloc api failure" +LLCache "LL cache" +LLCaches "LL caches" @@ -355,6 +358,7 @@ "This issue is fixed in an up-to-date compiler." OmpNoAllocator "Allocator %1$s is not available, will use default allocator." TopologyGeneric "%1$s: %2$s (%3$d total cores)" +AffGranularityBad "%1$s: granularity setting: %2$s does not exist in topology. Using granularity=%3$s instead." # --- OpenMP errors detected at runtime --- # @@ -458,6 +462,11 @@ UserDirectedError "%1$s: Encountered user-directed error: %2$s." UserDirectedWarning "%1$s: Encountered user-directed warning: %2$s." FailedToCreateTeam "Failed to create teams between lower bound (%1$d) and upper bound (%2$d)." +AffHWSubsetManyGeneric "KMP_HW_SUBSET ignored: %1$s: too many requested." +AffHWSubsetNotExistGeneric "KMP_HW_SUBSET ignored: %1$s: level not detected in machine topology." +AffHWSubsetEqvLayers "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one." +AffHWSubsetOutOfOrder "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s." +AffEqualTopologyTypes "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\"." # -------------------------------------------------------------------------------------------------- -*- HINTS -*- diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -597,11 +597,11 @@ enum kmp_hw_t : int { KMP_HW_UNKNOWN = -1, - KMP_HW_MACHINE = 0, - KMP_HW_SOCKET, + KMP_HW_SOCKET = 0, KMP_HW_PROC_GROUP, KMP_HW_NUMA, KMP_HW_DIE, + KMP_HW_LLC, KMP_HW_L3, KMP_HW_TILE, KMP_HW_MODULE, @@ -612,13 +612,16 @@ KMP_HW_LAST }; -#define KMP_ASSERT_VALID_HW_TYPE(type) \ +#define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \ KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) +#define KMP_ASSERT_VALID_HW_TYPE(type) \ + KMP_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) #define KMP_FOREACH_HW_TYPE(type) \ for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST; \ type = (kmp_hw_t)((int)type + 1)) +const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false); const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false); /* Only Linux* OS and Windows* OS support thread affinity. */ @@ -655,8 +658,6 @@ #if KMP_USE_HWLOC extern hwloc_topology_t __kmp_hwloc_topology; extern int __kmp_hwloc_error; -extern int __kmp_numa_detected; -extern int __kmp_tile_depth; #endif extern size_t __kmp_affin_mask_size; @@ -784,23 +785,6 @@ affinity_default }; -enum affinity_gran { - affinity_gran_fine = 0, - affinity_gran_thread, - affinity_gran_core, - affinity_gran_tile, - affinity_gran_die, - affinity_gran_numa, - affinity_gran_package, - affinity_gran_node, -#if KMP_GROUP_AFFINITY - // The "group" granularity isn't necesssarily coarser than all of the - // other levels, but we put it last in the enum. 
- affinity_gran_group, -#endif /* KMP_GROUP_AFFINITY */ - affinity_gran_default -}; - enum affinity_top_method { affinity_top_method_all = 0, // try all (supported) methods, in order #if KMP_ARCH_X86 || KMP_ARCH_X86_64 @@ -822,7 +806,7 @@ #define affinity_respect_mask_default (-1) extern enum affinity_type __kmp_affinity_type; /* Affinity type */ -extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */ +extern kmp_hw_t __kmp_affinity_gran; /* Affinity granularity */ extern int __kmp_affinity_gran_levels; /* corresponding int value */ extern int __kmp_affinity_dups; /* Affinity duplicate masks */ extern enum affinity_top_method __kmp_affinity_top_method; diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -598,91 +598,274 @@ #endif /* KMP_OS_WINDOWS */ #endif /* KMP_AFFINITY_SUPPORTED */ -class Address { +class kmp_hw_thread_t { public: - static const unsigned maxDepth = 32; - unsigned labels[maxDepth]; - unsigned childNums[maxDepth]; - unsigned depth; - unsigned leader; - Address(unsigned _depth) : depth(_depth), leader(FALSE) {} - Address &operator=(const Address &b) { - depth = b.depth; - for (unsigned i = 0; i < depth; i++) { - labels[i] = b.labels[i]; - childNums[i] = b.childNums[i]; - } - leader = FALSE; - return *this; - } - bool operator==(const Address &b) const { - if (depth != b.depth) - return false; - for (unsigned i = 0; i < depth; i++) - if (labels[i] != b.labels[i]) - return false; - return true; - } - bool isClose(const Address &b, int level) const { - if (depth != b.depth) - return false; - if ((unsigned)level >= depth) - return true; - for (unsigned i = 0; i < (depth - level); i++) - if (labels[i] != b.labels[i]) - return false; - return true; - } - bool operator!=(const Address &b) const { return !operator==(b); } - void print() const { - unsigned i; - printf("Depth: %u --- ", depth); - for (i = 0; i < depth; i++) { - printf("%u ", labels[i]); - } + static const int UNKNOWN_ID = -1; + static int compare_ids(const void *a, const void *b); + static int compare_compact(const void *a, const void *b); + int ids[KMP_HW_LAST]; + int sub_ids[KMP_HW_LAST]; + bool leader; + int os_id; + void print() const; + void clear() { + for (int i = 0; i < (int)KMP_HW_LAST; ++i) + ids[i] = UNKNOWN_ID; + leader = false; } }; -class AddrUnsPair { +class kmp_topology_t { + + struct flags_t { + int uniform : 1; + int reserved : 31; + }; + + int depth; + + // The following arrays are all 'depth' long + + // Orderd array of the types in the topology + kmp_hw_t *types; + + // Keep quick topology ratios, for non-uniform topologies, + // this ratio holds the max number of itemAs per itemB + // e.g., [ 4 packages | 6 cores / package | 2 threads / core ] + int *ratio; + + // Storage containing the absolute number of each topology layer + int *count; + + // The hardware threads array + // hw_threads is num_hw_threads long + // Each hw_thread's ids and sub_ids are depth deep + int num_hw_threads; + kmp_hw_thread_t *hw_threads; + + // Equivalence hash where the key is the hardware topology item + // and the value is the equivalent hardware topology type in the + // types[] array, if the value is KMP_HW_UNKNOWN, then there is no + // known equivalence for the topology type + kmp_hw_t equivalent[KMP_HW_LAST]; + + // Flags describing the topology + flags_t flags; + + // Count each item & get the num x's per y + // e.g., get the number of cores and the number of threads per core + 
// for each (x, y) in (KMP_HW_* , KMP_HW_*) + void _gather_enumeration_information(); + + // Remove layers that don't add information to the topology. + // This is done by having the layer take on the id = UNKNOWN_ID (-1) + void _remove_radix1_layers(); + + // Find out if the topology is uniform + void _discover_uniformity(); + + // Set all the sub_ids for each hardware thread + void _set_sub_ids(); + + // Set global affinity variables describing the number of threads per + // core, the number of packages, the number of cores per package, and + // the number of cores. + void _set_globals(); + + // Set the last level cache equivalent type + void _set_last_level_cache(); + public: - Address first; - unsigned second; - AddrUnsPair(Address _first, unsigned _second) - : first(_first), second(_second) {} - AddrUnsPair &operator=(const AddrUnsPair &b) { - first = b.first; - second = b.second; - return *this; - } - void print() const { - printf("first = "); - first.print(); - printf(" --- second = %u", second); - } - bool operator==(const AddrUnsPair &b) const { - if (first != b.first) - return false; - if (second != b.second) - return false; - return true; - } - bool operator!=(const AddrUnsPair &b) const { return !operator==(b); } -}; + // Force use of allocate()/deallocate() + kmp_topology_t() = delete; + kmp_topology_t(const kmp_topology_t &t) = delete; + kmp_topology_t(kmp_topology_t &&t) = delete; + kmp_topology_t &operator=(const kmp_topology_t &t) = delete; + kmp_topology_t &operator=(kmp_topology_t &&t) = delete; + + static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types); + static void deallocate(kmp_topology_t *); + + // Functions used in create_map() routines + kmp_hw_thread_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + const kmp_hw_thread_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + int get_num_hw_threads() const { return num_hw_threads; } + void sort_ids() { + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_ids); + } + // Check if the hardware ids are unique, if they are + // return true, otherwise return false + bool check_ids() const; -static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) { - const Address *aa = &(((const AddrUnsPair *)a)->first); - const Address *bb = &(((const AddrUnsPair *)b)->first); - unsigned depth = aa->depth; - unsigned i; - KMP_DEBUG_ASSERT(depth == bb->depth); - for (i = 0; i < depth; i++) { - if (aa->labels[i] < bb->labels[i]) + // Function to call after the create_map() routine + void canonicalize(); + void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores); + + // Functions used after canonicalize() called + bool filter_hw_subset(); + bool is_close(int hwt1, int hwt2, int level) const; + bool is_uniform() const { return flags.uniform; } + // Tell whether a type is a valid type in the topology + // returns KMP_HW_UNKNOWN when there is no equivalent type + kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; } + // Set type1 = type2 + void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1); + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2); + kmp_hw_t real_type2 = equivalent[type2]; + if (real_type2 == KMP_HW_UNKNOWN) + real_type2 = type2; + equivalent[type1] = real_type2; + // This loop is required since any of the types may have been set to + // be equivalent to 
type1. They all must be checked and reset to type2. + KMP_FOREACH_HW_TYPE(type) { + if (equivalent[type] == type1) { + equivalent[type] = real_type2; + } + } + } + // Calculate number of types corresponding to level1 + // per types corresponding to level2 (e.g., number of threads per core) + int calculate_ratio(int level1, int level2) const { + KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth); + KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth); + int r = 1; + for (int level = level1; level > level2; --level) + r *= ratio[level]; + return r; + } + int get_ratio(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return ratio[level]; + } + int get_depth() const { return depth; }; + kmp_hw_t get_type(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return types[level]; + } + int get_level(kmp_hw_t type) const { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type); + int eq_type = equivalent[type]; + if (eq_type == KMP_HW_UNKNOWN) return -1; - if (aa->labels[i] > bb->labels[i]) - return 1; + for (int i = 0; i < depth; ++i) + if (types[i] == eq_type) + return i; + return -1; + } + int get_count(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return count[level]; + } +#if KMP_AFFINITY_SUPPORTED + void sort_compact() { + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_compact); + } +#endif + void print(const char *env_var = "KMP_AFFINITY") const; + void dump() const; +}; + +class kmp_hw_subset_t { +public: + struct item_t { + int num; + kmp_hw_t type; + int offset; + }; + +private: + int depth; + int capacity; + item_t *items; + kmp_uint64 set; + bool absolute; + // The set must be able to handle up to KMP_HW_LAST number of layers + KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); + +public: + // Force use of allocate()/deallocate() + kmp_hw_subset_t() = delete; + kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete; + kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete; + + static kmp_hw_subset_t *allocate() { + int initial_capacity = 5; + kmp_hw_subset_t *retval = + (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t)); + retval->depth = 0; + retval->capacity = initial_capacity; + retval->set = 0ull; + retval->absolute = false; + retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity); + return retval; + } + static void deallocate(kmp_hw_subset_t *subset) { + __kmp_free(subset->items); + __kmp_free(subset); + } + void set_absolute() { absolute = true; } + bool is_absolute() const { return absolute; } + void push_back(int num, kmp_hw_t type, int offset) { + if (depth == capacity - 1) { + capacity *= 2; + item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); + for (int i = 0; i < depth; ++i) + new_items[i] = items[i]; + __kmp_free(items); + items = new_items; + } + items[depth].num = num; + items[depth].type = type; + items[depth].offset = offset; + depth++; + set |= (1ull << type); + } + int get_depth() const { return depth; } + const item_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; + } + item_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; } - return 0; -} + void remove(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + set &= ~(1ull << items[index].type); + for (int j = index + 1; j < depth; ++j) { + items[j - 1] = items[j]; + } + depth--; + } + 
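A minimal usage sketch of this new container, assuming a setting such as KMP_HW_SUBSET=2s,4c,2t; the string parsing itself lives elsewhere in the runtime's settings code and is not shown in this patch, so the literal values below are only illustrative of how the parser is expected to drive this interface.

  kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
  subset->push_back(/*num=*/2, KMP_HW_SOCKET, /*offset=*/0); // "2s"
  subset->push_back(/*num=*/4, KMP_HW_CORE,   /*offset=*/0); // "4c"
  subset->push_back(/*num=*/2, KMP_HW_THREAD, /*offset=*/0); // "2t"
  KMP_DEBUG_ASSERT(subset->specified(KMP_HW_CORE)); // layer recorded in the 'set' bitmask
  subset->dump();                                   // debug print of depth, items, set, absolute
  kmp_hw_subset_t::deallocate(subset);

push_back() grows the items array by doubling 'capacity' as entries are added, so callers never need to size the container up front.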
bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } + void dump() const { + printf("**********************\n"); + printf("*** kmp_hw_subset: ***\n"); + printf("* depth: %d\n", depth); + printf("* items:\n"); + for (int i = 0; i < depth; ++i) { + printf("num: %d, type: %s, offset: %d\n", items[i].num, + __kmp_hw_get_keyword(items[i].type), items[i].offset); + } + printf("* set: 0x%llx\n", set); + printf("* absolute: %d\n", absolute); + printf("**********************\n"); + } +}; + +extern kmp_topology_t *__kmp_topology; +extern kmp_hw_subset_t *__kmp_hw_subset; /* A structure for holding machine-specific hierarchy info to be computed once at init. This structure represents a mapping of threads to the actual machine @@ -721,18 +904,10 @@ kmp_uint32 *numPerLevel; kmp_uint32 *skipPerLevel; - void deriveLevels(AddrUnsPair *adr2os, int num_addrs) { - int hier_depth = adr2os[0].first.depth; - int level = 0; - for (int i = hier_depth - 1; i >= 0; --i) { - int max = -1; - for (int j = 0; j < num_addrs; ++j) { - int next = adr2os[j].first.childNums[i]; - if (next > max) - max = next; - } - numPerLevel[level] = max + 1; - ++level; + void deriveLevels() { + int hier_depth = __kmp_topology->get_depth(); + for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) { + numPerLevel[level] = __kmp_topology->get_ratio(i); } } @@ -747,7 +922,7 @@ } } - void init(AddrUnsPair *adr2os, int num_addrs) { + void init(int num_addrs) { kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8( &uninitialized, not_initialized, initializing); if (bool_result == 0) { // Wait for initialization @@ -774,10 +949,8 @@ } // Sort table by physical ID - if (adr2os) { - qsort(adr2os, num_addrs, sizeof(*adr2os), - __kmp_affinity_cmp_Address_labels); - deriveLevels(adr2os, num_addrs); + if (__kmp_topology && __kmp_topology->get_depth() > 0) { + deriveLevels(); } else { numPerLevel[0] = maxLeaves; numPerLevel[1] = num_addrs / maxLeaves; diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -21,9 +21,17 @@ #endif #if KMP_USE_HWLOC // Copied from hwloc +#define HWLOC_GROUP_KIND_INTEL_MODULE 102 +#define HWLOC_GROUP_KIND_INTEL_TILE 103 #define HWLOC_GROUP_KIND_INTEL_DIE 104 +#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 #endif +// The machine topology +kmp_topology_t *__kmp_topology = nullptr; +// KMP_HW_SUBSET environment variable +kmp_hw_subset_t *__kmp_hw_subset = nullptr; + // Store the real or imagined machine hierarchy here static hierarchy_info machine_hierarchy; @@ -34,7 +42,7 @@ // The test below is true if affinity is available, but set to "none". Need to // init on first use of hierarchical barrier. if (TCR_1(machine_hierarchy.uninitialized)) - machine_hierarchy.init(NULL, nproc); + machine_hierarchy.init(nproc); // Adjust the hierarchy in case num threads exceeds original if (nproc > machine_hierarchy.base_num_threads) @@ -49,7 +57,11 @@ thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; } -#if KMP_AFFINITY_SUPPORTED +static int nCoresPerPkg, nPackages; +static int __kmp_nThreadsPerCore; +#ifndef KMP_DFLT_NTH_CORES +static int __kmp_ncores; +#endif const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { switch (type) { @@ -69,6 +81,8 @@ return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache)); case KMP_HW_L1: return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache)); + case KMP_HW_LLC: + return ((plural) ? 
KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache)); case KMP_HW_CORE: return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core)); case KMP_HW_THREAD: @@ -79,13 +93,96 @@ return KMP_I18N_STR(Unknown); } -// This function removes the topology levels that are radix 1 and don't offer -// further information about the topology. The most common example is when you -// have one thread context per core, we don't want the extra thread context -// level if it offers no unique labels. So they are removed. -// return value: the new depth of address2os -static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh, - int depth, kmp_hw_t *types) { +const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { + switch (type) { + case KMP_HW_SOCKET: + return ((plural) ? "sockets" : "socket"); + case KMP_HW_DIE: + return ((plural) ? "dice" : "die"); + case KMP_HW_MODULE: + return ((plural) ? "modules" : "module"); + case KMP_HW_TILE: + return ((plural) ? "tiles" : "tile"); + case KMP_HW_NUMA: + return ((plural) ? "numa_domains" : "numa_domain"); + case KMP_HW_L3: + return ((plural) ? "l3_caches" : "l3_cache"); + case KMP_HW_L2: + return ((plural) ? "l2_caches" : "l2_cache"); + case KMP_HW_L1: + return ((plural) ? "l1_caches" : "l1_cache"); + case KMP_HW_LLC: + return ((plural) ? "ll_caches" : "ll_cache"); + case KMP_HW_CORE: + return ((plural) ? "cores" : "core"); + case KMP_HW_THREAD: + return ((plural) ? "threads" : "thread"); + case KMP_HW_PROC_GROUP: + return ((plural) ? "proc_groups" : "proc_group"); + } + return ((plural) ? "unknowns" : "unknown"); +} + +//////////////////////////////////////////////////////////////////////////////// +// kmp_hw_thread_t methods +int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { + const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a; + const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b; + int depth = __kmp_topology->get_depth(); + for (int level = 0; level < depth; ++level) { + if (ahwthread->ids[level] < bhwthread->ids[level]) + return -1; + else if (ahwthread->ids[level] > bhwthread->ids[level]) + return 1; + } + if (ahwthread->os_id < bhwthread->os_id) + return -1; + else if (ahwthread->os_id > bhwthread->os_id) + return 1; + return 0; +} + +#if KMP_AFFINITY_SUPPORTED +int kmp_hw_thread_t::compare_compact(const void *a, const void *b) { + int i; + const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a; + const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b; + int depth = __kmp_topology->get_depth(); + KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); + KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth); + for (i = 0; i < __kmp_affinity_compact; i++) { + int j = depth - i - 1; + if (aa->sub_ids[j] < bb->sub_ids[j]) + return -1; + if (aa->sub_ids[j] > bb->sub_ids[j]) + return 1; + } + for (; i < depth; i++) { + int j = i - __kmp_affinity_compact; + if (aa->sub_ids[j] < bb->sub_ids[j]) + return -1; + if (aa->sub_ids[j] > bb->sub_ids[j]) + return 1; + } + return 0; +} +#endif + +void kmp_hw_thread_t::print() const { + int depth = __kmp_topology->get_depth(); + printf("%4d ", os_id); + for (int i = 0; i < depth; ++i) { + printf("%4d ", ids[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// kmp_topology_t methods + +// Remove layers that don't add information to the topology. 
+// This is done by having the layer take on the id = UNKNOWN_ID (-1) +void kmp_topology_t::_remove_radix1_layers() { int preference[KMP_HW_LAST]; int top_index1, top_index2; // Set up preference associative array @@ -93,40 +190,46 @@ preference[KMP_HW_SOCKET] = 100; preference[KMP_HW_CORE] = 95; preference[KMP_HW_THREAD] = 90; - preference[KMP_HW_DIE] = 85; - preference[KMP_HW_NUMA] = 80; + preference[KMP_HW_NUMA] = 85; + preference[KMP_HW_DIE] = 80; preference[KMP_HW_TILE] = 75; preference[KMP_HW_MODULE] = 73; preference[KMP_HW_L3] = 70; preference[KMP_HW_L2] = 65; preference[KMP_HW_L1] = 60; + preference[KMP_HW_LLC] = 5; top_index1 = 0; top_index2 = 1; while (top_index1 < depth - 1 && top_index2 < depth) { - KMP_DEBUG_ASSERT(top_index1 >= 0 && top_index1 < depth); - KMP_DEBUG_ASSERT(top_index2 >= 0 && top_index2 < depth); kmp_hw_t type1 = types[top_index1]; kmp_hw_t type2 = types[top_index2]; - if (type1 == KMP_HW_SOCKET && type2 == KMP_HW_CORE) { + KMP_ASSERT_VALID_HW_TYPE(type1); + KMP_ASSERT_VALID_HW_TYPE(type2); + // Do not allow the three main topology levels (sockets, cores, threads) to + // be compacted down + if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE || + type1 == KMP_HW_SOCKET) && + (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE || + type2 == KMP_HW_SOCKET)) { top_index1 = top_index2++; continue; } bool radix1 = true; bool all_same = true; - unsigned id1 = addrP[0].first.labels[top_index1]; - unsigned id2 = addrP[0].first.labels[top_index2]; + int id1 = hw_threads[0].ids[top_index1]; + int id2 = hw_threads[0].ids[top_index2]; int pref1 = preference[type1]; int pref2 = preference[type2]; - for (int hwidx = 1; hwidx < nTh; ++hwidx) { - if (addrP[hwidx].first.labels[top_index1] == id1 && - addrP[hwidx].first.labels[top_index2] != id2) { + for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) { + if (hw_threads[hwidx].ids[top_index1] == id1 && + hw_threads[hwidx].ids[top_index2] != id2) { radix1 = false; break; } - if (addrP[hwidx].first.labels[top_index2] != id2) + if (hw_threads[hwidx].ids[top_index2] != id2) all_same = false; - id1 = addrP[hwidx].first.labels[top_index1]; - id2 = addrP[hwidx].first.labels[top_index2]; + id1 = hw_threads[hwidx].ids[top_index1]; + id2 = hw_threads[hwidx].ids[top_index2]; } if (radix1) { // Select the layer to remove based on preference @@ -147,11 +250,11 @@ remove_layer_ids = top_index2; // Remove radix one type by setting the equivalence, removing the id from // the hw threads and removing the layer from types and depth - for (int idx = 0; idx < nTh; ++idx) { - Address &hw_thread = addrP[idx].first; + set_equivalent_type(remove_type, keep_type); + for (int idx = 0; idx < num_hw_threads; ++idx) { + kmp_hw_thread_t &hw_thread = hw_threads[idx]; for (int d = remove_layer_ids; d < depth - 1; ++d) - hw_thread.labels[d] = hw_thread.labels[d + 1]; - hw_thread.depth--; + hw_thread.ids[d] = hw_thread.ids[d + 1]; } for (int idx = remove_layer; idx < depth - 1; ++idx) types[idx] = types[idx + 1]; @@ -161,29 +264,51 @@ } } KMP_ASSERT(depth > 0); - return depth; } + +void kmp_topology_t::_set_last_level_cache() { + if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L3); + else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); +#if KMP_MIC_SUPPORTED + else if (__kmp_mic_type == mic3) { + if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); + else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN) + 
set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE); + // L2/Tile wasn't detected so just say L1 + else + set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); + } +#endif + else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); + // Fallback is to set last level cache to socket or core + if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) { + if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET); + else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE); + } + KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN); +} + // Gather the count of each topology layer and the ratio -// ratio contains the number of types[i] / types[i+1] and so forth -// count contains the absolute number of types[i] -static void __kmp_affinity_gather_enumeration_information(AddrUnsPair *addrP, - int nTh, int depth, - kmp_hw_t *types, - int *ratio, - int *count) { +void kmp_topology_t::_gather_enumeration_information() { int previous_id[KMP_HW_LAST]; int max[KMP_HW_LAST]; for (int i = 0; i < depth; ++i) { - previous_id[i] = -1; + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; max[i] = 0; count[i] = 0; ratio[i] = 0; } - for (int i = 0; i < nTh; ++i) { - Address &hw_thread = addrP[i].first; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; for (int layer = 0; layer < depth; ++layer) { - int id = hw_thread.labels[layer]; + int id = hw_thread.ids[layer]; if (id != previous_id[layer]) { // Add an additional increment to each count for (int l = layer; l < depth; ++l) @@ -199,7 +324,7 @@ } } for (int layer = 0; layer < depth; ++layer) { - previous_id[layer] = hw_thread.labels[layer]; + previous_id[layer] = hw_thread.ids[layer]; } } for (int layer = 0; layer < depth; ++layer) { @@ -209,75 +334,484 @@ } // Find out if the topology is uniform -static bool __kmp_affinity_discover_uniformity(int depth, int *ratio, - int *count) { +void kmp_topology_t::_discover_uniformity() { int num = 1; for (int level = 0; level < depth; ++level) num *= ratio[level]; - return (num == count[depth - 1]); + flags.uniform = (num == count[depth - 1]); +} + +// Set all the sub_ids for each hardware thread +void kmp_topology_t::_set_sub_ids() { + int previous_id[KMP_HW_LAST]; + int sub_id[KMP_HW_LAST]; + + for (int i = 0; i < depth; ++i) { + previous_id[i] = -1; + sub_id[i] = -1; + } + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Setup the sub_id + for (int j = 0; j < depth; ++j) { + if (hw_thread.ids[j] != previous_id[j]) { + sub_id[j]++; + for (int k = j + 1; k < depth; ++k) { + sub_id[k] = 0; + } + break; + } + } + // Set previous_id + for (int j = 0; j < depth; ++j) { + previous_id[j] = hw_thread.ids[j]; + } + // Set the sub_ids field + for (int j = 0; j < depth; ++j) { + hw_thread.sub_ids[j] = sub_id[j]; + } + } +} + +void kmp_topology_t::_set_globals() { + // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores + int core_level, thread_level, package_level; + package_level = get_level(KMP_HW_SOCKET); +#if KMP_GROUP_AFFINITY + if (package_level == -1) + package_level = get_level(KMP_HW_PROC_GROUP); +#endif + core_level = get_level(KMP_HW_CORE); + thread_level = get_level(KMP_HW_THREAD); + + KMP_ASSERT(core_level != -1); + KMP_ASSERT(thread_level != -1); + + __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level); + if (package_level != -1) { + nCoresPerPkg = calculate_ratio(core_level, 
package_level); + nPackages = get_count(package_level); + } else { + // assume one socket + nCoresPerPkg = get_count(core_level); + nPackages = 1; + } +#ifndef KMP_DFLT_NTH_CORES + __kmp_ncores = get_count(core_level); +#endif } -// calculate the number of X's per Y -static inline int __kmp_affinity_calculate_ratio(int *ratio, int deep_level, - int shallow_level) { - int retval = 1; - if (deep_level < 0 || shallow_level < 0) - return retval; - for (int level = deep_level; level > shallow_level; --level) - retval *= ratio[level]; +kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, + const kmp_hw_t *types) { + kmp_topology_t *retval; + // Allocate all data in one large allocation + size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + + sizeof(int) * ndepth * 3; + char *bytes = (char *)__kmp_allocate(size); + retval = (kmp_topology_t *)bytes; + if (nproc > 0) { + retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t)); + } else { + retval->hw_threads = nullptr; + } + retval->num_hw_threads = nproc; + retval->depth = ndepth; + int *arr = + (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc); + retval->types = (kmp_hw_t *)arr; + retval->ratio = arr + ndepth; + retval->count = arr + 2 * ndepth; + KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } + for (int i = 0; i < ndepth; ++i) { + retval->types[i] = types[i]; + retval->equivalent[types[i]] = types[i]; + } return retval; } -static void __kmp_affinity_print_topology(AddrUnsPair *addrP, int len, - int depth, kmp_hw_t *types) { - int proc; +void kmp_topology_t::deallocate(kmp_topology_t *topology) { + if (topology) + __kmp_free(topology); +} + +bool kmp_topology_t::check_ids() const { + // Assume ids have been sorted + if (num_hw_threads == 0) + return true; + for (int i = 1; i < num_hw_threads; ++i) { + kmp_hw_thread_t ¤t_thread = hw_threads[i]; + kmp_hw_thread_t &previous_thread = hw_threads[i - 1]; + bool unique = false; + for (int j = 0; j < depth; ++j) { + if (previous_thread.ids[j] != current_thread.ids[j]) { + unique = true; + break; + } + } + if (unique) + continue; + return false; + } + return true; +} + +void kmp_topology_t::dump() const { + printf("***********************\n"); + printf("*** __kmp_topology: ***\n"); + printf("***********************\n"); + printf("* depth: %d\n", depth); + + printf("* types: "); + for (int i = 0; i < depth; ++i) + printf("%15s ", __kmp_hw_get_keyword(types[i])); + printf("\n"); + + printf("* ratio: "); + for (int i = 0; i < depth; ++i) { + printf("%15d ", ratio[i]); + } + printf("\n"); + + printf("* count: "); + for (int i = 0; i < depth; ++i) { + printf("%15d ", count[i]); + } + printf("\n"); + + printf("* equivalent map:\n"); + KMP_FOREACH_HW_TYPE(i) { + const char *key = __kmp_hw_get_keyword(i); + const char *value = __kmp_hw_get_keyword(equivalent[i]); + printf("%-15s -> %-15s\n", key, value); + } + + printf("* uniform: %s\n", (is_uniform() ? 
"Yes" : "No")); + + printf("* num_hw_threads: %d\n", num_hw_threads); + printf("* hw_threads:\n"); + for (int i = 0; i < num_hw_threads; ++i) { + hw_threads[i].print(); + } + printf("***********************\n"); +} + +void kmp_topology_t::print(const char *env_var) const { kmp_str_buf_t buf; + int print_types_depth; __kmp_str_buf_init(&buf); - KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); - for (proc = 0; proc < len; proc++) { - for (int i = 0; i < depth; ++i) { - __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]), - addrP[proc].first.labels[i]); + kmp_hw_t print_types[KMP_HW_LAST + 2]; + + // Num Available Threads + KMP_INFORM(AvailableOSProc, env_var, num_hw_threads); + + // Uniform or not + if (is_uniform()) { + KMP_INFORM(Uniform, env_var); + } else { + KMP_INFORM(NonUniform, env_var); + } + + // Equivalent types + KMP_FOREACH_HW_TYPE(type) { + kmp_hw_t eq_type = equivalent[type]; + if (eq_type != KMP_HW_UNKNOWN && eq_type != type) { + KMP_INFORM(AffEqualTopologyTypes, env_var, + __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(eq_type)); + } + } + + // Quick topology + KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST); + // Create a print types array that always guarantees printing + // the core and thread level + print_types_depth = 0; + for (int level = 0; level < depth; ++level) + print_types[print_types_depth++] = types[level]; + if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) { + // Force in the core level for quick topology + if (print_types[print_types_depth - 1] == KMP_HW_THREAD) { + // Force core before thread e.g., 1 socket X 2 threads/socket + // becomes 1 socket X 1 core/socket X 2 threads/socket + print_types[print_types_depth - 1] = KMP_HW_CORE; + print_types[print_types_depth++] = KMP_HW_THREAD; + } else { + print_types[print_types_depth++] = KMP_HW_CORE; + } + } + // Always put threads at very end of quick topology + if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD) + print_types[print_types_depth++] = KMP_HW_THREAD; + + __kmp_str_buf_clear(&buf); + kmp_hw_t numerator_type; + kmp_hw_t denominator_type = KMP_HW_UNKNOWN; + int core_level = get_level(KMP_HW_CORE); + int ncores = get_count(core_level); + + for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) { + int c; + bool plural; + numerator_type = print_types[plevel]; + KMP_ASSERT_VALID_HW_TYPE(numerator_type); + if (equivalent[numerator_type] != numerator_type) + c = 1; + else + c = get_ratio(level++); + plural = (c > 1); + if (plevel == 0) { + __kmp_str_buf_print(&buf, "%d %s", c, + __kmp_hw_get_catalog_string(numerator_type, plural)); + } else { + __kmp_str_buf_print(&buf, " x %d %s/%s", c, + __kmp_hw_get_catalog_string(numerator_type, plural), + __kmp_hw_get_catalog_string(denominator_type)); } - KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str); + denominator_type = numerator_type; + } + KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + + if (num_hw_threads <= 0) { + __kmp_str_buf_free(&buf); + return; + } + + // Full OS proc to hardware thread map + KMP_INFORM(OSProcToPhysicalThreadMap, env_var); + for (int i = 0; i < num_hw_threads; i++) { __kmp_str_buf_clear(&buf); + for (int level = 0; level < depth; ++level) { + kmp_hw_t type = types[level]; + __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); + __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); + } + KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); } + __kmp_str_buf_free(&buf); } -// Print out the detailed machine topology 
map, i.e. the physical locations -// of each OS proc. -static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, - int depth, int pkgLevel, - int coreLevel, int threadLevel) { - int proc; - - KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); - for (proc = 0; proc < len; proc++) { - int level; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - for (level = 0; level < depth; level++) { - if (level == threadLevel) { - __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread)); - } else if (level == coreLevel) { - __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core)); - } else if (level == pkgLevel) { - __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package)); - } else if (level > pkgLevel) { - __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node), - level - pkgLevel - 1); - } else { - __kmp_str_buf_print(&buf, "L%d ", level); +void kmp_topology_t::canonicalize() { + _remove_radix1_layers(); + _gather_enumeration_information(); + _discover_uniformity(); + _set_sub_ids(); + _set_globals(); + _set_last_level_cache(); + + // Perform post canonicalization checking + KMP_ASSERT(depth > 0); + for (int level = 0; level < depth; ++level) { + // All counts, ratios, and types must be valid + KMP_ASSERT(count[level] > 0 && ratio[level] > 0); + KMP_ASSERT_VALID_HW_TYPE(types[level]); + // Detected types must point to themselves + KMP_ASSERT(equivalent[types[level]] == types[level]); + } + +#if KMP_AFFINITY_SUPPORTED + // Set the number of affinity granularity levels + if (__kmp_affinity_gran_levels < 0) { + kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran); + // Check if user's granularity request is valid + if (gran_type == KMP_HW_UNKNOWN) { + // First try core, then thread, then package + kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET}; + for (auto g : gran_types) { + if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) { + gran_type = g; + break; + } } - __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]); + KMP_ASSERT(gran_type != KMP_HW_UNKNOWN); + // Warn user what granularity setting will be used instead + KMP_WARNING(AffGranularityBad, "KMP_AFFINITY", + __kmp_hw_get_catalog_string(__kmp_affinity_gran), + __kmp_hw_get_catalog_string(gran_type)); + __kmp_affinity_gran = gran_type; } - KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second, - buf.str); - __kmp_str_buf_free(&buf); + __kmp_affinity_gran_levels = 0; + for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) + __kmp_affinity_gran_levels++; + } +#endif // KMP_AFFINITY_SUPPORTED +} + +// Canonicalize an explicit packages X cores/pkg X threads/core topology +void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, + int nthreads_per_core, int ncores) { + int ndepth = 3; + depth = ndepth; + KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; } + for (int level = 0; level < depth; ++level) { + count[level] = 0; + ratio[level] = 0; + } + count[0] = npackages; + count[1] = ncores; + count[2] = __kmp_xproc; + ratio[0] = npackages; + ratio[1] = ncores_per_pkg; + ratio[2] = nthreads_per_core; + equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET; + equivalent[KMP_HW_CORE] = KMP_HW_CORE; + equivalent[KMP_HW_THREAD] = KMP_HW_THREAD; + types[0] = KMP_HW_SOCKET; + types[1] = KMP_HW_CORE; + types[2] = KMP_HW_THREAD; + //__kmp_avail_proc = __kmp_xproc; + _discover_uniformity(); +} + +// Apply the KMP_HW_SUBSET envirable to the topology +// Returns true if KMP_HW_SUBSET filtered any processors +// otherwise, returns false +bool 
kmp_topology_t::filter_hw_subset() { + // If KMP_HW_SUBSET wasn't requested, then do nothing. + if (!__kmp_hw_subset) + return false; + + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + int hw_subset_depth = __kmp_hw_subset->get_depth(); + kmp_hw_t specified[KMP_HW_LAST]; + KMP_ASSERT(hw_subset_depth > 0); + KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + for (int i = 0; i < hw_subset_depth; ++i) { + int max_count; + int num = __kmp_hw_subset->at(i).num; + int offset = __kmp_hw_subset->at(i).offset; + kmp_hw_t type = __kmp_hw_subset->at(i).type; + kmp_hw_t equivalent_type = equivalent[type]; + int level = get_level(type); + + // Check to see if current layer is in detected machine topology + if (equivalent_type != KMP_HW_UNKNOWN) { + __kmp_hw_subset->at(i).type = equivalent_type; + } else { + KMP_WARNING(AffHWSubsetNotExistGeneric, + __kmp_hw_get_catalog_string(type)); + return false; + } + + // Check to see if current layer has already been specified + // either directly or through an equivalent type + if (specified[equivalent_type] != KMP_HW_UNKNOWN) { + KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(specified[equivalent_type])); + return false; + } + specified[equivalent_type] = type; + + // Check to see if layers are in order + if (i + 1 < hw_subset_depth) { + kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type); + if (next_type == KMP_HW_UNKNOWN) { + KMP_WARNING( + AffHWSubsetNotExistGeneric, + __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type)); + return false; + } + int next_topology_level = get_level(next_type); + if (level > next_topology_level) { + KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(next_type)); + return false; + } + } + + // Check to see if each layer's num & offset parameters are valid + max_count = get_ratio(level); + if (max_count < 0 || num + offset > max_count) { + bool plural = (num > 1); + KMP_WARNING(AffHWSubsetManyGeneric, + __kmp_hw_get_catalog_string(type, plural)); + return false; + } + } + + // Apply the filtered hardware subset + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Check to see if this hardware thread should be filtered + bool should_be_filtered = false; + for (int level = 0, hw_subset_index = 0; + level < depth && hw_subset_index < hw_subset_depth; ++level) { + kmp_hw_t topology_type = types[level]; + auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + kmp_hw_t hw_subset_type = hw_subset_item.type; + if (topology_type != hw_subset_type) + continue; + int num = hw_subset_item.num; + int offset = hw_subset_item.offset; + hw_subset_index++; + if (hw_thread.sub_ids[level] < offset || + hw_thread.sub_ids[level] >= offset + num) { + should_be_filtered = true; + break; + } + } + if (!should_be_filtered) { + if (i != new_index) + hw_threads[new_index] = hw_thread; + new_index++; + } else { +#if KMP_AFFINITY_SUPPORTED + KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); +#endif + __kmp_avail_proc--; + } + } + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); + num_hw_threads = new_index; + + // Post hardware subset canonicalization + _gather_enumeration_information(); + _discover_uniformity(); + _set_globals(); + _set_last_level_cache(); + return true; +} + +bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const { + if (hw_level >= depth) + return true; + bool retval = true; + const 
kmp_hw_thread_t &t1 = hw_threads[hwt1]; + const kmp_hw_thread_t &t2 = hw_threads[hwt2]; + for (int i = 0; i < (depth - hw_level); ++i) { + if (t1.ids[i] != t2.ids[i]) + return false; } + return retval; } +//////////////////////////////////////////////////////////////////////////////// + +#if KMP_AFFINITY_SUPPORTED +class kmp_affinity_raii_t { + kmp_affin_mask_t *mask; + bool restored; + +public: + kmp_affinity_raii_t() : restored(false) { + KMP_CPU_ALLOC(mask); + KMP_ASSERT(mask != NULL); + __kmp_get_system_affinity(mask, TRUE); + } + void restore() { + __kmp_set_system_affinity(mask, TRUE); + KMP_CPU_FREE(mask); + restored = true; + } + ~kmp_affinity_raii_t() { + if (!restored) { + __kmp_set_system_affinity(mask, TRUE); + KMP_CPU_FREE(mask); + } + } +}; + bool KMPAffinity::picked_api = false; void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } @@ -469,85 +1003,12 @@ } } -// When sorting by labels, __kmp_affinity_assign_child_nums() must first be -// called to renumber the labels from [0..n] and place them into the child_num -// vector of the address object. This is done in case the labels used for -// the children at one node of the hierarchy differ from those used for -// another node at the same level. Example: suppose the machine has 2 nodes -// with 2 packages each. The first node contains packages 601 and 602, and -// second node contains packages 603 and 604. If we try to sort the table -// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 -// because we are paying attention to the labels themselves, not the ordinal -// child numbers. By using the child numbers in the sort, the result is -// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. -static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, - int numAddrs) { - KMP_DEBUG_ASSERT(numAddrs > 0); - int depth = address2os->first.depth; - unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); - int labCt; - for (labCt = 0; labCt < depth; labCt++) { - address2os[0].first.childNums[labCt] = counts[labCt] = 0; - lastLabel[labCt] = address2os[0].first.labels[labCt]; - } - int i; - for (i = 1; i < numAddrs; i++) { - for (labCt = 0; labCt < depth; labCt++) { - if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { - int labCt2; - for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { - counts[labCt2] = 0; - lastLabel[labCt2] = address2os[i].first.labels[labCt2]; - } - counts[labCt]++; - lastLabel[labCt] = address2os[i].first.labels[labCt]; - break; - } - } - for (labCt = 0; labCt < depth; labCt++) { - address2os[i].first.childNums[labCt] = counts[labCt]; - } - for (; labCt < (int)Address::maxDepth; labCt++) { - address2os[i].first.childNums[labCt] = 0; - } - } - __kmp_free(lastLabel); - __kmp_free(counts); -} - -// All of the __kmp_affinity_create_*_map() routines should set -// __kmp_affinity_masks to a vector of affinity mask objects of length -// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return -// the number of levels in the machine topology tree (zero if -// __kmp_affinity_type == affinity_none). -// -// All of the __kmp_affinity_create_*_map() routines should set -// *__kmp_affin_fullMask to the affinity mask for the initialization thread. -// They need to save and restore the mask, and it could be needed later, so -// saving it is just an optimization to avoid calling kmp_get_system_affinity() -// again. 
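A minimal sketch of how the new kmp_affinity_raii_t helper is intended to be used; the exact call sites are an assumption here, but the pattern replaces the manual oldMask save/restore that this patch removes from the topology detection routines further down.

  {
    // Constructor snapshots the calling thread's current system affinity mask.
    kmp_affinity_raii_t previous_affinity;
    // ... temporarily bind to individual OS procs while probing the topology ...
    previous_affinity.restore(); // put the original mask back and free it early
  } // if restore() was never called, the destructor performs the same cleanup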
+// All of the __kmp_affinity_create_*_map() routines should allocate the +// internal topology object and set the layer ids for it. Each routine +// returns a boolean on whether it was successful at doing so. kmp_affin_mask_t *__kmp_affin_fullMask = NULL; -static int nCoresPerPkg, nPackages; -static int __kmp_nThreadsPerCore; -#ifndef KMP_DFLT_NTH_CORES -static int __kmp_ncores; -#endif -static int *__kmp_pu_os_idx = NULL; -static int nDiesPerPkg = 1; - -// __kmp_affinity_uniform_topology() doesn't work when called from -// places which support arbitrarily many levels in the machine topology -// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() -// __kmp_affinity_create_x2apicid_map(). -inline static bool __kmp_affinity_uniform_topology() { - return __kmp_avail_proc == - (__kmp_nThreadsPerCore * nCoresPerPkg * nDiesPerPkg * nPackages); -} - #if KMP_USE_HWLOC - static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) { #if HWLOC_API_VERSION >= 0x00020000 return hwloc_obj_type_is_cache(obj->type); @@ -590,6 +1051,13 @@ case HWLOC_OBJ_GROUP: if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE) return KMP_HW_DIE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE) + return KMP_HW_TILE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE) + return KMP_HW_MODULE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP) + return KMP_HW_PROC_GROUP; + return KMP_HW_UNKNOWN; #if HWLOC_API_VERSION >= 0x00020100 case HWLOC_OBJ_DIE: return KMP_HW_DIE; @@ -617,35 +1085,6 @@ return retval; } -static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t, - hwloc_obj_t o, - kmp_hwloc_depth_t depth, - hwloc_obj_t *f) { - if (o->depth == depth) { - if (*f == NULL) - *f = o; // output first descendant found - return 1; - } - int sum = 0; - for (unsigned i = 0; i < o->arity; i++) - sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); - return sum; // will be 0 if no one found (as PU arity is 0) -} - -static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, - hwloc_obj_type_t type, - hwloc_obj_t *f) { - if (!hwloc_compare_types(o->type, type)) { - if (*f == NULL) - *f = o; // output first descendant found - return 1; - } - int sum = 0; - for (unsigned i = 0; i < o->arity; i++) - sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); - return sum; // will be 0 if no one found (as PU arity is 0) -} - // This gets the sub_id for a lower object under a higher object in the // topology tree static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher, @@ -671,27 +1110,23 @@ return sub_id; } -static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { +static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { kmp_hw_t type; - int hw_thread_index, sub_id, nActiveThreads; + int hw_thread_index, sub_id; int depth; hwloc_obj_t pu, obj, root, prev; - int ratio[KMP_HW_LAST]; - int count[KMP_HW_LAST]; kmp_hw_t types[KMP_HW_LAST]; + hwloc_obj_type_t hwloc_types[KMP_HW_LAST]; hwloc_topology_t tp = __kmp_hwloc_topology; *msg_id = kmp_i18n_null; - - // Save the affinity mask for the current thread. 
- kmp_affin_mask_t *oldMask; - KMP_CPU_ALLOC(oldMask); - __kmp_get_system_affinity(oldMask, TRUE); + if (__kmp_affinity_verbose) { + KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); + } if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data - // available from cpuid on the current thread, and __kmp_xproc. + // available from hwloc on the current thread, and __kmp_xproc. KMP_ASSERT(__kmp_affinity_type == affinity_none); // hwloc only guarantees existance of PU object, so check PACKAGE and CORE hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); @@ -708,19 +1143,7 @@ if (nCoresPerPkg == 0) nCoresPerPkg = 1; // to prevent possible division by 0 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotUsingHwloc, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - KMP_CPU_FREE(oldMask); - return 0; + return true; } root = hwloc_get_root_obj(tp); @@ -728,8 +1151,10 @@ // Figure out the depth and types in the topology depth = 0; pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin()); + KMP_ASSERT(pu); obj = pu; types[depth] = KMP_HW_THREAD; + hwloc_types[depth] = obj->type; depth++; while (obj != root && obj != NULL) { obj = obj->parent; @@ -743,6 +1168,7 @@ } if (memory && memory->type == HWLOC_OBJ_NUMANODE) { types[depth] = KMP_HW_NUMA; + hwloc_types[depth] = memory->type; depth++; } } @@ -750,36 +1176,36 @@ type = __kmp_hwloc_type_2_topology_type(obj); if (type != KMP_HW_UNKNOWN) { types[depth] = type; + hwloc_types[depth] = obj->type; depth++; } } - KMP_ASSERT(depth > 0 && depth <= KMP_HW_LAST); + KMP_ASSERT(depth > 0); // Get the order for the types correct for (int i = 0, j = depth - 1; i < j; ++i, --j) { + hwloc_obj_type_t hwloc_temp = hwloc_types[i]; kmp_hw_t temp = types[i]; types[i] = types[j]; types[j] = temp; + hwloc_types[i] = hwloc_types[j]; + hwloc_types[j] = hwloc_temp; } // Allocate the data structure to be returned. 
- AddrUnsPair *retval = - (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); hw_thread_index = 0; pu = NULL; - nActiveThreads = 0; while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) { int index = depth - 1; bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask); - Address hw_thread(depth); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); if (included) { - hw_thread.labels[index] = pu->logical_index; - __kmp_pu_os_idx[hw_thread_index] = pu->os_index; + hw_thread.clear(); + hw_thread.ids[index] = pu->logical_index; + hw_thread.os_id = pu->os_index; index--; - nActiveThreads++; } obj = pu; prev = obj; @@ -799,214 +1225,54 @@ if (memory && memory->type == HWLOC_OBJ_NUMANODE) { sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev); if (included) { - hw_thread.labels[index] = memory->logical_index; - hw_thread.labels[index + 1] = sub_id; + hw_thread.ids[index] = memory->logical_index; + hw_thread.ids[index + 1] = sub_id; index--; } prev = memory; } + prev = obj; } #endif type = __kmp_hwloc_type_2_topology_type(obj); if (type != KMP_HW_UNKNOWN) { sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev); if (included) { - hw_thread.labels[index] = obj->logical_index; - hw_thread.labels[index + 1] = sub_id; + hw_thread.ids[index] = obj->logical_index; + hw_thread.ids[index + 1] = sub_id; index--; } prev = obj; } } - if (included) { - retval[hw_thread_index] = AddrUnsPair(hw_thread, pu->os_index); + if (included) hw_thread_index++; - } } + __kmp_topology->sort_ids(); + return true; +} +#endif // KMP_USE_HWLOC - // If there's only one thread context to bind to, return now. - KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc); - KMP_ASSERT(nActiveThreads > 0); - if (nActiveThreads == 1) { - __kmp_ncores = nPackages = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - // Form an Address object which only includes the package level. - Address addr(1); - addr.labels[0] = retval[0].first.labels[0]; - retval[0].first = addr; - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); - } +// If we don't know how to retrieve the machine's processor topology, or +// encounter an error in doing so, this routine is called to form a "flat" +// mapping of os thread id's <-> processor id's. +static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) { + *msg_id = kmp_i18n_null; + int depth = 3; + kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}; - *address2os = retval; - KMP_CPU_FREE(oldMask); - return 1; + if (__kmp_affinity_verbose) { + KMP_INFORM(UsingFlatOS, "KMP_AFFINITY"); } - // Sort the table by physical Id. - qsort(retval, nActiveThreads, sizeof(*retval), - __kmp_affinity_cmp_Address_labels); - - // Find any levels with radiix 1, and remove them from the map - // (except for the package level). 
- depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, - types); - - __kmp_affinity_gather_enumeration_information(retval, nActiveThreads, depth, - types, ratio, count); - - for (int level = 0; level < depth; ++level) { - if ((types[level] == KMP_HW_L2 || types[level] == KMP_HW_L3)) - __kmp_tile_depth = level; - } - - // This routine should set __kmp_ncores, as well as - // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - int thread_level, core_level, tile_level, numa_level, socket_level; - thread_level = core_level = tile_level = numa_level = socket_level = -1; - for (int level = 0; level < depth; ++level) { - if (types[level] == KMP_HW_THREAD) - thread_level = level; - else if (types[level] == KMP_HW_CORE) - core_level = level; - else if (types[level] == KMP_HW_SOCKET) - socket_level = level; - else if (types[level] == KMP_HW_TILE) - tile_level = level; - else if (types[level] == KMP_HW_NUMA) - numa_level = level; - } - __kmp_nThreadsPerCore = - __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); - nCoresPerPkg = - __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); - if (socket_level >= 0) - nPackages = count[socket_level]; - else - nPackages = 1; - if (core_level >= 0) - __kmp_ncores = count[core_level]; - else - __kmp_ncores = 1; - - unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count); - - // Print the machine topology summary. - if (__kmp_affinity_verbose) { - kmp_hw_t numerator_type, denominator_type; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - - __kmp_str_buf_clear(&buf); - - if (core_level < 0) - core_level = depth - 1; - int ncores = count[core_level]; - - denominator_type = KMP_HW_UNKNOWN; - for (int level = 0; level < depth; ++level) { - int c; - bool plural; - numerator_type = types[level]; - c = ratio[level]; - plural = (c > 1); - if (level == 0) { - __kmp_str_buf_print( - &buf, "%d %s", c, - __kmp_hw_get_catalog_string(numerator_type, plural)); - } else { - __kmp_str_buf_print(&buf, " x %d %s/%s", c, - __kmp_hw_get_catalog_string(numerator_type, plural), - __kmp_hw_get_catalog_string(denominator_type)); - } - denominator_type = numerator_type; - } - KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores); - __kmp_str_buf_free(&buf); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - // Set the granularity level based on what levels are modeled - // in the machine topology map. - if (__kmp_affinity_gran == affinity_gran_node) - __kmp_affinity_gran = affinity_gran_numa; - KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default); - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; // lowest level (e.g. 
fine) - if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) - __kmp_affinity_gran_levels++; - if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) - __kmp_affinity_gran_levels++; - if ((tile_level >= 0) && (__kmp_affinity_gran > affinity_gran_tile)) - __kmp_affinity_gran_levels++; - if ((numa_level >= 0) && (__kmp_affinity_gran > affinity_gran_numa)) - __kmp_affinity_gran_levels++; - if ((socket_level >= 0) && (__kmp_affinity_gran > affinity_gran_package)) - __kmp_affinity_gran_levels++; - } - - if (__kmp_affinity_verbose) - __kmp_affinity_print_topology(retval, nActiveThreads, depth, types); - - KMP_CPU_FREE(oldMask); - *address2os = retval; - return depth; -} -#endif // KMP_USE_HWLOC - -// If we don't know how to retrieve the machine's processor topology, or -// encounter an error in doing so, this routine is called to form a "flat" -// mapping of os thread id's <-> processor id's. -static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { - *address2os = NULL; - *msg_id = kmp_i18n_null; - - // Even if __kmp_affinity_type == affinity_none, this routine might still - // called to set __kmp_ncores, as well as - // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - if (!KMP_AFFINITY_CAPABLE()) { - KMP_ASSERT(__kmp_affinity_type == affinity_none); - __kmp_ncores = nPackages = __kmp_xproc; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffFlatTopology, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - return 0; + // Even if __kmp_affinity_type == affinity_none, this routine might still + // called to set __kmp_ncores, as well as + // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + if (!KMP_AFFINITY_CAPABLE()) { + KMP_ASSERT(__kmp_affinity_type == affinity_none); + __kmp_ncores = nPackages = __kmp_xproc; + __kmp_nThreadsPerCore = nCoresPerPkg = 1; + return true; } // When affinity is off, this routine will still be called to set @@ -1015,29 +1281,9 @@ // not enabled. __kmp_ncores = nPackages = __kmp_avail_proc; __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - if (__kmp_affinity_type == affinity_none) { - int avail_ct = 0; - int i; - KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { - if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) - continue; - __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat - } - return 0; - } // Construct the data structure to be returned. 
- *address2os = - (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); int avail_ct = 0; int i; KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { @@ -1045,50 +1291,47 @@ if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { continue; } - __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat - Address addr(1); - addr.labels[0] = i; - (*address2os)[avail_ct++] = AddrUnsPair(addr, i); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct); + hw_thread.clear(); + hw_thread.os_id = i; + hw_thread.ids[0] = i; + hw_thread.ids[1] = 0; + hw_thread.ids[2] = 0; + avail_ct++; } if (__kmp_affinity_verbose) { KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); } - - if (__kmp_affinity_gran_levels < 0) { - // Only the package level is modeled in the machine topology map, - // so the #levels of granularity is either 0 or 1. - if (__kmp_affinity_gran > affinity_gran_package) { - __kmp_affinity_gran_levels = 1; - } else { - __kmp_affinity_gran_levels = 0; - } - } - return 1; + return true; } #if KMP_GROUP_AFFINITY - // If multiple Windows* OS processor groups exist, we can create a 2-level // topology map with the groups at level 0 and the individual procs at level 1. // This facilitates letting the threads float among all procs in a group, // if granularity=group (the default when there are multiple groups). -static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { - *address2os = NULL; +static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) { *msg_id = kmp_i18n_null; + int depth = 3; + kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD}; + const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR); + + if (__kmp_affinity_verbose) { + KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); + } - // If we aren't affinity capable, then return now. - // The flat mapping will be used. + // If we aren't affinity capable, then use flat topology if (!KMP_AFFINITY_CAPABLE()) { - // FIXME set *msg_id - return -1; + KMP_ASSERT(__kmp_affinity_type == affinity_none); + nPackages = __kmp_num_proc_groups; + __kmp_nThreadsPerCore = 1; + __kmp_ncores = __kmp_xproc; + nCoresPerPkg = nPackages / __kmp_ncores; + return true; } // Construct the data structure to be returned. 
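A small sketch (not part of this patch) of the group/id split used in the construction just below: on 64-bit Windows a processor group holds up to 64 logical procs (CHAR_BIT * sizeof(DWORD_PTR)), so an OS proc index splits into a group number and a position within that group. The proc indices are made up.

#include <cstdio>

int main() {
  const int BITS_PER_GROUP = 64;    // CHAR_BIT * sizeof(DWORD_PTR) on Win64
  const int procs[] = {3, 64, 100}; // hypothetical OS proc indices
  for (int p : procs)
    std::printf("proc %3d -> group %d, id within group %d\n", p,
                p / BITS_PER_GROUP, p % BITS_PER_GROUP);
  return 0;
}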
- *address2os = - (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); int avail_ct = 0; int i; KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { @@ -1096,77 +1339,18 @@ if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { continue; } - __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat - Address addr(2); - addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); - addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); - (*address2os)[avail_ct++] = AddrUnsPair(addr, i); - - if (__kmp_affinity_verbose) { - KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], - addr.labels[1]); - } + kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++); + hw_thread.clear(); + hw_thread.os_id = i; + hw_thread.ids[0] = i / BITS_PER_GROUP; + hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP; } - - if (__kmp_affinity_gran_levels < 0) { - if (__kmp_affinity_gran == affinity_gran_group) { - __kmp_affinity_gran_levels = 1; - } else if ((__kmp_affinity_gran == affinity_gran_fine) || - (__kmp_affinity_gran == affinity_gran_thread)) { - __kmp_affinity_gran_levels = 0; - } else { - const char *gran_str = NULL; - if (__kmp_affinity_gran == affinity_gran_core) { - gran_str = "core"; - } else if (__kmp_affinity_gran == affinity_gran_package) { - gran_str = "package"; - } else if (__kmp_affinity_gran == affinity_gran_node) { - gran_str = "node"; - } else { - KMP_ASSERT(0); - } - - // Warning: can't use affinity granularity \"gran\" with group topology - // method, using "thread" - __kmp_affinity_gran_levels = 0; - } - } - return 2; + return true; } - #endif /* KMP_GROUP_AFFINITY */ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 -/* - * CPUID.B or 1F, Input ECX (sub leaf # aka level number) - Bits Bits Bits Bits - 31-16 15-8 7-4 4-0 ----+-----------+--------------+-------------+-----------------+ -EAX| reserved | reserved | reserved | Bits to Shift | ----+-----------|--------------+-------------+-----------------| -EBX| reserved | Num logical processors at level (16 bits) | ----+-----------|--------------+-------------------------------| -ECX| reserved | Level Type | Level Number (8 bits) | ----+-----------+--------------+-------------------------------| -EDX| X2APIC ID (32 bits) | ----+----------------------------------------------------------+ -*/ - -enum { - INTEL_LEVEL_TYPE_INVALID = 0, // Package level - INTEL_LEVEL_TYPE_SMT = 1, - INTEL_LEVEL_TYPE_CORE = 2, - INTEL_LEVEL_TYPE_TILE = 3, - INTEL_LEVEL_TYPE_MODULE = 4, - INTEL_LEVEL_TYPE_DIE = 5, - INTEL_LEVEL_TYPE_LAST = 6, -}; - -struct cpuid_level_info_t { - unsigned level_type, mask, mask_width, nitems, cache_mask; -}; - template static inline unsigned __kmp_extract_bits(kmp_uint32 v) { const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB; @@ -1177,84 +1361,6 @@ return retval; } -static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { - switch (intel_type) { - case INTEL_LEVEL_TYPE_INVALID: - return KMP_HW_SOCKET; - case INTEL_LEVEL_TYPE_SMT: - return KMP_HW_THREAD; - case INTEL_LEVEL_TYPE_CORE: - return KMP_HW_CORE; - // TODO: add support for the tile and module - case INTEL_LEVEL_TYPE_TILE: - return KMP_HW_UNKNOWN; - case INTEL_LEVEL_TYPE_MODULE: - return KMP_HW_UNKNOWN; - case INTEL_LEVEL_TYPE_DIE: - return KMP_HW_DIE; - } - return KMP_HW_UNKNOWN; -} - -// This function takes the topology leaf, a levels array to store the 
levels -// detected and a bitmap of the known levels. -// Returns the number of levels in the topology -static unsigned -__kmp_x2apicid_get_levels(int leaf, - cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], - kmp_uint64 known_levels) { - unsigned level, levels_index; - unsigned level_type, mask_width, nitems; - kmp_cpuid buf; - - // The new algorithm has known topology layers act as highest unknown topology - // layers when unknown topology layers exist. - // e.g., Suppose layers were SMT CORE PACKAGE - // Then CORE will take the characteristics (nitems and mask width) of . - // In developing the id mask for each layer, this eliminates unknown portions - // of the topology while still keeping the correct underlying structure. - level = levels_index = 0; - do { - __kmp_x86_cpuid(leaf, level, &buf); - level_type = __kmp_extract_bits<8, 15>(buf.ecx); - mask_width = __kmp_extract_bits<0, 4>(buf.eax); - nitems = __kmp_extract_bits<0, 15>(buf.ebx); - if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) - return 0; - - if (known_levels & (1ull << level_type)) { - // Add a new level to the topology - KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); - levels[levels_index].level_type = level_type; - levels[levels_index].mask_width = mask_width; - levels[levels_index].nitems = nitems; - levels_index++; - } else { - // If it is an unknown level, then logically move the previous layer up - if (levels_index > 0) { - levels[levels_index - 1].mask_width = mask_width; - levels[levels_index - 1].nitems = nitems; - } - } - level++; - } while (level_type != INTEL_LEVEL_TYPE_INVALID); - - // Set the masks to & with apicid - for (unsigned i = 0; i < levels_index; ++i) { - if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { - levels[i].mask = ~((-1) << levels[i].mask_width); - levels[i].cache_mask = (-1) << levels[i].mask_width; - for (unsigned j = 0; j < i; ++j) - levels[i].mask ^= levels[j].mask; - } else { - KMP_DEBUG_ASSERT(levels_index > 0); - levels[i].mask = (-1) << levels[i - 1].mask_width; - levels[i].cache_mask = 0; - } - } - return levels_index; -} - static int __kmp_cpuid_mask_width(int count) { int r = 0; @@ -1293,21 +1399,78 @@ return 0; } +class kmp_cache_info_t { +public: + struct info_t { + unsigned level, mask; + }; + kmp_cache_info_t() : depth(0) { get_leaf4_levels(); } + size_t get_depth() const { return depth; } + info_t &operator[](size_t index) { return table[index]; } + const info_t &operator[](size_t index) const { return table[index]; } + + static kmp_hw_t get_topology_type(unsigned level) { + KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL); + switch (level) { + case 1: + return KMP_HW_L1; + case 2: + return KMP_HW_L2; + case 3: + return KMP_HW_L3; + } + return KMP_HW_UNKNOWN; + } + +private: + static const int MAX_CACHE_LEVEL = 3; + + size_t depth; + info_t table[MAX_CACHE_LEVEL]; + + void get_leaf4_levels() { + unsigned level = 0; + while (depth < MAX_CACHE_LEVEL) { + unsigned cache_type, max_threads_sharing; + unsigned cache_level, cache_mask_width; + kmp_cpuid buf2; + __kmp_x86_cpuid(4, level, &buf2); + cache_type = __kmp_extract_bits<0, 4>(buf2.eax); + if (!cache_type) + break; + // Skip instruction caches + if (cache_type == 2) { + level++; + continue; + } + max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1; + cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing); + cache_level = __kmp_extract_bits<5, 7>(buf2.eax); + table[depth].level = cache_level; + table[depth].mask = ((-1) << cache_mask_width); + depth++; + level++; + } + } +}; + // 
On IA-32 architecture and Intel(R) 64 architecture, we attempt to use // an algorithm which cycles through the available os threads, setting // the current thread's affinity mask to that thread, and then retrieves // the Apic Id for each thread context using the cpuid instruction. -static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { +static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { kmp_cpuid buf; - *address2os = NULL; *msg_id = kmp_i18n_null; + if (__kmp_affinity_verbose) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); + } + // Check if cpuid leaf 4 is supported. __kmp_x86_cpuid(0, 0, &buf); if (buf.eax < 4) { *msg_id = kmp_i18n_str_NoLeaf4Support; - return -1; + return false; } // The algorithm used starts by setting the affinity to each available thread @@ -1365,18 +1528,7 @@ __kmp_ncores = __kmp_xproc; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; __kmp_nThreadsPerCore = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - return 0; + return true; } // From here on, we can assume that it is safe to call @@ -1384,10 +1536,7 @@ // __kmp_affinity_type = affinity_none. // Save the affinity mask for the current thread. - kmp_affin_mask_t *oldMask; - KMP_CPU_ALLOC(oldMask); - KMP_ASSERT(oldMask != NULL); - __kmp_get_system_affinity(oldMask, TRUE); + kmp_affinity_raii_t previous_affinity; // Run through each of the available contexts, binding the current thread // to it, and obtaining the pertinent information using the cpuid instr. @@ -1431,11 +1580,9 @@ // The apic id and max threads per pkg come from cpuid(1). __kmp_x86_cpuid(1, 0, &buf); if (((buf.edx >> 9) & 1) == 0) { - __kmp_set_system_affinity(oldMask, TRUE); __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_ApicNotPresent; - return -1; + return false; } threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; @@ -1467,11 +1614,9 @@ // I've never seen this one happen, but I suppose it could, if the cpuid // instruction on a chip was really screwed up. Make sure to restore the // affinity mask before the tail call. - __kmp_set_system_affinity(oldMask, TRUE); __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; + return false; } int maskC = (1 << widthC) - 1; @@ -1485,50 +1630,7 @@ // We've collected all the info we need. // Restore the old affinity mask for this thread. - __kmp_set_system_affinity(oldMask, TRUE); - - // If there's only one thread context to bind to, form an Address object - // with depth 1 and return immediately (or, if affinity is off, set - // address2os to NULL and return). - // - // If it is configured to omit the package level when there is only a single - // package, the logic at the end of this routine won't work if there is only - // a single thread - it would try to form an Address object with depth 0. 
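A standalone sketch (not part of this patch) of the legacy APIC id split computed earlier in this routine: widthCT comes from maxThreadsPerPkg, widthC from maxCoresPerPkg, and widthT is their difference, so the id decomposes into package, core and thread fields. The id and the two limits are hypothetical.

#include <cstdio>

static int mask_width(int count) { // smallest w with (1 << w) >= count
  int w = 0;
  while ((1 << w) < count)
    ++w;
  return w;
}

int main() {
  unsigned apic_id = 0x2D;      // hypothetical APIC id from cpuid(1).ebx
  int max_threads_per_pkg = 16; // cpuid(1).ebx[23:16] on a real machine
  int max_cores_per_pkg = 8;    // cpuid(4).eax[31:26] + 1 on a real machine
  int widthCT = mask_width(max_threads_per_pkg); // low bits for core + thread
  int widthC = mask_width(max_cores_per_pkg);    // core field width
  int widthT = widthCT - widthC;                 // thread field width
  unsigned pkg_id = apic_id >> widthCT;
  unsigned core_id = (apic_id >> widthT) & ((1u << widthC) - 1);
  unsigned thread_id = apic_id & ((1u << widthT) - 1);
  std::printf("apic 0x%x -> pkg %u core %u thread %u\n", apic_id, pkg_id,
              core_id, thread_id);
  return 0;
}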
- KMP_ASSERT(nApics > 0); - if (nApics == 1) { - __kmp_ncores = nPackages = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return 0; - } - - *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); - Address addr(1); - addr.labels[0] = threadInfo[0].pkgId; - (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); - } - - __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return 1; - } + previous_affinity.restore(); // Sort the threadInfo table by physical Id. qsort(threadInfo, nApics, sizeof(*threadInfo), @@ -1597,9 +1699,8 @@ lastThreadId = threadInfo[i].threadId; } else { __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; - return -1; + return false; } // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg @@ -1607,108 +1708,183 @@ if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InconsistentCpuidInfo; - return -1; + return false; } } + // When affinity is off, this routine will still be called to set + // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + // Make sure all these vars are set correctly nPackages = pkgCt; if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; - - // When affinity is off, this routine will still be called to set - // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - // Make sure all these vars are set correctly, and return now if affinity is - // not enabled. __kmp_ncores = nCores; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - for (i = 0; i < nApics; ++i) { - __kmp_pu_os_idx[i] = threadInfo[i].osId; - } - if (__kmp_affinity_type == affinity_none) { - __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return 0; - } // Now that we've determined the number of packages, the number of cores per // package, and the number of threads per core, we can construct the data // structure that is to be returned. + int idx = 0; int pkgLevel = 0; - int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; - int threadLevel = - (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1); - unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); + int coreLevel = 1; + int threadLevel = 2; + //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); + int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); + kmp_hw_t types[3]; + if (pkgLevel >= 0) + types[idx++] = KMP_HW_SOCKET; + if (coreLevel >= 0) + types[idx++] = KMP_HW_CORE; + if (threadLevel >= 0) + types[idx++] = KMP_HW_THREAD; KMP_ASSERT(depth > 0); - *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); + __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); for (i = 0; i < nApics; ++i) { - Address addr(depth); + idx = 0; unsigned os = threadInfo[i].osId; - int d = 0; + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + hw_thread.clear(); if (pkgLevel >= 0) { - addr.labels[d++] = threadInfo[i].pkgId; + hw_thread.ids[idx++] = threadInfo[i].pkgId; } if (coreLevel >= 0) { - addr.labels[d++] = threadInfo[i].coreId; + hw_thread.ids[idx++] = threadInfo[i].coreId; } if (threadLevel >= 0) { - addr.labels[d++] = threadInfo[i].threadId; + hw_thread.ids[idx++] = threadInfo[i].threadId; } - (*address2os)[i] = AddrUnsPair(addr, os); - } - - if (__kmp_affinity_gran_levels < 0) { - // Set the granularity level based on what levels are modeled in the machine - // topology map. - __kmp_affinity_gran_levels = 0; - if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { - __kmp_affinity_gran_levels++; - } - if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { - __kmp_affinity_gran_levels++; - } - if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { - __kmp_affinity_gran_levels++; - } - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, - coreLevel, threadLevel); + hw_thread.os_id = os; } __kmp_free(threadInfo); - KMP_CPU_FREE(oldMask); - return depth; + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; + return false; + } + return true; } // Intel(R) microarchitecture code name Nehalem, Dunnington and later // architectures support a newer interface for specifying the x2APIC Ids, // based on CPUID.B or CPUID.1F -static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, - kmp_i18n_id_t *const msg_id) { +/* + * CPUID.B or 1F, Input ECX (sub leaf # aka level number) + Bits Bits Bits Bits + 31-16 15-8 7-4 4-0 +---+-----------+--------------+-------------+-----------------+ +EAX| reserved | reserved | reserved | Bits to Shift | +---+-----------|--------------+-------------+-----------------| +EBX| reserved | Num logical processors at level (16 bits) | +---+-----------|--------------+-------------------------------| +ECX| reserved | Level Type | Level Number (8 bits) | +---+-----------+--------------+-------------------------------| +EDX| X2APIC ID (32 bits) | +---+----------------------------------------------------------+ +*/ + +enum { + INTEL_LEVEL_TYPE_INVALID = 0, // Package level + INTEL_LEVEL_TYPE_SMT = 1, + INTEL_LEVEL_TYPE_CORE = 2, + INTEL_LEVEL_TYPE_TILE = 3, + INTEL_LEVEL_TYPE_MODULE = 4, + INTEL_LEVEL_TYPE_DIE = 5, + INTEL_LEVEL_TYPE_LAST = 6, +}; + +struct cpuid_level_info_t { + unsigned level_type, mask, mask_width, nitems, cache_mask; +}; + +static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { + switch (intel_type) { + case INTEL_LEVEL_TYPE_INVALID: + return KMP_HW_SOCKET; + case INTEL_LEVEL_TYPE_SMT: + return KMP_HW_THREAD; + case INTEL_LEVEL_TYPE_CORE: + return KMP_HW_CORE; + case INTEL_LEVEL_TYPE_TILE: + return KMP_HW_TILE; + case INTEL_LEVEL_TYPE_MODULE: 
+ return KMP_HW_MODULE; + case INTEL_LEVEL_TYPE_DIE: + return KMP_HW_DIE; + } + return KMP_HW_UNKNOWN; +} + +// This function takes the topology leaf, a levels array to store the levels +// detected and a bitmap of the known levels. +// Returns the number of levels in the topology +static unsigned +__kmp_x2apicid_get_levels(int leaf, + cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], + kmp_uint64 known_levels) { + unsigned level, levels_index; + unsigned level_type, mask_width, nitems; + kmp_cpuid buf; + + // New algorithm has known topology layers act as highest unknown topology + // layers when unknown topology layers exist. + // e.g., Suppose layers were SMT CORE PACKAGE, where + // are unknown topology layers, Then SMT will take the characteristics of + // (SMT x ) and CORE will take the characteristics of (CORE x x ). + // This eliminates unknown portions of the topology while still keeping the + // correct structure. + level = levels_index = 0; + do { + __kmp_x86_cpuid(leaf, level, &buf); + level_type = __kmp_extract_bits<8, 15>(buf.ecx); + mask_width = __kmp_extract_bits<0, 4>(buf.eax); + nitems = __kmp_extract_bits<0, 15>(buf.ebx); + if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) + return 0; + + if (known_levels & (1ull << level_type)) { + // Add a new level to the topology + KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); + levels[levels_index].level_type = level_type; + levels[levels_index].mask_width = mask_width; + levels[levels_index].nitems = nitems; + levels_index++; + } else { + // If it is an unknown level, then logically move the previous layer up + if (levels_index > 0) { + levels[levels_index - 1].mask_width = mask_width; + levels[levels_index - 1].nitems = nitems; + } + } + level++; + } while (level_type != INTEL_LEVEL_TYPE_INVALID); + + // Set the masks to & with apicid + for (unsigned i = 0; i < levels_index; ++i) { + if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { + levels[i].mask = ~((-1) << levels[i].mask_width); + levels[i].cache_mask = (-1) << levels[i].mask_width; + for (unsigned j = 0; j < i; ++j) + levels[i].mask ^= levels[j].mask; + } else { + KMP_DEBUG_ASSERT(levels_index > 0); + levels[i].mask = (-1) << levels[i - 1].mask_width; + levels[i].cache_mask = 0; + } + } + return levels_index; +} + +static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; - int ratio[KMP_HW_LAST]; - int count[KMP_HW_LAST]; kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; unsigned levels_index; kmp_cpuid buf; @@ -1722,6 +1898,9 @@ KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); *msg_id = kmp_i18n_null; + if (__kmp_affinity_verbose) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); + } // Figure out the known topology levels known_levels = 0ull; @@ -1771,7 +1950,7 @@ } if (topology_leaf == -1 || levels_index == 0) { *msg_id = leaf_message_id; - return -1; + return false; } KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); @@ -1784,30 +1963,40 @@ // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc. 
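A compile-and-run sketch (not part of this patch) of what __kmp_x2apicid_get_levels above does with one sub-leaf: pull the shift width and level type out of the registers with the same bit trick as __kmp_extract_bits, build per-level masks, and apply them to an x2APIC id. The register values are hypothetical.

#include <cstdio>

template <int LSB, int MSB> static unsigned extract_bits(unsigned v) {
  const unsigned SHIFT = 31 - MSB; // drop bits above MSB, then bits below LSB
  return (v << SHIFT) >> SHIFT >> LSB;
}

int main() {
  // Hypothetical sub-leaf 1 of leaf 0x1F on a 2-way SMT part:
  unsigned eax = 0x5, ecx = 0x0201, edx = 0x27; // shift width, level info, id
  unsigned core_mask_width = extract_bits<0, 4>(eax); // 5
  unsigned level_type = extract_bits<8, 15>(ecx); // 2 == INTEL_LEVEL_TYPE_CORE
  unsigned smt_mask_width = 1; // from sub-leaf 0 (not shown)

  // Masks built the same way as the level setup above:
  unsigned smt_mask = ~(~0u << smt_mask_width);              // 0x1
  unsigned core_mask = ~(~0u << core_mask_width) ^ smt_mask; // 0x1e
  unsigned pkg_mask = ~0u << core_mask_width; // everything above the core bits

  std::printf("level_type=%u mask_width=%u\n", level_type, core_mask_width);
  std::printf("apic 0x%x -> thread %u core %u pkg %u\n", edx, edx & smt_mask,
              (edx & core_mask) >> smt_mask_width,
              (edx & pkg_mask) >> core_mask_width);
  return 0;
}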
KMP_ASSERT(__kmp_affinity_type == affinity_none); - for (unsigned i = 0; i < levels_index; ++i) { if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { __kmp_nThreadsPerCore = levels[i].nitems; } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { nCoresPerPkg = levels[i].nitems; - } else if (levels[i].level_type == INTEL_LEVEL_TYPE_DIE) { - nDiesPerPkg = levels[i].nitems; } } __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffNotCapableUseLocCpuidL, "KMP_AFFINITY", topology_leaf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (__kmp_affinity_uniform_topology()) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); + return true; + } + + // Allocate the data structure to be returned. + int depth = levels_index; + for (int i = depth - 1, j = 0; i >= 0; --i, ++j) + types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); + __kmp_topology = + kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); + + // Insert equivalent cache types if they exist + kmp_cache_info_t cache_info; + for (size_t i = 0; i < cache_info.get_depth(); ++i) { + const kmp_cache_info_t::info_t &info = cache_info[i]; + unsigned cache_mask = info.mask; + unsigned cache_level = info.level; + for (unsigned j = 0; j < levels_index; ++j) { + unsigned hw_cache_mask = levels[j].cache_mask; + kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); + if (hw_cache_mask == cache_mask && j < levels_index - 1) { + kmp_hw_t type = + __kmp_intel_type_2_topology_type(levels[j + 1].level_type); + __kmp_topology->set_equivalent_type(cache_type, type); } - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); } - return 0; } // From here on, we can assume that it is safe to call @@ -1815,21 +2004,12 @@ // __kmp_affinity_type = affinity_none. // Save the affinity mask for the current thread. - kmp_affin_mask_t *oldMask; - KMP_CPU_ALLOC(oldMask); - __kmp_get_system_affinity(oldMask, TRUE); - - // Allocate the data structure to be returned. - int depth = levels_index; - for (int i = depth - 1, j = 0; i >= 0; --i, ++j) - types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); - AddrUnsPair *retval = - (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); + kmp_affinity_raii_t previous_affinity; // Run through each of the available contexts, binding the current thread // to it, and obtaining the pertinent information using the cpuid instr. 
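A sketch (not part of this patch) of the cache-equivalence test a few lines above: a cache whose leaf-4 sharing mask equals the cache_mask of some topology level is recorded as equivalent to the layer one step above that level (so a per-core L2 aliases the core layer, a per-socket L3 the socket layer). The masks and levels here are hypothetical.

#include <cstdio>

int main() {
  const int levels_index = 3;
  // finest to coarsest, each with the mask of the apic-id bits above it
  struct { const char *name; unsigned cache_mask; } levels[levels_index] = {
      {"thread", 0xfffffffeu}, {"core", 0xfffffff0u}, {"socket", 0x0u}};
  struct { int level; unsigned mask; } caches[] = {{2, 0xfffffffeu},
                                                   {3, 0xfffffff0u}};
  for (const auto &c : caches) {
    bool matched = false;
    for (int j = 0; j < levels_index - 1; ++j)
      if (levels[j].cache_mask == c.mask) {
        std::printf("L%d cache is equivalent to the %s layer\n", c.level,
                    levels[j + 1].name);
        matched = true;
      }
    if (!matched)
      std::printf("L%d cache has no equivalent topology layer\n", c.level);
  }
  return 0;
}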
unsigned int proc; - int nApics = 0; + int hw_thread_index = 0; KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; unsigned my_levels_index; @@ -1838,216 +2018,41 @@ if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { continue; } - KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); + KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); __kmp_affinity_dispatch->bind_thread(proc); // New algorithm __kmp_x86_cpuid(topology_leaf, 0, &buf); apic_id = buf.edx; - Address addr(depth); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); my_levels_index = __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); if (my_levels_index == 0 || my_levels_index != levels_index) { - KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InvalidCpuidInfo; - return -1; + return false; } + hw_thread.clear(); + hw_thread.os_id = proc; // Put in topology information for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { - addr.labels[idx] = apic_id & my_levels[j].mask; - if (j > 0) - addr.labels[idx] >>= my_levels[j - 1].mask_width; - } - retval[nApics++] = AddrUnsPair(addr, proc); - } - - // We've collected all the info we need. - // Restore the old affinity mask for this thread. - __kmp_set_system_affinity(oldMask, TRUE); - - // If there's only one thread context to bind to, return now. - KMP_ASSERT(nApics > 0); - if (nApics == 1) { - int pkg_level; - __kmp_ncores = nPackages = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = 1; - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - pkg_level = 0; - for (int i = 0; i < depth; ++i) - if (types[i] == KMP_HW_SOCKET) { - pkg_level = i; - break; - } - // Form an Address object which only includes the package level. - Address addr(1); - addr.labels[0] = retval[0].first.labels[pkg_level]; - retval[0].first = addr; - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); - } - - *address2os = retval; - KMP_CPU_FREE(oldMask); - return 1; - } - - // Sort the table by physical Id. - qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); - - __kmp_affinity_gather_enumeration_information(retval, nApics, depth, types, - ratio, count); - - // When affinity is off, this routine will still be called to set - // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. - // Make sure all these vars are set correctly, and return if affinity is not - // enabled. 
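A sketch (not part of this patch) of the ratio/count bookkeeping that the removed block below computed by hand (threads per core, cores per package, package and core totals). The numbers describe a hypothetical uniform machine.

#include <cstdio>

int main() {
  const char *types[] = {"socket", "core", "thread"};
  int ratio[] = {2, 4, 2};  // 2 sockets, 4 cores per socket, 2 threads per core
  int count[] = {2, 8, 16}; // absolute totals at each level
  for (int level = 0; level < 3; ++level)
    std::printf("%-6s ratio=%d count=%d\n", types[level], ratio[level],
                count[level]);
  // Uniformity check: the product of the per-level ratios matches the total
  // number of hw threads only when no level is partially populated.
  bool uniform = (ratio[0] * ratio[1] * ratio[2] == count[2]);
  std::printf("uniform topology: %s\n", uniform ? "yes" : "no");
  return 0;
}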
- int thread_level, core_level, socket_level, die_level; - thread_level = core_level = die_level = socket_level = -1; - for (int level = 0; level < depth; ++level) { - if (types[level] == KMP_HW_THREAD) - thread_level = level; - else if (types[level] == KMP_HW_CORE) - core_level = level; - else if (types[level] == KMP_HW_DIE) - die_level = level; - else if (types[level] == KMP_HW_SOCKET) - socket_level = level; - } - __kmp_nThreadsPerCore = - __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); - if (die_level > 0) { - nDiesPerPkg = - __kmp_affinity_calculate_ratio(ratio, die_level, socket_level); - nCoresPerPkg = __kmp_affinity_calculate_ratio(ratio, core_level, die_level); - } else { - nCoresPerPkg = - __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); - } - if (socket_level >= 0) - nPackages = count[socket_level]; - else - nPackages = 1; - if (core_level >= 0) - __kmp_ncores = count[core_level]; - else - __kmp_ncores = 1; - - // Check to see if the machine topology is uniform - unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count); - - // Print the machine topology summary. - if (__kmp_affinity_verbose) { - kmp_hw_t numerator_type, denominator_type; - KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - - if (core_level < 0) - core_level = depth - 1; - int ncores = count[core_level]; - - denominator_type = KMP_HW_UNKNOWN; - for (int level = 0; level < depth; ++level) { - int c; - bool plural; - numerator_type = types[level]; - c = ratio[level]; - plural = (c > 1); - if (level == 0) { - __kmp_str_buf_print( - &buf, "%d %s", c, - __kmp_hw_get_catalog_string(numerator_type, plural)); - } else { - __kmp_str_buf_print(&buf, " x %d %s/%s", c, - __kmp_hw_get_catalog_string(numerator_type, plural), - __kmp_hw_get_catalog_string(denominator_type)); + hw_thread.ids[idx] = apic_id & my_levels[j].mask; + if (j > 0) { + hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; } - denominator_type = numerator_type; - } - KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores); - __kmp_str_buf_free(&buf); - } - - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); - KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - for (proc = 0; (int)proc < nApics; ++proc) { - __kmp_pu_os_idx[proc] = retval[proc].second; - } - if (__kmp_affinity_type == affinity_none) { - __kmp_free(retval); - KMP_CPU_FREE(oldMask); - return 0; - } - - // Find any levels with radix 1, and remove them from the map - // (except for the package level). - depth = __kmp_affinity_remove_radix_one_levels(retval, nApics, depth, types); - thread_level = core_level = die_level = socket_level = -1; - for (int level = 0; level < depth; ++level) { - if (types[level] == KMP_HW_THREAD) - thread_level = level; - else if (types[level] == KMP_HW_CORE) - core_level = level; - else if (types[level] == KMP_HW_DIE) - die_level = level; - else if (types[level] == KMP_HW_SOCKET) - socket_level = level; - } - - if (__kmp_affinity_gran_levels < 0) { - // Set the granularity level based on what levels are modeled - // in the machine topology map. 
- __kmp_affinity_gran_levels = 0; - if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { - __kmp_affinity_gran_levels++; - } - if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { - __kmp_affinity_gran_levels++; - } - if ((die_level >= 0) && (__kmp_affinity_gran > affinity_gran_die)) { - __kmp_affinity_gran_levels++; - } - if (__kmp_affinity_gran > affinity_gran_package) { - __kmp_affinity_gran_levels++; } + hw_thread_index++; } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(retval, nApics, depth, types); + KMP_ASSERT(hw_thread_index > 0); + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; + return false; } - - KMP_CPU_FREE(oldMask); - *address2os = retval; - return depth; + return true; } - #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #define osIdIndex 0 @@ -2147,15 +2152,36 @@ } #endif // KMP_USE_HIER_SCHED +static inline const char *__kmp_cpuinfo_get_filename() { + const char *filename; + if (__kmp_cpuinfo_file != nullptr) + filename = __kmp_cpuinfo_file; + else + filename = "/proc/cpuinfo"; + return filename; +} + +static inline const char *__kmp_cpuinfo_get_envvar() { + const char *envvar = nullptr; + if (__kmp_cpuinfo_file != nullptr) + envvar = "KMP_CPUINFO_FILE"; + return envvar; +} + // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the // affinity map. -static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, - int *line, - kmp_i18n_id_t *const msg_id, - FILE *f) { - *address2os = NULL; +static bool __kmp_affinity_create_cpuinfo_map(int *line, + kmp_i18n_id_t *const msg_id) { + const char *filename = __kmp_cpuinfo_get_filename(); + const char *envvar = __kmp_cpuinfo_get_envvar(); *msg_id = kmp_i18n_null; + if (__kmp_affinity_verbose) { + KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); + } + + kmp_safe_raii_file_t f(filename, "r", envvar); + // Scan of the file, and count the number of "processor" (osId) fields, // and find the highest value of for a node_ field. char buf[256]; @@ -2190,14 +2216,12 @@ // Check for empty file / no valid processor records, or too many. The number // of records can't exceed the number of valid bits in the affinity mask. if (num_records == 0) { - *line = 0; *msg_id = kmp_i18n_str_NoProcRecords; - return -1; + return false; } if (num_records > (unsigned)__kmp_xproc) { - *line = 0; *msg_id = kmp_i18n_str_TooManyProcRecords; - return -1; + return false; } // Set the file pointer back to the beginning, so that we can scan the file @@ -2206,9 +2230,8 @@ // at the end allows us to remove a lot of extra checks for termination // conditions. if (fseek(f, 0, SEEK_SET) != 0) { - *line = 0; *msg_id = kmp_i18n_str_CantRewindCpuinfo; - return -1; + return false; } // Allocate the array of records to store the proc info in. The dummy @@ -2271,7 +2294,7 @@ if (long_line) { \ CLEANUP_THREAD_INFO; \ *msg_id = kmp_i18n_str_LongLineCpuinfo; \ - return -1; \ + return false; \ } } (*line)++; @@ -2379,7 +2402,7 @@ if ((int)num_avail == __kmp_xproc) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_TooManyEntries; - return -1; + return false; } // Check for missing fields. 
The osId field must be there, and we @@ -2387,12 +2410,12 @@ if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingProcField; - return -1; + return false; } if (threadInfo[0][pkgIdIndex] == UINT_MAX) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingPhysicalIDField; - return -1; + return false; } // Skip this proc if it is not included in the machine model. @@ -2413,12 +2436,12 @@ no_val: CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingValCpuinfo; - return -1; + return false; dup_field: CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; - return -1; + return false; } *line = 0; @@ -2428,60 +2451,11 @@ // check for num_records == __kmp_xproc ??? - // If there's only one thread context to bind to, form an Address object with - // depth 1 and return immediately (or, if affinity is off, set address2os to - // NULL and return). - // - // If it is configured to omit the package level when there is only a single - // package, the logic at the end of this routine won't work if there is only a - // single thread - it would try to form an Address object with depth 0. - KMP_ASSERT(num_avail > 0); - KMP_ASSERT(num_avail <= num_records); - if (num_avail == 1) { - __kmp_ncores = 1; - __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; - if (__kmp_affinity_verbose) { - if (!KMP_AFFINITY_CAPABLE()) { - KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } - int index; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - __kmp_str_buf_print(&buf, "1"); - for (index = maxIndex - 1; index > pkgIdIndex; index--) { - __kmp_str_buf_print(&buf, " x 1"); - } - KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); - __kmp_str_buf_free(&buf); - } - - if (__kmp_affinity_type == affinity_none) { - CLEANUP_THREAD_INFO; - return 0; - } - - *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); - Address addr(1); - addr.labels[0] = threadInfo[0][pkgIdIndex]; - (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); - - if (__kmp_affinity_gran_levels < 0) { - __kmp_affinity_gran_levels = 0; - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); - } - - CLEANUP_THREAD_INFO; - return 1; - } + // If it is configured to omit the package level when there is only a single + // package, the logic at the end of this routine won't work if there is only a + // single thread + KMP_ASSERT(num_avail > 0); + KMP_ASSERT(num_avail <= num_records); // Sort the threadInfo table by physical Id. qsort(threadInfo, num_avail, sizeof(*threadInfo), @@ -2598,7 +2572,7 @@ __kmp_free(counts); CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; - return -1; + return false; } // If the thread ids were not specified and we see entries entries that @@ -2629,43 +2603,15 @@ for (index = threadIdIndex; index < maxIndex; index++) { prod *= maxCt[index]; } - bool uniform = (prod == totals[threadIdIndex]); // When affinity is off, this routine will still be called to set // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. // Make sure all these vars are set correctly, and return now if affinity is // not enabled. 
__kmp_ncores = totals[coreIdIndex]; - - if (__kmp_affinity_verbose) { - if (!KMP_AFFINITY_CAPABLE()) { - KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - } else { - KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); - KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); - if (uniform) { - KMP_INFORM(Uniform, "KMP_AFFINITY"); - } else { - KMP_INFORM(NonUniform, "KMP_AFFINITY"); - } - } - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - - __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); - for (index = maxIndex - 1; index >= pkgIdIndex; index--) { - __kmp_str_buf_print(&buf, " x %d", maxCt[index]); - } - KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], - maxCt[threadIdIndex], __kmp_ncores); - - __kmp_str_buf_free(&buf); + if (!KMP_AFFINITY_CAPABLE()) { + KMP_ASSERT(__kmp_affinity_type == affinity_none); + return true; } #if KMP_MIC && REDUCE_TEAM_SIZE @@ -2678,21 +2624,7 @@ } #endif // KMP_MIC && REDUCE_TEAM_SIZE - KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); - __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); - for (i = 0; i < num_avail; ++i) { // fill the os indices - __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; - } - - if (__kmp_affinity_type == affinity_none) { - __kmp_free(lastId); - __kmp_free(totals); - __kmp_free(maxCt); - __kmp_free(counts); - CLEANUP_THREAD_INFO; - return 0; - } // Count the number of levels which have more nodes at that level than at the // parent's level (with there being an implicit root node of the top level). @@ -2706,78 +2638,59 @@ } inMap[maxIndex] = (totals[maxIndex] > 1); inMap[pkgIdIndex] = true; + inMap[coreIdIndex] = true; + inMap[threadIdIndex] = true; int depth = 0; + int idx = 0; + kmp_hw_t types[KMP_HW_LAST]; + int pkgLevel = -1; + int coreLevel = -1; + int threadLevel = -1; for (index = threadIdIndex; index <= maxIndex; index++) { if (inMap[index]) { depth++; } } + if (inMap[pkgIdIndex]) { + pkgLevel = idx; + types[idx++] = KMP_HW_SOCKET; + } + if (inMap[coreIdIndex]) { + coreLevel = idx; + types[idx++] = KMP_HW_CORE; + } + if (inMap[threadIdIndex]) { + threadLevel = idx; + types[idx++] = KMP_HW_THREAD; + } KMP_ASSERT(depth > 0); // Construct the data structure that is to be returned. 
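A sketch (not part of this patch) of the three /proc/cpuinfo fields this parser keys on and how they land in a hw thread record (the thread id is parsed when present and assigned otherwise; it is assumed 0 here). The stanza is made up.

#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  const char *record[] = {"processor\t: 5", "physical id\t: 1", "core id\t: 2"};
  int os_id = -1, pkg_id = -1, core_id = -1;
  for (const char *line : record) {
    int val = std::atoi(std::strchr(line, ':') + 1);
    if (!std::strncmp(line, "processor", 9))
      os_id = val;
    else if (!std::strncmp(line, "physical id", 11))
      pkg_id = val;
    else if (!std::strncmp(line, "core id", 7))
      core_id = val;
  }
  // Ids ordered the way types[] is built above: socket, then core, then thread.
  std::printf("os_id=%d -> socket=%d core=%d thread=%d\n", os_id, pkg_id,
              core_id, 0);
  return 0;
}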
- *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); - int pkgLevel = -1; - int coreLevel = -1; - int threadLevel = -1; + __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); for (i = 0; i < num_avail; ++i) { - Address addr(depth); unsigned os = threadInfo[i][osIdIndex]; int src_index; int dst_index = 0; + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + hw_thread.clear(); + hw_thread.os_id = os; + idx = 0; for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { if (!inMap[src_index]) { continue; } - addr.labels[dst_index] = threadInfo[i][src_index]; if (src_index == pkgIdIndex) { - pkgLevel = dst_index; + hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; } else if (src_index == coreIdIndex) { - coreLevel = dst_index; + hw_thread.ids[coreLevel] = threadInfo[i][src_index]; } else if (src_index == threadIdIndex) { - threadLevel = dst_index; + hw_thread.ids[threadLevel] = threadInfo[i][src_index]; } dst_index++; } - (*address2os)[i] = AddrUnsPair(addr, os); - } - - if (__kmp_affinity_gran_levels < 0) { - // Set the granularity level based on what levels are modeled - // in the machine topology map. - unsigned src_index; - __kmp_affinity_gran_levels = 0; - for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { - if (!inMap[src_index]) { - continue; - } - switch (src_index) { - case threadIdIndex: - if (__kmp_affinity_gran > affinity_gran_thread) { - __kmp_affinity_gran_levels++; - } - - break; - case coreIdIndex: - if (__kmp_affinity_gran > affinity_gran_core) { - __kmp_affinity_gran_levels++; - } - break; - - case pkgIdIndex: - if (__kmp_affinity_gran > affinity_gran_package) { - __kmp_affinity_gran_levels++; - } - break; - } - } - } - - if (__kmp_affinity_verbose) { - __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, - coreLevel, threadLevel); } __kmp_free(inMap); @@ -2786,27 +2699,32 @@ __kmp_free(maxCt); __kmp_free(counts); CLEANUP_THREAD_INFO; - return depth; + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; + return false; + } + return true; } // Create and return a table of affinity masks, indexed by OS thread ID. // This routine handles OR'ing together all the affinity masks of threads // that are sufficiently close, if granularity > fine. static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, - unsigned *numUnique, - AddrUnsPair *address2os, - unsigned numAddrs) { + unsigned *numUnique) { // First form a table of affinity masks in order of OS thread id. - unsigned depth; - unsigned maxOsId; - unsigned i; - - KMP_ASSERT(numAddrs > 0); - depth = address2os[0].first.depth; + int maxOsId; + int i; + int numAddrs = __kmp_topology->get_num_hw_threads(); + int depth = __kmp_topology->get_depth(); + KMP_ASSERT(numAddrs); + KMP_ASSERT(depth); maxOsId = 0; for (i = numAddrs - 1;; --i) { - unsigned osId = address2os[i].second; + int osId = __kmp_topology->at(i).os_id; if (osId > maxOsId) { maxOsId = osId; } @@ -2815,12 +2733,6 @@ } kmp_affin_mask_t *osId2Mask; KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); - - // Sort the address2os table according to physical order. Doing so will put - // all threads on the same core/package/node in consecutive locations. 
- qsort(address2os, numAddrs, sizeof(*address2os), - __kmp_affinity_cmp_Address_labels); - KMP_ASSERT(__kmp_affinity_gran_levels >= 0); if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); @@ -2833,52 +2745,50 @@ } // Run through the table, forming the masks for all threads on each core. - // Threads on the same core will have identical "Address" objects, not + // Threads on the same core will have identical kmp_hw_thread_t objects, not // considering the last level, which must be the thread id. All threads on a // core will appear consecutively. - unsigned unique = 0; - unsigned j = 0; // index of 1st thread on core - unsigned leader = 0; - Address *leaderAddr = &(address2os[0].first); + int unique = 0; + int j = 0; // index of 1st thread on core + int leader = 0; kmp_affin_mask_t *sum; KMP_CPU_ALLOC_ON_STACK(sum); KMP_CPU_ZERO(sum); - KMP_CPU_SET(address2os[0].second, sum); + KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); for (i = 1; i < numAddrs; i++) { // If this thread is sufficiently close to the leader (within the // granularity setting), then set the bit for this os thread in the // affinity mask for this group, and go on to the next thread. - if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { - KMP_CPU_SET(address2os[i].second, sum); + if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); continue; } // For every thread in this group, copy the mask to the thread's entry in // the osId2Mask table. Mark the first address as a leader. for (; j < i; j++) { - unsigned osId = address2os[j].second; + int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); KMP_CPU_COPY(mask, sum); - address2os[j].first.leader = (j == leader); + __kmp_topology->at(j).leader = (j == leader); } unique++; // Start a new mask. leader = i; - leaderAddr = &(address2os[i].first); KMP_CPU_ZERO(sum); - KMP_CPU_SET(address2os[i].second, sum); + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); } // For every thread in last group, copy the mask to the thread's // entry in the osId2Mask table. 
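A sketch (not part of this patch) of the grouping rule __kmp_create_masks applies: with gran_levels > 0, hw threads that agree on every id except the last gran_levels levels share one mask. A 64-bit word stands in for kmp_affin_mask_t, and the ids and OS numbering are made up (already sorted, as sort_ids() would leave them).

#include <cstdio>

int main() {
  const int depth = 3, gran_levels = 1; // granularity=core: ignore thread ids
  int ids[][3] = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1}}; // socket,core,thread
  int os_id[] = {0, 4, 1, 5};
  unsigned long long mask = 1ull << os_id[0];
  int leader = 0;
  for (int i = 1; i <= 4; ++i) {
    bool close = false;
    if (i < 4) { // "close" == equal on all but the last gran_levels levels
      close = true;
      for (int l = 0; l < depth - gran_levels; ++l)
        close = close && (ids[i][l] == ids[leader][l]);
    }
    if (close) {
      mask |= 1ull << os_id[i];
      continue;
    }
    std::printf("group led by os %d: mask 0x%llx\n", os_id[leader], mask);
    if (i < 4) {
      leader = i;
      mask = 1ull << os_id[i];
    }
  }
  return 0;
}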
for (; j < i; j++) { - unsigned osId = address2os[j].second; + int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); KMP_CPU_COPY(mask, sum); - address2os[j].first.leader = (j == leader); + __kmp_topology->at(j).leader = (j == leader); } unique++; KMP_CPU_FREE_FROM_STACK(sum); @@ -3468,673 +3378,15 @@ #undef ADD_MASK #undef ADD_MASK_OSID -#if KMP_USE_HWLOC -static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { - // skip PUs descendants of the object o - int skipped = 0; - hwloc_obj_t hT = NULL; - int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); - for (int i = 0; i < N; ++i) { - KMP_DEBUG_ASSERT(hT); - unsigned idx = hT->os_index; - if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - ++skipped; - } - hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); - } - return skipped; // count number of skipped units -} - -static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { - // check if obj has PUs present in fullMask - hwloc_obj_t hT = NULL; - int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); - for (int i = 0; i < N; ++i) { - KMP_DEBUG_ASSERT(hT); - unsigned idx = hT->os_index; - if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) - return 1; // found PU - hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); - } - return 0; // no PUs found -} -#endif // KMP_USE_HWLOC - -static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { - AddrUnsPair *newAddr; - if (__kmp_hws_requested == 0) - goto _exit; // no topology limiting actions requested, exit -#if KMP_USE_HWLOC - if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { - // Number of subobjects calculated dynamically, this works fine for - // any non-uniform topology. - // L2 cache objects are determined by depth, other objects - by type. 
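A sketch (not part of this patch) of the num/offset windowing the removed KMP_HW_SUBSET walk below applies at every layer: among objects that still have PUs in the full mask, keep those whose 1-based index lies in (offset, offset + num], skip the rest. The counts are made up.

#include <cstdio>

int main() {
  int num = 2, offset = 1; // e.g. a "2c@1" core specifier
  int cores_with_pus = 6;  // cores that still have PUs in the full mask
  for (int n = 1; n <= cores_with_pus; ++n) {
    bool keep = (n > offset) && (n <= num + offset);
    std::printf("core #%d: %s\n", n, keep ? "keep" : "skip");
  }
  return 0;
}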
- hwloc_topology_t tp = __kmp_hwloc_topology; - int nS = 0, nN = 0, nL = 0, nC = 0, - nT = 0; // logical index including skipped - int nCr = 0, nTr = 0; // number of requested units - int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters - hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) - int L2depth, idx; - - // check support of extensions ---------------------------------- - int numa_support = 0, tile_support = 0; - if (__kmp_pu_os_idx) - hT = hwloc_get_pu_obj_by_os_index(tp, - __kmp_pu_os_idx[__kmp_avail_proc - 1]); - else - hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); - if (hT == NULL) { // something's gone wrong - KMP_WARNING(AffHWSubsetUnsupported); - goto _exit; - } - // check NUMA node - hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); - hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); - if (hN != NULL && hN->depth > hS->depth) { - numa_support = 1; // 1 in case socket includes node(s) - } else if (__kmp_hws_node.num > 0) { - // don't support sockets inside NUMA node (no such HW found for testing) - KMP_WARNING(AffHWSubsetUnsupported); - goto _exit; - } - // check L2 cahce, get object by depth because of multiple caches - L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); - hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); - if (hL != NULL && - __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { - tile_support = 1; // no sense to count L2 if it includes single core - } else if (__kmp_hws_tile.num > 0) { - if (__kmp_hws_core.num == 0) { - __kmp_hws_core = __kmp_hws_tile; // replace L2 with core - __kmp_hws_tile.num = 0; - } else { - // L2 and core are both requested, but represent same object - KMP_WARNING(AffHWSubsetInvalid); - goto _exit; - } - } - // end of check of extensions ----------------------------------- - - // fill in unset items, validate settings ----------------------- - if (__kmp_hws_socket.num == 0) - __kmp_hws_socket.num = nPackages; // use all available sockets - if (__kmp_hws_socket.offset >= nPackages) { - KMP_WARNING(AffHWSubsetManySockets); - goto _exit; - } - if (numa_support) { - hN = NULL; - int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, - &hN); // num nodes in socket - if (__kmp_hws_node.num == 0) - __kmp_hws_node.num = NN; // use all available nodes - if (__kmp_hws_node.offset >= NN) { - KMP_WARNING(AffHWSubsetManyNodes); - goto _exit; - } - if (tile_support) { - // get num tiles in node - int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); - if (__kmp_hws_tile.num == 0) { - __kmp_hws_tile.num = NL + 1; - } // use all available tiles, some node may have more tiles, thus +1 - if (__kmp_hws_tile.offset >= NL) { - KMP_WARNING(AffHWSubsetManyTiles); - goto _exit; - } - int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, - &hC); // num cores in tile - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } else { // tile_support - int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, - &hC); // num cores in node - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } // tile_support - } else { // numa_support - if (tile_support) { - // get num tiles in socket - int NL = __kmp_hwloc_count_children_by_depth(tp, hS, 
L2depth, &hL); - if (__kmp_hws_tile.num == 0) - __kmp_hws_tile.num = NL; // use all available tiles - if (__kmp_hws_tile.offset >= NL) { - KMP_WARNING(AffHWSubsetManyTiles); - goto _exit; - } - int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, - &hC); // num cores in tile - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } else { // tile_support - int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, - &hC); // num cores in socket - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores - if (__kmp_hws_core.offset >= NC) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - } // tile_support - } - if (__kmp_hws_proc.num == 0) - __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs - if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { - KMP_WARNING(AffHWSubsetManyProcs); - goto _exit; - } - // end of validation -------------------------------------------- - - if (pAddr) // pAddr is NULL in case of affinity_none - newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * - __kmp_avail_proc); // max size - // main loop to form HW subset ---------------------------------- - hS = NULL; - int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); - for (int s = 0; s < NP; ++s) { - // Check Socket ----------------------------------------------- - hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); - if (!__kmp_hwloc_obj_has_PUs(tp, hS)) - continue; // skip socket if all PUs are out of fullMask - ++nS; // only count objects those have PUs in affinity mask - if (nS <= __kmp_hws_socket.offset || - nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { - n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket - continue; // move to next socket - } - nCr = 0; // count number of cores per socket - // socket requested, go down the topology tree - // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) - if (numa_support) { - nN = 0; - hN = NULL; - // num nodes in current socket - int NN = - __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); - for (int n = 0; n < NN; ++n) { - // Check NUMA Node ---------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { - hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); - continue; // skip node if all PUs are out of fullMask - } - ++nN; - if (nN <= __kmp_hws_node.offset || - nN > __kmp_hws_node.num + __kmp_hws_node.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node - hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); - continue; // move to next node - } - // node requested, go down the topology tree - if (tile_support) { - nL = 0; - hL = NULL; - int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); - for (int l = 0; l < NL; ++l) { - // Check L2 (tile) ------------------------------------ - if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // skip tile if all PUs are out of fullMask - } - ++nL; - if (nL <= __kmp_hws_tile.offset || - nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { - // skip tile as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // move to next tile - } - // tile requested, go down the topology tree - nC = 0; - hC = NULL; - // num cores in current 
tile - int NC = __kmp_hwloc_count_children_by_type(tp, hL, - HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core --------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - // num procs in current core - int NT = __kmp_hwloc_count_children_by_type(tp, hC, - HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - ++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - } // tiles loop - } else { // tile_support - // no tiles, check cores - nC = 0; - hC = NULL; - // num cores in current node - int NC = - __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core --------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - int NT = - __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - 
++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - } // tiles support - hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); - } // nodes loop - } else { // numa_support - // no NUMA support - if (tile_support) { - nL = 0; - hL = NULL; - // num tiles in current socket - int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); - for (int l = 0; l < NL; ++l) { - // Check L2 (tile) ------------------------------------ - if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // skip tile if all PUs are out of fullMask - } - ++nL; - if (nL <= __kmp_hws_tile.offset || - nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { - // skip tile as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - continue; // move to next tile - } - // tile requested, go down the topology tree - nC = 0; - hC = NULL; - // num cores per tile - int NC = - __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core --------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - // num procs per core - int NT = - __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - ++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); - } // tiles loop - } else { // tile_support - // no tiles, check cores - nC = 0; - hC = NULL; - // num cores in socket - int NC = - __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); - for (int c = 0; c < NC; ++c) { - // Check Core ------------------------------------------- - if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // skip core if all PUs are out of fullMask - } - ++nC; - if (nC <= __kmp_hws_core.offset || - nC > __kmp_hws_core.num + __kmp_hws_core.offset) { - // skip node as not requested - n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core - 
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - continue; // move to next node - } - // core requested, go down to PUs - nT = 0; - nTr = 0; - hT = NULL; - // num procs per core - int NT = - __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); - for (int t = 0; t < NT; ++t) { - // Check PU --------------------------------------- - idx = hT->os_index; - if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // skip PU if not in fullMask - } - ++nT; - if (nT <= __kmp_hws_proc.offset || - nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { - // skip PU - KMP_CPU_CLR(idx, __kmp_affin_fullMask); - ++n_old; - KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - continue; // move to next node - } - ++nTr; - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - ++n_new; - ++n_old; - hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); - } // threads loop - if (nTr > 0) { - ++nCr; // num cores per socket - ++nCo; // total num cores - if (nTr > nTpC) - nTpC = nTr; // calc max threads per core - } - hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); - } // cores loop - } // tiles support - } // numa_support - if (nCr > 0) { // found cores? - ++nPkg; // num sockets - if (nCr > nCpP) - nCpP = nCr; // calc max cores per socket - } - } // sockets loop - - // check the subset is valid - KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); - KMP_DEBUG_ASSERT(nPkg > 0); - KMP_DEBUG_ASSERT(nCpP > 0); - KMP_DEBUG_ASSERT(nTpC > 0); - KMP_DEBUG_ASSERT(nCo > 0); - KMP_DEBUG_ASSERT(nPkg <= nPackages); - KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); - KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); - KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); - - nPackages = nPkg; // correct num sockets - nCoresPerPkg = nCpP; // correct num cores per socket - __kmp_nThreadsPerCore = nTpC; // correct num threads per core - __kmp_avail_proc = n_new; // correct num procs - __kmp_ncores = nCo; // correct num cores - // hwloc topology method end - } else -#endif // KMP_USE_HWLOC - { - int n_old = 0, n_new = 0, proc_num = 0; - if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { - KMP_WARNING(AffHWSubsetNoHWLOC); - goto _exit; - } - if (__kmp_hws_socket.num == 0) - __kmp_hws_socket.num = nPackages; // use all available sockets - if (__kmp_hws_die.num == 0) - __kmp_hws_die.num = nDiesPerPkg; // use all available dies - if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = nCoresPerPkg; // use all available cores - if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) - __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts - if (!__kmp_affinity_uniform_topology()) { - KMP_WARNING(AffHWSubsetNonUniform); - goto _exit; // don't support non-uniform topology - } - if (depth > 4) { - KMP_WARNING(AffHWSubsetNonThreeLevel); - goto _exit; // don't support not-3-level topology - } - if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { - KMP_WARNING(AffHWSubsetManySockets); - goto _exit; - } - if (depth == 4 && __kmp_hws_die.offset + __kmp_hws_die.num > nDiesPerPkg) { - KMP_WARNING(AffHWSubsetManyDies); - goto _exit; - } - if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { - KMP_WARNING(AffHWSubsetManyCores); - goto _exit; - } - // Form the requested subset - if (pAddr) // pAddr is NULL in case of affinity_none - newAddr = (AddrUnsPair *)__kmp_allocate( - sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_die.num * - 
__kmp_hws_core.num * __kmp_hws_proc.num); - for (int i = 0; i < nPackages; ++i) { - if (i < __kmp_hws_socket.offset || - i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { - // skip not-requested socket - n_old += nDiesPerPkg * nCoresPerPkg * __kmp_nThreadsPerCore; - if (__kmp_pu_os_idx != NULL) { - // walk through skipped socket - for (int l = 0; l < nDiesPerPkg; ++l) { - for (int j = 0; j < nCoresPerPkg; ++j) { - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; - } - } - } - } - } else { - // walk through requested socket - for (int l = 0; l < nDiesPerPkg; ++l) { - // skip unwanted die - if (l < __kmp_hws_die.offset || - l >= __kmp_hws_die.offset + __kmp_hws_die.num) { - n_old += nCoresPerPkg; - if (__kmp_pu_os_idx != NULL) { - for (int k = 0; k < nCoresPerPkg; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; - } - } - } else { - for (int j = 0; j < nCoresPerPkg; ++j) { - if (j < __kmp_hws_core.offset || - j >= __kmp_hws_core.offset + - __kmp_hws_core.num) { // skip not-requested core - n_old += __kmp_nThreadsPerCore; - if (__kmp_pu_os_idx != NULL) { - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], - __kmp_affin_fullMask); - ++proc_num; - } - } - } else { - // walk through requested core - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - if (k < __kmp_hws_proc.num) { - if (pAddr) // collect requested thread's data - newAddr[n_new] = (*pAddr)[n_old]; - n_new++; - } else { - if (__kmp_pu_os_idx != NULL) - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], - __kmp_affin_fullMask); - } - n_old++; - ++proc_num; - } - } - } - } - } - } - } - KMP_DEBUG_ASSERT(n_old == nPackages * nDiesPerPkg * nCoresPerPkg * - __kmp_nThreadsPerCore); - KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_die.num * - __kmp_hws_core.num * __kmp_hws_proc.num); - nPackages = __kmp_hws_socket.num; // correct nPackages - nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg - nDiesPerPkg = __kmp_hws_die.num; // correct nDiesPerPkg - __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore - __kmp_avail_proc = n_new; // correct avail_proc - __kmp_ncores = - nPackages * nDiesPerPkg * __kmp_hws_core.num; // correct ncores - } // non-hwloc topology method - if (pAddr) { - __kmp_free(*pAddr); - *pAddr = newAddr; // replace old topology with new one - } - if (__kmp_affinity_verbose) { - KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - __kmp_str_buf_print(&buf, "%d", nPackages); - KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, - __kmp_nThreadsPerCore, __kmp_ncores); - __kmp_str_buf_free(&buf); - } -_exit: - if (__kmp_pu_os_idx != NULL) { - __kmp_free(__kmp_pu_os_idx); - __kmp_pu_os_idx = NULL; - } -} - // This function figures out the deepest level at which there is at least one // cluster/core with more than one processing unit bound to it. 
-static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, - int nprocs, int bottom_level) { +static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { int core_level = 0; for (int i = 0; i < nprocs; i++) { + const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); for (int j = bottom_level; j > 0; j--) { - if (address2os[i].first.labels[j] > 0) { + if (hw_thread.ids[j] > 0) { if (core_level < (j - 1)) { core_level = j - 1; } @@ -4145,83 +3397,42 @@ } // This function counts number of clusters/cores at given level. -static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, - int nprocs, int bottom_level, +static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, int core_level) { - int ncores = 0; - int i, j; - - j = bottom_level; - for (i = 0; i < nprocs; i++) { - for (j = bottom_level; j > core_level; j--) { - if ((i + 1) < nprocs) { - if (address2os[i + 1].first.labels[j] > 0) { + return __kmp_topology->get_count(core_level); +} +// This function finds to which cluster/core given processing unit is bound. +static int __kmp_affinity_find_core(int proc, int bottom_level, + int core_level) { + int core = 0; + KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); + for (int i = 0; i <= proc; ++i) { + if (i + 1 <= proc) { + for (int j = 0; j <= core_level; ++j) { + if (__kmp_topology->at(i + 1).sub_ids[j] != + __kmp_topology->at(i).sub_ids[j]) { + core++; break; } } } - if (j == core_level) { - ncores++; - } - } - if (j > core_level) { - // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one - // core. May occur when called from __kmp_affinity_find_core(). - ncores++; } - return ncores; -} - -// This function finds to which cluster/core given processing unit is bound. -static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, - int bottom_level, int core_level) { - return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, - core_level) - - 1; + return core; } // This function finds maximal number of processing units bound to a // cluster/core at given level. 
-static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, - int nprocs, int bottom_level, +static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, int core_level) { - int maxprocpercore = 0; - - if (core_level < bottom_level) { - for (int i = 0; i < nprocs; i++) { - int percore = address2os[i].first.labels[core_level + 1] + 1; - - if (percore > maxprocpercore) { - maxprocpercore = percore; - } - } - } else { - maxprocpercore = 1; - } - return maxprocpercore; + if (core_level >= bottom_level) + return 1; + int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); + return __kmp_topology->calculate_ratio(thread_level, core_level); } -static AddrUnsPair *address2os = NULL; static int *procarr = NULL; static int __kmp_aff_depth = 0; -#if KMP_USE_HIER_SCHED -#define KMP_EXIT_AFF_NONE \ - KMP_ASSERT(__kmp_affinity_type == affinity_none); \ - KMP_ASSERT(address2os == NULL); \ - __kmp_apply_thread_places(NULL, 0); \ - __kmp_create_affinity_none_places(); \ - __kmp_dispatch_set_hierarchy_values(); \ - return; -#else -#define KMP_EXIT_AFF_NONE \ - KMP_ASSERT(__kmp_affinity_type == affinity_none); \ - KMP_ASSERT(address2os == NULL); \ - __kmp_apply_thread_places(NULL, 0); \ - __kmp_create_affinity_none_places(); \ - return; -#endif - // Create a one element mask array (set of places) which only contains the // initial process's affinity mask static void __kmp_create_affinity_none_places() { @@ -4233,31 +3444,6 @@ KMP_CPU_COPY(dest, __kmp_affin_fullMask); } -static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { - const Address *aa = &(((const AddrUnsPair *)a)->first); - const Address *bb = &(((const AddrUnsPair *)b)->first); - unsigned depth = aa->depth; - unsigned i; - KMP_DEBUG_ASSERT(depth == bb->depth); - KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); - KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); - for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { - int j = depth - i - 1; - if (aa->childNums[j] < bb->childNums[j]) - return -1; - if (aa->childNums[j] > bb->childNums[j]) - return 1; - } - for (; i < depth; i++) { - int j = i - __kmp_affinity_compact; - if (aa->childNums[j] < bb->childNums[j]) - return -1; - if (aa->childNums[j] > bb->childNums[j]) - return 1; - } - return 0; -} - static void __kmp_aux_affinity_initialize(void) { if (__kmp_affinity_masks != NULL) { KMP_ASSERT(__kmp_affin_fullMask != NULL); @@ -4317,14 +3503,6 @@ } } - if (__kmp_affinity_gran == affinity_gran_tile && - // check if user's request is valid - __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { - KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); - __kmp_affinity_gran = affinity_gran_package; - } - - int depth = -1; kmp_i18n_id_t msg_id = kmp_i18n_null; // For backward compatibility, setting KMP_CPUINFO_FILE => @@ -4334,22 +3512,18 @@ __kmp_affinity_top_method = affinity_top_method_cpuinfo; } + bool success = false; if (__kmp_affinity_top_method == affinity_top_method_all) { - // In the default code path, errors are not fatal - we just try using - // another method. We only emit a warning message if affinity is on, or the - // verbose flag is set, and the nowarnings flag was not set. - const char *file_name = NULL; - int line = 0; +// In the default code path, errors are not fatal - we just try using +// another method. We only emit a warning message if affinity is on, or the +// verbose flag is set, and the nowarnings flag was not set. 
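+// The detection methods below are tried in order: hwloc (when built with it), +// x2APIC ids, legacy APIC ids, /proc/cpuinfo, Windows processor groups, and +// finally a flat OS-proc map.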
#if KMP_USE_HWLOC - if (depth < 0 && + if (!success && __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); - } if (!__kmp_hwloc_error) { - depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + success = __kmp_affinity_create_hwloc_map(&msg_id); + if (!success && __kmp_affinity_verbose) { + KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); } } else if (__kmp_affinity_verbose) { KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); @@ -4358,166 +3532,85 @@ #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - - if (depth < 0) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); - } - - file_name = NULL; - depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + if (!success) { + success = __kmp_affinity_create_x2apicid_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } - - if (depth < 0) { - if (__kmp_affinity_verbose) { - if (msg_id != kmp_i18n_null) { - KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", - __kmp_i18n_catgets(msg_id), - KMP_I18N_STR(DecodingLegacyAPIC)); - } else { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", - KMP_I18N_STR(DecodingLegacyAPIC)); - } - } - - file_name = NULL; - depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } + } + if (!success) { + success = __kmp_affinity_create_apicid_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #if KMP_OS_LINUX - - if (depth < 0) { - if (__kmp_affinity_verbose) { - if (msg_id != kmp_i18n_null) { - KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", - __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); - } else { - KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); - } - } - - kmp_safe_raii_file_t f("/proc/cpuinfo", "r"); - depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + if (!success) { + int line = 0; + success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_OS_LINUX */ #if KMP_GROUP_AFFINITY - - if ((depth < 0) && (__kmp_num_proc_groups > 1)) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); + if (!success && (__kmp_num_proc_groups > 1)) { + success = __kmp_affinity_create_proc_group_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } - - depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); - KMP_ASSERT(depth != 0); } - #endif /* KMP_GROUP_AFFINITY */ - if (depth < 0) { - if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { - if (file_name == NULL) { - KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); - } else if (line == 0) { - KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); - } else { - KMP_INFORM(UsingFlatOSFileLine, file_name, line, - __kmp_i18n_catgets(msg_id)); - } - } - // FIXME - print msg if msg_id = kmp_i18n_null ??? 
- - file_name = ""; - depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + if (!success) { + success = __kmp_affinity_create_flat_map(&msg_id); + if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); } - KMP_ASSERT(depth > 0); - KMP_ASSERT(address2os != NULL); + KMP_ASSERT(success); } } +// If the user has specified that a particular topology discovery method is to be +// used, then we abort if that method fails. The exception is group affinity, +// which might have been implicitly set. #if KMP_USE_HWLOC else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); - } - depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; + success = __kmp_affinity_create_hwloc_map(&msg_id); + if (!success) { + KMP_ASSERT(msg_id != kmp_i18n_null); + KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } #endif // KMP_USE_HWLOC - // If the user has specified that a particular topology discovery method is to - // be used, then we abort if that method fails. The exception is group - // affinity, which might have been implicitly set. - #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); - } - - depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } - if (depth < 0) { + success = __kmp_affinity_create_x2apicid_map(&msg_id); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); - } - - depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } - if (depth < 0) { + success = __kmp_affinity_create_apicid_map(&msg_id); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { - const char *filename; - const char *env_var = nullptr; - if (__kmp_cpuinfo_file != NULL) { - filename = __kmp_cpuinfo_file; - env_var = "KMP_CPUINFO_FILE"; - } else { - filename = "/proc/cpuinfo"; - } - - if (__kmp_affinity_verbose) { - KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); - } - - kmp_safe_raii_file_t f(filename, "r", env_var); int line = 0; - depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); - if (depth < 0) { + success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); + const char *filename = __kmp_cpuinfo_get_filename(); if (line > 0) { KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); @@ -4525,84 +3618,80 @@ KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); } } - if (__kmp_affinity_type == affinity_none) { - KMP_ASSERT(depth == 0); - KMP_EXIT_AFF_NONE; - } } #if KMP_GROUP_AFFINITY - else if (__kmp_affinity_top_method == 
affinity_top_method_group) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); - } - - depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); - KMP_ASSERT(depth != 0); - if (depth < 0) { + success = __kmp_affinity_create_proc_group_map(&msg_id); + KMP_ASSERT(success); + if (!success) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } - #endif /* KMP_GROUP_AFFINITY */ else if (__kmp_affinity_top_method == affinity_top_method_flat) { - if (__kmp_affinity_verbose) { - KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); - } - - depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); - if (depth == 0) { - KMP_EXIT_AFF_NONE; - } + success = __kmp_affinity_create_flat_map(&msg_id); // should not fail - KMP_ASSERT(depth > 0); - KMP_ASSERT(address2os != NULL); + KMP_ASSERT(success); } -#if KMP_USE_HIER_SCHED - __kmp_dispatch_set_hierarchy_values(); -#endif - - if (address2os == NULL) { + // Early exit if topology could not be created + if (!__kmp_topology) { if (KMP_AFFINITY_CAPABLE() && (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { KMP_WARNING(ErrorInitializeAffinity); } + if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && + __kmp_ncores > 0) { + __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); + __kmp_topology->canonicalize(nPackages, nCoresPerPkg, + __kmp_nThreadsPerCore, __kmp_ncores); + if (__kmp_affinity_verbose) { + __kmp_topology->print("KMP_AFFINITY"); + } + } __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); +#if KMP_USE_HIER_SCHED + __kmp_dispatch_set_hierarchy_values(); +#endif KMP_AFFINITY_DISABLE(); return; } - if (__kmp_affinity_gran == affinity_gran_tile -#if KMP_USE_HWLOC - && __kmp_tile_depth == 0 + // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and + // initialize other data structures which depend on the topology + __kmp_topology->canonicalize(); + if (__kmp_affinity_verbose) + __kmp_topology->print("KMP_AFFINITY"); + bool filtered = __kmp_topology->filter_hw_subset(); + if (filtered && __kmp_affinity_verbose) + __kmp_topology->print("KMP_HW_SUBSET"); + machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); + KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); + // If KMP_AFFINITY=none, then only create the single "none" place + // which is the process's initial affinity mask or the number of + // hardware threads depending on respect,norespect + if (__kmp_affinity_type == affinity_none) { + __kmp_create_affinity_none_places(); +#if KMP_USE_HIER_SCHED + __kmp_dispatch_set_hierarchy_values(); #endif - ) { - // tiles requested but not detected, warn user on this - KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); + return; } - - __kmp_apply_thread_places(&address2os, depth); + int depth = __kmp_topology->get_depth(); // Create the table of masks, indexed by thread Id. unsigned maxIndex; unsigned numUnique; - kmp_affin_mask_t *osId2Mask = - __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); + kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique); if (__kmp_affinity_gran_levels == 0) { KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); } - // Set the childNums vector in all Address objects. This must be done before - // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into - // account the setting of __kmp_affinity_compact. 
- __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); - switch (__kmp_affinity_type) { case affinity_explicit: @@ -4627,18 +3716,17 @@ } break; - // The other affinity types rely on sorting the Addresses according to some - // permutation of the machine topology tree. Set __kmp_affinity_compact and - // __kmp_affinity_offset appropriately, then jump to a common code fragment - // to do the sort and create the array of affinity masks. - + // The other affinity types rely on sorting the hardware threads according to + // some permutation of the machine topology tree. Set __kmp_affinity_compact + // and __kmp_affinity_offset appropriately, then jump to a common code + // fragment to do the sort and create the array of affinity masks. case affinity_logical: __kmp_affinity_compact = 0; if (__kmp_affinity_offset) { __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; } - goto sortAddresses; + goto sortTopology; case affinity_physical: if (__kmp_nThreadsPerCore > 1) { @@ -4653,7 +3741,7 @@ __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; } - goto sortAddresses; + goto sortTopology; case affinity_scatter: if (__kmp_affinity_compact >= depth) { @@ -4661,13 +3749,13 @@ } else { __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; } - goto sortAddresses; + goto sortTopology; case affinity_compact: if (__kmp_affinity_compact >= depth) { __kmp_affinity_compact = depth - 1; } - goto sortAddresses; + goto sortTopology; case affinity_balanced: if (depth <= 1) { @@ -4677,16 +3765,16 @@ __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); return; - } else if (!__kmp_affinity_uniform_topology()) { + } else if (!__kmp_topology->is_uniform()) { // Save the depth for further usage __kmp_aff_depth = depth; - int core_level = __kmp_affinity_find_core_level( - address2os, __kmp_avail_proc, depth - 1); - int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, - depth - 1, core_level); + int core_level = + __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); + int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, + core_level); int maxprocpercore = __kmp_affinity_max_proc_per_core( - address2os, __kmp_avail_proc, depth - 1, core_level); + __kmp_avail_proc, depth - 1, core_level); int nproc = ncores * maxprocpercore; if ((nproc < 2) || (nproc < __kmp_avail_proc)) { @@ -4705,9 +3793,8 @@ int lastcore = -1; int inlastcore = 0; for (int i = 0; i < __kmp_avail_proc; i++) { - int proc = address2os[i].second; - int core = - __kmp_affinity_find_core(address2os, i, depth - 1, core_level); + int proc = __kmp_topology->at(i).os_id; + int core = __kmp_affinity_find_core(i, depth - 1, core_level); if (core == lastcore) { inlastcore++; @@ -4723,7 +3810,7 @@ __kmp_affinity_compact = depth - 1; } - sortAddresses: + sortTopology: // Allocate the gtid->affinity mask table. if (__kmp_affinity_dups) { __kmp_affinity_num_masks = __kmp_avail_proc; @@ -4739,18 +3826,19 @@ KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); - // Sort the address2os table according to the current setting of + // Sort the topology table according to the current setting of // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
- qsort(address2os, __kmp_avail_proc, sizeof(*address2os), - __kmp_affinity_cmp_Address_child_num); + __kmp_topology->sort_compact(); { int i; unsigned j; - for (i = 0, j = 0; i < __kmp_avail_proc; i++) { - if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { + int num_hw_threads = __kmp_topology->get_num_hw_threads(); + for (i = 0, j = 0; i < num_hw_threads; i++) { + if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) { continue; } - unsigned osId = address2os[i].second; + int osId = __kmp_topology->at(i).os_id; + kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); KMP_ASSERT(KMP_CPU_ISSET(osId, src)); @@ -4761,6 +3849,8 @@ } KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); } + // Sort the topology back using ids + __kmp_topology->sort_ids(); break; default: @@ -4768,9 +3858,7 @@ } KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); - machine_hierarchy.init(address2os, __kmp_avail_proc); } -#undef KMP_EXIT_AFF_NONE void __kmp_affinity_initialize(void) { // Much of the code above was written assuming that if a machine was not @@ -4810,10 +3898,6 @@ __kmp_free(__kmp_affinity_proclist); __kmp_affinity_proclist = NULL; } - if (address2os != NULL) { - __kmp_free(address2os); - address2os = NULL; - } if (procarr != NULL) { __kmp_free(procarr); procarr = NULL; @@ -4824,6 +3908,14 @@ __kmp_hwloc_topology = NULL; } #endif + if (__kmp_hw_subset) { + kmp_hw_subset_t::deallocate(__kmp_hw_subset); + __kmp_hw_subset = nullptr; + } + if (__kmp_topology) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + } KMPAffinity::destroy_api(); } @@ -5201,15 +4293,14 @@ int tid = th->th.th_info.ds.ds_tid; switch (__kmp_affinity_gran) { - case affinity_gran_fine: - case affinity_gran_thread: + case KMP_HW_THREAD: break; - case affinity_gran_core: + case KMP_HW_CORE: if (__kmp_nThreadsPerCore > 1) { fine_gran = false; } break; - case affinity_gran_package: + case KMP_HW_SOCKET: if (nCoresPerPkg > 1) { fine_gran = false; } @@ -5218,7 +4309,7 @@ fine_gran = false; } - if (__kmp_affinity_uniform_topology()) { + if (__kmp_topology->is_uniform()) { int coreID; int threadID; // Number of hyper threads per core in HT machine @@ -5242,7 +4333,6 @@ coreID = (tid - big_cores) / chunk; threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; } - KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); @@ -5250,12 +4340,13 @@ KMP_CPU_ZERO(mask); if (fine_gran) { - int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; + int osID = + __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; KMP_CPU_SET(osID, mask); } else { for (int i = 0; i < __kmp_nth_per_core; i++) { int osID; - osID = address2os[coreID * __kmp_nth_per_core + i].second; + osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; KMP_CPU_SET(osID, mask); } } @@ -5271,26 +4362,26 @@ kmp_affin_mask_t *mask = th->th.th_affin_mask; KMP_CPU_ZERO(mask); - int core_level = __kmp_affinity_find_core_level( - address2os, __kmp_avail_proc, __kmp_aff_depth - 1); - int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, + int core_level = + __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); + int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, __kmp_aff_depth - 1, core_level); int nth_per_core = __kmp_affinity_max_proc_per_core( - address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); + __kmp_avail_proc, __kmp_aff_depth - 1, core_level); // 
For performance gain consider the special case nthreads == // __kmp_avail_proc if (nthreads == __kmp_avail_proc) { if (fine_gran) { - int osID = address2os[tid].second; + int osID = __kmp_topology->at(tid).os_id; KMP_CPU_SET(osID, mask); } else { - int core = __kmp_affinity_find_core(address2os, tid, - __kmp_aff_depth - 1, core_level); + int core = + __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); for (int i = 0; i < __kmp_avail_proc; i++) { - int osID = address2os[i].second; - if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, - core_level) == core) { + int osID = __kmp_topology->at(i).os_id; + if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == + core) { KMP_CPU_SET(osID, mask); } } diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -247,8 +247,6 @@ #if KMP_USE_HWLOC int __kmp_hwloc_error = FALSE; hwloc_topology_t __kmp_hwloc_topology = NULL; -int __kmp_numa_detected = FALSE; -int __kmp_tile_depth = 0; #endif #if KMP_OS_WINDOWS @@ -263,7 +261,7 @@ size_t __kmp_affin_mask_size = 0; enum affinity_type __kmp_affinity_type = affinity_default; -enum affinity_gran __kmp_affinity_gran = affinity_gran_default; +kmp_hw_t __kmp_affinity_gran = KMP_HW_UNKNOWN; int __kmp_affinity_gran_levels = -1; int __kmp_affinity_dups = TRUE; enum affinity_top_method __kmp_affinity_top_method = @@ -286,15 +284,6 @@ int __kmp_display_affinity = FALSE; char *__kmp_affinity_format = NULL; -kmp_hws_item_t __kmp_hws_socket = {0, 0}; -kmp_hws_item_t __kmp_hws_die = {0, 0}; -kmp_hws_item_t __kmp_hws_node = {0, 0}; -kmp_hws_item_t __kmp_hws_tile = {0, 0}; -kmp_hws_item_t __kmp_hws_core = {0, 0}; -kmp_hws_item_t __kmp_hws_proc = {0, 0}; -int __kmp_hws_requested = 0; -int __kmp_hws_abs_flag = 0; // absolute or per-item number requested - kmp_int32 __kmp_default_device = 0; kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -2069,9 +2069,9 @@ enum affinity_type *out_type, char **out_proclist, int *out_verbose, int *out_warn, int *out_respect, - enum affinity_gran *out_gran, - int *out_gran_levels, int *out_dups, - int *out_compact, int *out_offset) { + kmp_hw_t *out_gran, int *out_gran_levels, + int *out_dups, int *out_compact, + int *out_offset) { char *buffer = NULL; // Copy of env var value. char *buf = NULL; // Buffer for strtok_r() function. char *next = NULL; // end of token / start of next. 
@@ -2087,6 +2087,7 @@ int respect = 0; int gran = 0; int dups = 0; + bool set = false; KMP_ASSERT(value != NULL); @@ -2232,45 +2233,51 @@ SKIP_WS(next); buf = next; - if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_fine, -1); - buf = next; - } else if (__kmp_match_str("thread", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_thread, -1); - buf = next; - } else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_core, -1); - buf = next; -#if KMP_USE_HWLOC - } else if (__kmp_match_str("tile", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_tile, -1); - buf = next; -#endif - } else if (__kmp_match_str("die", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_die, -1); - buf = next; - } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_package, -1); - buf = next; - } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_node, -1); - buf = next; + + // Try any hardware topology type for granularity + KMP_FOREACH_HW_TYPE(type) { + const char *name = __kmp_hw_get_keyword(type); + if (__kmp_match_str(name, buf, CCAST(const char **, &next))) { + set_gran(type, -1); + buf = next; + set = true; + break; + } + } + if (!set) { + // Support older names for different granularity layers + if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_THREAD, -1); + buf = next; + set = true; + } else if (__kmp_match_str("package", buf, + CCAST(const char **, &next))) { + set_gran(KMP_HW_SOCKET, -1); + buf = next; + set = true; + } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_NUMA, -1); + buf = next; + set = true; #if KMP_GROUP_AFFINITY - } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) { - set_gran(affinity_gran_group, -1); - buf = next; + } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_PROC_GROUP, -1); + buf = next; + set = true; #endif /* KMP_GROUP AFFINITY */ - } else if ((*buf >= '0') && (*buf <= '9')) { - int n; - next = buf; - SKIP_DIGITS(next); - n = __kmp_str_to_int(buf, *next); - KMP_ASSERT(n >= 0); - buf = next; - set_gran(affinity_gran_default, n); - } else { - EMIT_WARN(TRUE, (AffInvalidParam, name, start)); - continue; + } else if ((*buf >= '0') && (*buf <= '9')) { + int n; + next = buf; + SKIP_DIGITS(next); + n = __kmp_str_to_int(buf, *next); + KMP_ASSERT(n >= 0); + buf = next; + set_gran(KMP_HW_UNKNOWN, n); + set = true; + } else { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } } } else if (__kmp_match_str("proclist", buf, CCAST(const char **, &next))) { char *temp_proclist; @@ -2377,20 +2384,20 @@ *out_offset = number[1]; } - if (__kmp_affinity_gran == affinity_gran_default) { + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { #if KMP_MIC_SUPPORTED if (__kmp_mic_type != non_mic) { if (__kmp_affinity_verbose || __kmp_affinity_warnings) { KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "fine"); } - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } else #endif { if (__kmp_affinity_verbose || __kmp_affinity_warnings) { KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "core"); } - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; } } } break; @@ -2475,31 +2482,8 @@ } else { __kmp_str_buf_print(buffer, "%s,", "norespect"); } - switch (__kmp_affinity_gran) { - case affinity_gran_default: - 
__kmp_str_buf_print(buffer, "%s", "granularity=default,"); - break; - case affinity_gran_fine: - __kmp_str_buf_print(buffer, "%s", "granularity=fine,"); - break; - case affinity_gran_thread: - __kmp_str_buf_print(buffer, "%s", "granularity=thread,"); - break; - case affinity_gran_core: - __kmp_str_buf_print(buffer, "%s", "granularity=core,"); - break; - case affinity_gran_package: - __kmp_str_buf_print(buffer, "%s", "granularity=package,"); - break; - case affinity_gran_node: - __kmp_str_buf_print(buffer, "%s", "granularity=node,"); - break; -#if KMP_GROUP_AFFINITY - case affinity_gran_group: - __kmp_str_buf_print(buffer, "%s", "granularity=group,"); - break; -#endif /* KMP_GROUP_AFFINITY */ - } + __kmp_str_buf_print(buffer, "granularity=%s,", + __kmp_hw_get_keyword(__kmp_affinity_gran, false)); } if (!KMP_AFFINITY_CAPABLE()) { __kmp_str_buf_print(buffer, "%s", "disabled"); @@ -2571,7 +2555,7 @@ // GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=... __kmp_affinity_proclist = temp_proclist; __kmp_affinity_type = affinity_explicit; - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; } else { KMP_WARNING(AffSyntaxError, name); @@ -2856,10 +2840,20 @@ static void __kmp_stg_parse_places(char const *name, char const *value, void *data) { + struct kmp_place_t { + const char *name; + kmp_hw_t type; + }; int count; + bool set = false; const char *scan = value; const char *next = scan; const char *kind = "\"threads\""; + kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD}, + {"cores", KMP_HW_CORE}, + {"numa_domains", KMP_HW_NUMA}, + {"ll_caches", KMP_HW_LLC}, + {"sockets", KMP_HW_SOCKET}}; kmp_setting_t **rivals = (kmp_setting_t **)data; int rc; @@ -2868,52 +2862,47 @@ return; } - if (__kmp_match_str("threads", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_thread; - __kmp_affinity_dups = FALSE; - kind = "\"threads\""; - } else if (__kmp_match_str("cores", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_core; - __kmp_affinity_dups = FALSE; - kind = "\"cores\""; -#if KMP_USE_HWLOC - } else if (__kmp_match_str("tiles", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_tile; - __kmp_affinity_dups = FALSE; - kind = "\"tiles\""; -#endif - } else if (__kmp_match_str("dice", scan, &next) || - __kmp_match_str("dies", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_die; - __kmp_affinity_dups = FALSE; - kind = "\"dice\""; - } else if (__kmp_match_str("sockets", scan, &next)) { - scan = next; - __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_package; - __kmp_affinity_dups = FALSE; - kind = "\"sockets\""; - } else { + // Standard choices + for (size_t i = 0; i < sizeof(std_places) / sizeof(std_places[0]); ++i) { + const kmp_place_t &place = std_places[i]; + if (__kmp_match_str(place.name, scan, &next)) { + scan = next; + __kmp_affinity_type = affinity_compact; + __kmp_affinity_gran = place.type; + __kmp_affinity_dups = FALSE; + set = true; + break; + } + } + // Implementation choices for OMP_PLACES based on internal types + if (!set) { + KMP_FOREACH_HW_TYPE(type) { + const char *name = __kmp_hw_get_keyword(type, true); + if (__kmp_match_str("unknowns", scan, &next)) + continue; + if (__kmp_match_str(name, scan, &next)) { + scan = next; + 
__kmp_affinity_type = affinity_compact; + __kmp_affinity_gran = type; + __kmp_affinity_dups = FALSE; + set = true; + break; + } + } + } + if (!set) { if (__kmp_affinity_proclist != NULL) { KMP_INTERNAL_FREE((void *)__kmp_affinity_proclist); __kmp_affinity_proclist = NULL; } if (__kmp_parse_place_list(name, value, &__kmp_affinity_proclist)) { __kmp_affinity_type = affinity_explicit; - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; __kmp_affinity_dups = FALSE; } else { // Syntax error fallback __kmp_affinity_type = affinity_compact; - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; __kmp_affinity_dups = FALSE; } if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { @@ -2921,6 +2910,9 @@ } return; } + if (__kmp_affinity_gran != KMP_HW_UNKNOWN) { + kind = __kmp_hw_get_keyword(__kmp_affinity_gran); + } if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; @@ -2985,31 +2977,12 @@ } else { num = 0; } - if (__kmp_affinity_gran == affinity_gran_thread) { - if (num > 0) { - __kmp_str_buf_print(buffer, "='threads(%d)'\n", num); - } else { - __kmp_str_buf_print(buffer, "='threads'\n"); - } - } else if (__kmp_affinity_gran == affinity_gran_core) { + if (__kmp_affinity_gran != KMP_HW_UNKNOWN) { + const char *name = __kmp_hw_get_keyword(__kmp_affinity_gran, true); if (num > 0) { - __kmp_str_buf_print(buffer, "='cores(%d)' \n", num); + __kmp_str_buf_print(buffer, "='%s(%d)'\n", name, num); } else { - __kmp_str_buf_print(buffer, "='cores'\n"); - } -#if KMP_USE_HWLOC - } else if (__kmp_affinity_gran == affinity_gran_tile) { - if (num > 0) { - __kmp_str_buf_print(buffer, "='tiles(%d)' \n", num); - } else { - __kmp_str_buf_print(buffer, "='tiles'\n"); - } -#endif - } else if (__kmp_affinity_gran == affinity_gran_package) { - if (num > 0) { - __kmp_str_buf_print(buffer, "='sockets(%d)'\n", num); - } else { - __kmp_str_buf_print(buffer, "='sockets'\n"); + __kmp_str_buf_print(buffer, "='%s'\n", name); } } else { __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); } @@ -3118,8 +3091,12 @@ break; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case affinity_top_method_x2apicid_1f: + value = "x2APIC id leaf 0x1f"; + break; + case affinity_top_method_x2apicid: - value = "x2APIC id"; + value = "x2APIC id leaf 0xb"; break; case affinity_top_method_apicid: @@ -4727,12 +4704,92 @@ // ----------------------------------------------------------------------------- // KMP_HW_SUBSET (was KMP_PLACE_THREADS) +// 2s16c,2t => 2S16C,2T => 2S16C \0 2T + +// Return KMP_HW_SUBSET preferred hardware type in case a token is ambiguously +// short. The original KMP_HW_SUBSET environment variable had single letters: +// s, c, t for sockets, cores, threads respectively. 
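+// For example, a bare "T" token is ambiguous between tile and thread; the tie +// is broken in favor of thread to match the original single-letter syntax.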
+static kmp_hw_t __kmp_hw_subset_break_tie(const kmp_hw_t *possible, + size_t num_possible) { + for (size_t i = 0; i < num_possible; ++i) { + if (possible[i] == KMP_HW_THREAD) + return KMP_HW_THREAD; + else if (possible[i] == KMP_HW_CORE) + return KMP_HW_CORE; + else if (possible[i] == KMP_HW_SOCKET) + return KMP_HW_SOCKET; + } + return KMP_HW_UNKNOWN; +} -// The longest observable sequence of items is -// Socket-Node-Tile-Core-Thread -// So, let's limit to 5 levels for now +// Return hardware type from string or HW_UNKNOWN if string cannot be parsed +// This algorithm is very forgiving to the user in that, the instant it can +// reduce the search space to one, it assumes that is the topology level the +// user wanted, even if it is misspelled later in the token. +static kmp_hw_t __kmp_stg_parse_hw_subset_name(char const *token) { + size_t index, num_possible, token_length; + kmp_hw_t possible[KMP_HW_LAST]; + const char *end; + + // Find the end of the hardware token string + end = token; + token_length = 0; + while (isalnum(*end) || *end == '_') { + token_length++; + end++; + } + + // Set the possibilities to all hardware types + num_possible = 0; + KMP_FOREACH_HW_TYPE(type) { possible[num_possible++] = type; } + + // Eliminate hardware types by comparing the front of the token + // with hardware names + // In most cases, the first letter in the token will indicate exactly + // which hardware type is parsed, e.g., 'C' = Core + index = 0; + while (num_possible > 1 && index < token_length) { + size_t n = num_possible; + char token_char = (char)toupper(token[index]); + for (size_t i = 0; i < n; ++i) { + const char *s; + kmp_hw_t type = possible[i]; + s = __kmp_hw_get_keyword(type, false); + if (index < KMP_STRLEN(s)) { + char c = (char)toupper(s[index]); + // Mark hardware types for removal when the characters do not match + if (c != token_char) { + possible[i] = KMP_HW_UNKNOWN; + num_possible--; + } + } + } + // Remove hardware types that this token cannot be + size_t start = 0; + for (size_t i = 0; i < n; ++i) { + if (possible[i] != KMP_HW_UNKNOWN) { + kmp_hw_t temp = possible[i]; + possible[i] = possible[start]; + possible[start] = temp; + start++; + } + } + KMP_ASSERT(start == num_possible); + index++; + } + + // Attempt to break a tie if user has very short token + // (e.g., is 'T' tile or thread?) + if (num_possible > 1) + return __kmp_hw_subset_break_tie(possible, num_possible); + if (num_possible == 1) + return possible[0]; + return KMP_HW_UNKNOWN; +} + +// The longest observable sequence of items can only be HW_LAST length // The input string is usually short enough, let's use 512 limit for now -#define MAX_T_LEVEL 5 +#define MAX_T_LEVEL KMP_HW_LAST #define MAX_STR_LEN 512 static void __kmp_stg_parse_hw_subset(char const *name, char const *value, void *data) { @@ -4751,12 +4808,13 @@ char input[MAX_STR_LEN]; size_t len = 0, mlen = MAX_STR_LEN; int level = 0; - // Canonize the string (remove spaces, unify delimiters, etc.) + bool absolute = false; + // Canonicalize the string (remove spaces, unify delimiters, etc.) 
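+  // e.g., " 1s, 5c @3, 2t " is canonicalized to "1S,5C@3,2T"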
char *pos = CCAST(char *, value); while (*pos && mlen) { if (*pos != ' ') { // skip spaces if (len == 0 && *pos == ':') { - __kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it + absolute = true; } else { input[len] = (char)(toupper(*pos)); if (input[len] == 'X') @@ -4769,10 +4827,10 @@ mlen--; pos++; } - if (len == 0 || mlen == 0) + if (len == 0 || mlen == 0) { goto err; // contents is either empty or too long + } input[len] = '\0'; - __kmp_hws_requested = 1; // mark that subset requested // Split by delimiter pos = input; components[level++] = pos; @@ -4782,145 +4840,68 @@ *pos = '\0'; // modify input and avoid more copying components[level++] = ++pos; // expect something after "," } + + __kmp_hw_subset = kmp_hw_subset_t::allocate(); + if (absolute) + __kmp_hw_subset->set_absolute(); + // Check each component for (int i = 0; i < level; ++i) { int offset = 0; int num = atoi(components[i]); // each component should start with a number + if (num <= 0) { + goto err; // only positive integers are valid for count + } if ((pos = strchr(components[i], '@'))) { offset = atoi(pos + 1); // save offset *pos = '\0'; // cut the offset from the component } pos = components[i] + strspn(components[i], digits); - if (pos == components[i]) + if (pos == components[i]) { goto err; + } // detect the component type - switch (*pos) { - case 'S': // Socket - if (__kmp_hws_socket.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_socket.num = num; - __kmp_hws_socket.offset = offset; - break; - case 'N': // NUMA Node - if (__kmp_hws_node.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_node.num = num; - __kmp_hws_node.offset = offset; - break; - case 'D': // Die - if (__kmp_hws_die.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_die.num = num; - __kmp_hws_die.offset = offset; - break; - case 'L': // Cache - if (*(pos + 1) == '2') { // L2 - Tile - if (__kmp_hws_tile.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_tile.num = num; - __kmp_hws_tile.offset = offset; - } else if (*(pos + 1) == '3') { // L3 - Socket - if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_socket.num = num; - __kmp_hws_socket.offset = offset; - } else if (*(pos + 1) == '1') { // L1 - Core - if (__kmp_hws_core.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_core.num = num; - __kmp_hws_core.offset = offset; - } - break; - case 'C': // Core (or Cache?) 
- if (*(pos + 1) != 'A') { - if (__kmp_hws_core.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_core.num = num; - __kmp_hws_core.offset = offset; - } else { // Cache - char *d = pos + strcspn(pos, digits); // find digit - if (*d == '2') { // L2 - Tile - if (__kmp_hws_tile.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_tile.num = num; - __kmp_hws_tile.offset = offset; - } else if (*d == '3') { // L3 - Socket - if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_socket.num = num; - __kmp_hws_socket.offset = offset; - } else if (*d == '1') { // L1 - Core - if (__kmp_hws_core.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_core.num = num; - __kmp_hws_core.offset = offset; - } else { - goto err; - } - } - break; - case 'T': // Thread - if (__kmp_hws_proc.num > 0) - goto err; // duplicate is not allowed - __kmp_hws_proc.num = num; - __kmp_hws_proc.offset = offset; - break; - default: + kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); + if (type == KMP_HW_UNKNOWN) { goto err; } + if (__kmp_hw_subset->specified(type)) { + goto err; + } + __kmp_hw_subset->push_back(num, type, offset); } return; err: KMP_WARNING(AffHWSubsetInvalid, name, value); - __kmp_hws_requested = 0; // mark that subset not requested + if (__kmp_hw_subset) { + kmp_hw_subset_t::deallocate(__kmp_hw_subset); + __kmp_hw_subset = nullptr; + } return; } static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, void *data) { - if (__kmp_hws_requested) { - int comma = 0; - kmp_str_buf_t buf; - __kmp_str_buf_init(&buf); - if (__kmp_env_format) - KMP_STR_BUF_PRINT_NAME_EX(name); - else - __kmp_str_buf_print(buffer, " %s='", name); - if (__kmp_hws_socket.num) { - __kmp_str_buf_print(&buf, "%ds", __kmp_hws_socket.num); - if (__kmp_hws_socket.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset); - comma = 1; - } - if (__kmp_hws_die.num) { - __kmp_str_buf_print(&buf, "%s%dd", comma ? "," : "", __kmp_hws_die.num); - if (__kmp_hws_die.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_die.offset); - comma = 1; - } - if (__kmp_hws_node.num) { - __kmp_str_buf_print(&buf, "%s%dn", comma ? "," : "", __kmp_hws_node.num); - if (__kmp_hws_node.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_node.offset); - comma = 1; - } - if (__kmp_hws_tile.num) { - __kmp_str_buf_print(&buf, "%s%dL2", comma ? "," : "", __kmp_hws_tile.num); - if (__kmp_hws_tile.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_tile.offset); - comma = 1; - } - if (__kmp_hws_core.num) { - __kmp_str_buf_print(&buf, "%s%dc", comma ? "," : "", __kmp_hws_core.num); - if (__kmp_hws_core.offset) - __kmp_str_buf_print(&buf, "@%d", __kmp_hws_core.offset); - comma = 1; - } - if (__kmp_hws_proc.num) - __kmp_str_buf_print(&buf, "%s%dt", comma ? "," : "", __kmp_hws_proc.num); - __kmp_str_buf_print(buffer, "%s'\n", buf.str); - __kmp_str_buf_free(&buf); + kmp_str_buf_t buf; + int depth; + if (!__kmp_hw_subset) + return; + __kmp_str_buf_init(&buf); + if (__kmp_env_format) + KMP_STR_BUF_PRINT_NAME_EX(name); + else + __kmp_str_buf_print(buffer, " %s='", name); + + depth = __kmp_hw_subset->get_depth(); + for (int i = 0; i < depth; ++i) { + const auto &item = __kmp_hw_subset->at(i); + __kmp_str_buf_print(&buf, "%s%d%s", (i > 0 ? 
"," : ""), item.num, + __kmp_hw_get_keyword(item.type)); + if (item.offset) + __kmp_str_buf_print(&buf, "@%d", item.offset); } + __kmp_str_buf_print(buffer, "%s'\n", buf.str); + __kmp_str_buf_free(&buf); } #if USE_ITT_BUILD @@ -5762,7 +5743,7 @@ // Reset the affinity flags to their default values, // in case this is called from kmp_set_defaults(). __kmp_affinity_type = affinity_default; - __kmp_affinity_gran = affinity_gran_default; + __kmp_affinity_gran = KMP_HW_UNKNOWN; __kmp_affinity_top_method = affinity_top_method_default; __kmp_affinity_respect_mask = affinity_respect_mask_default; } @@ -5772,7 +5753,7 @@ aff_str = __kmp_env_blk_var(&block, "OMP_PROC_BIND"); if (aff_str != NULL) { __kmp_affinity_type = affinity_default; - __kmp_affinity_gran = affinity_gran_default; + __kmp_affinity_gran = KMP_HW_UNKNOWN; __kmp_affinity_top_method = affinity_top_method_default; __kmp_affinity_respect_mask = affinity_respect_mask_default; } @@ -5844,12 +5825,19 @@ if (!TCR_4(__kmp_init_middle)) { #if KMP_USE_HWLOC // Force using hwloc when either tiles or numa nodes requested within - // KMP_HW_SUBSET and no other topology method is requested - if ((__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0 || - __kmp_affinity_gran == affinity_gran_tile) && - (__kmp_affinity_top_method == affinity_top_method_default)) { + // KMP_HW_SUBSET or granularity setting and no other topology method + // is requested + if (__kmp_hw_subset && + __kmp_affinity_top_method == affinity_top_method_default) + if (__kmp_hw_subset->specified(KMP_HW_NUMA) || + __kmp_hw_subset->specified(KMP_HW_TILE) || + __kmp_affinity_gran == KMP_HW_TILE || + __kmp_affinity_gran == KMP_HW_NUMA) + __kmp_affinity_top_method = affinity_top_method_hwloc; + // Force using hwloc when tiles or numa nodes requested for OMP_PLACES + if (__kmp_affinity_gran == KMP_HW_NUMA || + __kmp_affinity_gran == KMP_HW_TILE) __kmp_affinity_top_method = affinity_top_method_hwloc; - } #endif // Determine if the machine/OS is actually capable of supporting // affinity. 
@@ -5879,7 +5867,7 @@ } __kmp_affinity_type = affinity_disabled; __kmp_affinity_respect_mask = 0; - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } } @@ -5937,44 +5925,27 @@ __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; } if (__kmp_affinity_top_method == affinity_top_method_default) { - if (__kmp_affinity_gran == affinity_gran_default) { + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { __kmp_affinity_top_method = affinity_top_method_group; - __kmp_affinity_gran = affinity_gran_group; - } else if (__kmp_affinity_gran == affinity_gran_group) { + __kmp_affinity_gran = KMP_HW_PROC_GROUP; + } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { __kmp_affinity_top_method = affinity_top_method_group; } else { __kmp_affinity_top_method = affinity_top_method_all; } } else if (__kmp_affinity_top_method == affinity_top_method_group) { - if (__kmp_affinity_gran == affinity_gran_default) { - __kmp_affinity_gran = affinity_gran_group; - } else if ((__kmp_affinity_gran != affinity_gran_group) && - (__kmp_affinity_gran != affinity_gran_fine) && - (__kmp_affinity_gran != affinity_gran_thread)) { - const char *str = NULL; - switch (__kmp_affinity_gran) { - case affinity_gran_core: - str = "core"; - break; - case affinity_gran_package: - str = "package"; - break; - case affinity_gran_node: - str = "node"; - break; - case affinity_gran_tile: - str = "tile"; - break; - default: - KMP_DEBUG_ASSERT(0); - } + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { + __kmp_affinity_gran = KMP_HW_PROC_GROUP; + } else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) && + (__kmp_affinity_gran != KMP_HW_THREAD)) { + const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran); KMP_WARNING(AffGranTopGroup, var, str); - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } } else { - if (__kmp_affinity_gran == affinity_gran_default) { - __kmp_affinity_gran = affinity_gran_core; - } else if (__kmp_affinity_gran == affinity_gran_group) { + if (__kmp_affinity_gran == KMP_HW_UNKNOWN) { + __kmp_affinity_gran = KMP_HW_CORE; + } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) { const char *str = NULL; switch (__kmp_affinity_type) { case affinity_physical: @@ -5997,7 +5968,7 @@ KMP_DEBUG_ASSERT(0); } KMP_WARNING(AffGranGroupType, var, str); - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; } } } else @@ -6039,15 +6010,15 @@ __kmp_affinity_type = affinity_none; } } - if ((__kmp_affinity_gran == affinity_gran_default) && + if ((__kmp_affinity_gran == KMP_HW_UNKNOWN) && (__kmp_affinity_gran_levels < 0)) { #if KMP_MIC_SUPPORTED if (__kmp_mic_type != non_mic) { - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = KMP_HW_THREAD; } else #endif { - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = KMP_HW_CORE; } } if (__kmp_affinity_top_method == affinity_top_method_default) { diff --git a/openmp/runtime/test/affinity/kmp-affinity.c b/openmp/runtime/test/affinity/kmp-affinity.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/kmp-affinity.c @@ -0,0 +1,71 @@ +// RUN: %libomp-compile -D_GNU_SOURCE +// RUN: env KMP_AFFINITY=granularity=thread,compact %libomp-run +// RUN: env KMP_AFFINITY=granularity=core,compact %libomp-run +// RUN: env KMP_AFFINITY=granularity=socket,compact %libomp-run +// REQUIRES: linux + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "libomp_test_affinity.h" +#include "libomp_test_topology.h" + +// Compare place lists. Make sure every place in p1 is in p2. 
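+// Returns EXIT_SUCCESS when every place in p1 has an identical affinity mask +// in p2, and EXIT_FAILURE otherwise.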
+static int compare_places(const place_list_t *p1, const place_list_t *p2) { + int i, j; + for (i = 0; i < p1->num_places; ++i) { + int found = 0; + for (j = 0; j < p2->num_places; ++j) { + if (affinity_mask_equal(p1->masks[i], p2->masks[j])) { + found = 1; + break; + } + } + if (!found) { + printf("Found place in p1 not in p2!\n"); + printf("p1 places:\n"); + topology_print_places(p1); + printf("\n"); + printf("p2 places:\n"); + topology_print_places(p2); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +static int check_places() { + int status; + const char *value = getenv("KMP_AFFINITY"); + if (!value) { + fprintf(stderr, "error: must set KMP_AFFINITY environment variable for this test!\n"); + return EXIT_FAILURE; + } + place_list_t *places, *openmp_places; + if (strstr(value, "socket")) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET); + } else if (strstr(value, "core")) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_CORE); + } else if (strstr(value, "thread")) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD); + } else { + fprintf( + stderr, + "error: KMP_AFFINITY granularity must be one of thread,core,socket!\n"); + return EXIT_FAILURE; + } + openmp_places = topology_alloc_openmp_places(); + status = compare_places(openmp_places, places); + topology_free_places(places); + topology_free_places(openmp_places); + return status; +} + +int main() { + if (!topology_using_full_mask()) { + printf("Thread does not have access to all logical processors. Skipping " + "test.\n"); + return EXIT_SUCCESS; + } + return check_places(); +} diff --git a/openmp/runtime/test/affinity/kmp-hw-subset.c b/openmp/runtime/test/affinity/kmp-hw-subset.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/kmp-hw-subset.c @@ -0,0 +1,127 @@ +// RUN: %libomp-compile -D_GNU_SOURCE +// RUN: env OMP_PLACES=threads %libomp-run +// RUN: env OMP_PLACES=cores %libomp-run +// RUN: env OMP_PLACES=sockets %libomp-run +// REQUIRES: linux + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "libomp_test_affinity.h" +#include "libomp_test_topology.h" + +// Check the OpenMP place list to make sure it follows the KMP_HW_SUBSET restriction +static int compare_hw_subset_places(const place_list_t *openmp_places, + topology_obj_type_t type, int nsockets, + int ncores_per_socket, + int nthreads_per_core) { + int i, j, expected_total, expected_per_place; + if (type == TOPOLOGY_OBJ_THREAD) { + expected_total = nsockets * ncores_per_socket * nthreads_per_core; + expected_per_place = 1; + } else if (type == TOPOLOGY_OBJ_CORE) { + expected_total = nsockets * ncores_per_socket; + expected_per_place = nthreads_per_core; + } else { + expected_total = nsockets; + expected_per_place = ncores_per_socket; + } + if (openmp_places->num_places != expected_total) { + fprintf(stderr, "error: KMP_HW_SUBSET did not halve each resource layer!\n"); + printf("openmp_places places:\n"); + topology_print_places(openmp_places); + printf("\n"); + return EXIT_FAILURE; + } + for (i = 0; i < openmp_places->num_places; ++i) { + int count = affinity_mask_count(openmp_places->masks[i]); + if (count != expected_per_place) { + fprintf(stderr, "error: place %d has %d OS procs instead of %d\n", i, + count, expected_per_place); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +static int check_places() { + char buf[100]; + topology_obj_type_t type; + const char *value; + int status = EXIT_SUCCESS; + place_list_t *threads, *cores, *sockets, *openmp_places; + threads = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD); + cores = 
topology_alloc_type_places(TOPOLOGY_OBJ_CORE); + sockets = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET); + + if (threads->num_places <= 1) { + printf("Only one hardware thread to execute on. Skipping test.\n"); + return status; + } + + value = getenv("OMP_PLACES"); + if (!value) { + fprintf(stderr, + "error: OMP_PLACES must be set to one of threads,cores,sockets!\n"); + return EXIT_FAILURE; + } + if (strcmp(value, "threads") == 0) + type = TOPOLOGY_OBJ_THREAD; + else if (strcmp(value, "cores") == 0) + type = TOPOLOGY_OBJ_CORE; + else if (strcmp(value, "sockets") == 0) + type = TOPOLOGY_OBJ_SOCKET; + else { + fprintf(stderr, + "error: OMP_PLACES must be one of threads,cores,sockets!\n"); + return EXIT_FAILURE; + } + + // Calculate the number of threads per core, cores per socket, and sockets + if (cores->num_places <= 0) { + printf("Invalid number of cores (%d). Skipping test.\n", cores->num_places); + return status; + } else if (sockets->num_places <= 0) { + printf("Invalid number of sockets (%d). Skipping test.\n", + sockets->num_places); + return status; + } + int nthreads_per_core = threads->num_places / cores->num_places; + int ncores_per_socket = cores->num_places / sockets->num_places; + int nsockets = sockets->num_places; + + if (nsockets * ncores_per_socket * nthreads_per_core != threads->num_places) { + printf("Only uniform topologies can be tested. Skipping test.\n"); + return status; + } + + // Use half the resources of every level + if (nthreads_per_core > 1) + nthreads_per_core /= 2; + if (ncores_per_socket > 1) + ncores_per_socket /= 2; + if (nsockets > 1) + nsockets /= 2; + + snprintf(buf, sizeof(buf), "%ds,%dc,%dt", nsockets, ncores_per_socket, + nthreads_per_core); + setenv("KMP_HW_SUBSET", buf, 1); + + openmp_places = topology_alloc_openmp_places(); + status = compare_hw_subset_places(openmp_places, type, nsockets, + ncores_per_socket, nthreads_per_core); + topology_free_places(threads); + topology_free_places(cores); + topology_free_places(sockets); + topology_free_places(openmp_places); + return status; +} + +int main() { + if (!topology_using_full_mask()) { + printf("Thread does not have access to all logical processors. Skipping " + "test.\n"); + return EXIT_SUCCESS; + } + return check_places(); +} diff --git a/openmp/runtime/test/affinity/libomp_test_topology.h b/openmp/runtime/test/affinity/libomp_test_topology.h new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/libomp_test_topology.h @@ -0,0 +1,231 @@ +#ifndef LIBOMP_TEST_TOPOLOGY_H +#define LIBOMP_TEST_TOPOLOGY_H + +#include "libomp_test_affinity.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <dirent.h> +#include <omp.h> + +typedef enum topology_obj_type_t { + TOPOLOGY_OBJ_THREAD, + TOPOLOGY_OBJ_CORE, + TOPOLOGY_OBJ_SOCKET, + TOPOLOGY_OBJ_MAX +} topology_obj_type_t; + +typedef struct place_list_t { + int num_places; + affinity_mask_t **masks; +} place_list_t; + +// Return the first character in file 'f' that is not a whitespace character +// including newlines and carriage returns +static int get_first_nonspace_from_file(FILE *f) { + int c; + do { + c = fgetc(f); + } while (c != EOF && (isspace(c) || c == '\n' || c == '\r')); + return c; +} + +// Read an integer from file 'f' into 'number' +// Return 1 on successful read of integer, +// 0 on unsuccessful read of integer, +// EOF on end of file.
+static int get_integer_from_file(FILE *f, int *number) { + int n; + n = fscanf(f, "%d", number); + if (feof(f)) + return EOF; + if (n != 1) + return 0; + return 1; +} + +// Read a siblings list file from Linux /sys/devices/system/cpu/cpu?/topology/* +static affinity_mask_t *topology_get_mask_from_file(const char *filename) { + int status = EXIT_SUCCESS; + FILE *f = fopen(filename, "r"); + if (!f) { + perror(filename); + exit(EXIT_FAILURE); + } + affinity_mask_t *mask = affinity_mask_alloc(); + while (1) { + int c, i, n, lower, upper; + // Read the first integer + n = get_integer_from_file(f, &lower); + if (n == EOF) { + break; + } else if (n == 0) { + fprintf(stderr, "syntax error: expected integer\n"); + status = EXIT_FAILURE; + break; + } + + // Now either a , or - + c = get_first_nonspace_from_file(f); + if (c == EOF || c == ',') { + affinity_mask_set(mask, lower); + if (c == EOF) + break; + } else if (c == '-') { + n = get_integer_from_file(f, &upper); + if (n == EOF || n == 0) { + fprintf(stderr, "syntax error: expected integer\n"); + status = EXIT_FAILURE; + break; + } + for (i = lower; i <= upper; ++i) + affinity_mask_set(mask, i); + c = get_first_nonspace_from_file(f); + if (c == EOF) { + break; + } else if (c == ',') { + continue; + } else { + fprintf(stderr, "syntax error: unexpected character: '%c (%d)'\n", c, + c); + status = EXIT_FAILURE; + break; + } + } else { + fprintf(stderr, "syntax error: unexpected character: '%c (%d)'\n", c, c); + status = EXIT_FAILURE; + break; + } + } + fclose(f); + if (status == EXIT_FAILURE) { + affinity_mask_free(mask); + mask = NULL; + } + return mask; +} + +static int topology_get_num_cpus() { + char buf[1024]; + // Count the number of cpus + int cpu = 0; + while (1) { + snprintf(buf, sizeof(buf), "/sys/devices/system/cpu/cpu%d", cpu); + DIR *dir = opendir(buf); + if (dir) { + closedir(dir); + cpu++; + } else { + break; + } + } + if (cpu == 0) + cpu = 1; + return cpu; +} + +// Return whether the current thread has access to all logical processors +static int topology_using_full_mask() { + int cpu; + int has_all = 1; + int num_cpus = topology_get_num_cpus(); + affinity_mask_t *mask = affinity_mask_alloc(); + get_thread_affinity(mask); + for (cpu = 0; cpu < num_cpus; ++cpu) { + if (!affinity_mask_isset(mask, cpu)) { + has_all = 0; + break; + } + } + affinity_mask_free(mask); + return has_all; +} + +// Return array of masks representing OMP_PLACES keyword (e.g., sockets, cores, +// threads) +static place_list_t *topology_alloc_type_places(topology_obj_type_t type) { + char buf[1024]; + int i, cpu, num_places, num_unique; + int num_cpus = topology_get_num_cpus(); + place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t)); + affinity_mask_t **masks = + (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_cpus); + num_unique = 0; + for (cpu = 0; cpu < num_cpus; ++cpu) { + affinity_mask_t *mask; + if (type == TOPOLOGY_OBJ_CORE) { + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", + cpu); + mask = topology_get_mask_from_file(buf); + } else if (type == TOPOLOGY_OBJ_SOCKET) { + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", + cpu); + mask = topology_get_mask_from_file(buf); + } else if (type == TOPOLOGY_OBJ_THREAD) { + mask = affinity_mask_alloc(); + affinity_mask_set(mask, cpu); + } else { + fprintf(stderr, "Unknown topology type (%d)\n", (int)type); + exit(EXIT_FAILURE); + } + // Check for unique topology objects above the thread level + if (type != 
TOPOLOGY_OBJ_THREAD) { + for (i = 0; i < num_unique; ++i) { + if (affinity_mask_equal(masks[i], mask)) { + affinity_mask_free(mask); + mask = NULL; + break; + } + } + } + if (mask) + masks[num_unique++] = mask; + } + places->num_places = num_unique; + places->masks = masks; + return places; +} + +static place_list_t *topology_alloc_openmp_places() { + int place, i; + int num_places = omp_get_num_places(); + place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t)); + affinity_mask_t **masks = + (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_places); + for (place = 0; place < num_places; ++place) { + int num_procs = omp_get_place_num_procs(place); + int *ids = (int *)malloc(sizeof(int) * num_procs); + omp_get_place_proc_ids(place, ids); + affinity_mask_t *mask = affinity_mask_alloc(); + for (i = 0; i < num_procs; ++i) + affinity_mask_set(mask, ids[i]); + masks[place] = mask; + } + places->num_places = num_places; + places->masks = masks; + return places; +} + +// Free the array of masks from one of: topology_alloc_type_places() +// or topology_alloc_openmp_places() +static void topology_free_places(place_list_t *places) { + int i; + for (i = 0; i < places->num_places; ++i) + affinity_mask_free(places->masks[i]); + free(places->masks); + free(places); +} + +static void topology_print_places(const place_list_t *p) { + int i; + char buf[1024]; + for (i = 0; i < p->num_places; ++i) { + affinity_mask_snprintf(buf, sizeof(buf), p->masks[i]); + printf("Place %d: %s\n", i, buf); + } +} + +#endif diff --git a/openmp/runtime/test/affinity/omp-places.c b/openmp/runtime/test/affinity/omp-places.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/affinity/omp-places.c @@ -0,0 +1,83 @@ +// RUN: %libomp-compile -D_GNU_SOURCE +// RUN: env OMP_PLACES=threads %libomp-run +// RUN: env OMP_PLACES=cores %libomp-run +// RUN: env OMP_PLACES=sockets %libomp-run +// REQUIRES: linux + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "libomp_test_affinity.h" +#include "libomp_test_topology.h" + +// Compare place lists. The order is not taken into consideration here. +// The OS detection might have the cores/sockets in a different +// order from the runtime. +static int compare_places(const place_list_t *p1, const place_list_t *p2) { + int i, j; + if (p1->num_places != p2->num_places) { + fprintf(stderr, "error: place lists do not have the same number of places! 
(p1 has " + "%d, p2 has %d)\n", + p1->num_places, p2->num_places); + printf("p1 places:\n"); + topology_print_places(p1); + printf("\n"); + printf("p2 places:\n"); + topology_print_places(p2); + return EXIT_FAILURE; + } + for (i = 0; i < p1->num_places; ++i) { + int found = 0; + for (j = 0; j < p2->num_places; ++j) { + if (affinity_mask_equal(p1->masks[i], p2->masks[j])) { + found = 1; + break; + } + } + if (!found) { + printf("Found difference in places!\n"); + printf("p1 places:\n"); + topology_print_places(p1); + printf("\n"); + printf("p2 places:\n"); + topology_print_places(p2); + return EXIT_FAILURE; + } + } + return EXIT_SUCCESS; +} + +static int check_places() { + int status; + const char *value = getenv("OMP_PLACES"); + if (!value) { + fprintf(stderr, "error: must set the OMP_PLACES environment variable for this test!\n"); + return EXIT_FAILURE; + } + place_list_t *places, *openmp_places; + if (strcmp(value, "sockets") == 0) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_SOCKET); + } else if (strcmp(value, "cores") == 0) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_CORE); + } else if (strcmp(value, "threads") == 0) { + places = topology_alloc_type_places(TOPOLOGY_OBJ_THREAD); + } else { + fprintf(stderr, + "error: OMP_PLACES must be one of threads,cores,sockets!\n"); + return EXIT_FAILURE; + } + openmp_places = topology_alloc_openmp_places(); + status = compare_places(places, openmp_places); + topology_free_places(places); + topology_free_places(openmp_places); + return status; +} + +int main() { + if (!topology_using_full_mask()) { + printf("Thread does not have access to all logical processors. Skipping " + "test.\n"); + return EXIT_SUCCESS; + } + return check_places(); +}
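For reference, a minimal standalone sketch (not part of the patch) of how the place list produced by the new granularity keywords can be inspected through the standard OpenMP places API. The file name and the 256-proc-per-place cap are assumptions made for this example; the environment settings mirror the RUN lines in the tests above, and only routines from <omp.h> are used.

// print-places.c (hypothetical name): build with a libomp-enabled compiler, e.g.
//   clang -fopenmp print-places.c -o print-places
// and run with either of:
//   env OMP_PLACES=cores ./print-places
//   env KMP_AFFINITY=granularity=core,compact ./print-places
#include <omp.h>
#include <stdio.h>

int main(void) {
  int place, i;
  int num_places = omp_get_num_places(); // one place per granularity unit
  for (place = 0; place < num_places; ++place) {
    int ids[256]; // assumes at most 256 OS procs per place for this sketch
    int num_procs = omp_get_place_num_procs(place);
    if (num_procs > 256)
      num_procs = 256;
    omp_get_place_proc_ids(place, ids);
    printf("place %d:", place);
    for (i = 0; i < num_procs; ++i)
      printf(" %d", ids[i]);
    printf("\n");
  }
  return 0;
}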