Index: runtime/src/kmp.h
===================================================================
--- runtime/src/kmp.h
+++ runtime/src/kmp.h
@@ -1444,6 +1444,8 @@
 /* keeps tracked of threadprivate cache allocations for cleanup later */
 typedef struct kmp_cached_addr {
   void **addr; /* address of allocated cache */
+  void ***compiler_cache; /* pointer to compiler's cache */
+  void *data; /* pointer to global data */
   struct kmp_cached_addr *next; /* pointer to next cached address */
 } kmp_cached_addr_t;
 
@@ -3774,6 +3776,8 @@
 struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
                                                 void *data_addr,
                                                 size_t pc_size);
+void __kmp_threadprivate_resize_cache(int newCapacity);
+void __kmp_cleanup_threadprivate_caches();
 
 // ompc_, kmpc_ entries moved from omp.h.
 #if KMP_OS_WINDOWS
Index: runtime/src/kmp_runtime.cpp
===================================================================
--- runtime/src/kmp_runtime.cpp
+++ runtime/src/kmp_runtime.cpp
@@ -3508,8 +3508,14 @@
    If any argument is negative, the behavior is undefined. */
 static int __kmp_expand_threads(int nNeed) {
   int added = 0;
-  int old_tp_cached;
-  int __kmp_actual_max_nth;
+  int minimumRequiredCapacity;
+  int newCapacity;
+  kmp_info_t **newThreads;
+  kmp_root_t **newRoot;
+
+// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
+// resizing __kmp_threads does not need additional protection if foreign
+// threads are present
 
 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
   /* only for Windows static library */
@@ -3525,91 +3531,64 @@
   if (nNeed <= 0)
     return added;
 
-  while (1) {
-    int nTarget;
-    int minimumRequiredCapacity;
-    int newCapacity;
-    kmp_info_t **newThreads;
-    kmp_root_t **newRoot;
-
-    // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
-    // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
-    // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
-    // > __kmp_max_nth in one of two ways:
-    //
-    // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
-    // may not be resused by another thread, so we may need to increase
-    // __kmp_threads_capacity to __kmp_max_nth + 1.
-    //
-    // 2) New foreign root(s) are encountered. We always register new foreign
-    // roots. This may cause a smaller # of threads to be allocated at
-    // subsequent parallel regions, but the worker threads hang around (and
-    // eventually go to sleep) and need slots in the __kmp_threads[] array.
-    //
-    // Anyway, that is the reason for moving the check to see if
-    // __kmp_max_nth was exceeded into __kmp_reserve_threads()
-    // instead of having it performed here. -BB
-    old_tp_cached = __kmp_tp_cached;
-    __kmp_actual_max_nth =
-        old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
-    KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
-
-    /* compute expansion headroom to check if we can expand */
-    nTarget = nNeed;
-    if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
-      /* possible expansion too small -- give up */
-      break;
-    }
-    minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
+  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
+  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
+  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
+  // > __kmp_max_nth in one of two ways:
+  //
+  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
+  // may not be reused by another thread, so we may need to increase
+  // __kmp_threads_capacity to __kmp_max_nth + 1.
+  //
+  // 2) New foreign root(s) are encountered. We always register new foreign
+  // roots. This may cause a smaller # of threads to be allocated at
+  // subsequent parallel regions, but the worker threads hang around (and
+  // eventually go to sleep) and need slots in the __kmp_threads[] array.
+  //
+  // Anyway, that is the reason for moving the check to see if
+  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
+  // instead of having it performed here. -BB
+
+  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
+
+  /* compute expansion headroom to check if we can expand */
+  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
+    /* possible expansion too small -- give up */
+    return added;
+  }
+  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
 
-    newCapacity = __kmp_threads_capacity;
-    do {
-      newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
-                        ? (newCapacity << 1)
-                        : __kmp_actual_max_nth;
-    } while (newCapacity < minimumRequiredCapacity);
-    newThreads = (kmp_info_t **)__kmp_allocate(
-        (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
-        CACHE_LINE);
-    newRoot = (kmp_root_t **)((char *)newThreads +
-                              sizeof(kmp_info_t *) * newCapacity);
-    KMP_MEMCPY(newThreads, __kmp_threads,
-               __kmp_threads_capacity * sizeof(kmp_info_t *));
-    KMP_MEMCPY(newRoot, __kmp_root,
-               __kmp_threads_capacity * sizeof(kmp_root_t *));
-    memset(newThreads + __kmp_threads_capacity, 0,
-           (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
-    memset(newRoot + __kmp_threads_capacity, 0,
-           (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
-
-    if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
-      /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
-         allocated a threadprivate cache while we were allocating the expanded
-         array, and our new capacity is larger than the threadprivate cache
-         capacity, so we should deallocate the expanded arrays and try again.
-         This is the first check of a double-check pair. */
-      __kmp_free(newThreads);
-      continue; /* start over and try again */
-    }
+  newCapacity = __kmp_threads_capacity;
+  do {
+    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
+                                                          : __kmp_sys_max_nth;
+  } while (newCapacity < minimumRequiredCapacity);
+  newThreads = (kmp_info_t **)__kmp_allocate(
+      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
+  newRoot =
+      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
+  KMP_MEMCPY(newThreads, __kmp_threads,
+             __kmp_threads_capacity * sizeof(kmp_info_t *));
+  KMP_MEMCPY(newRoot, __kmp_root,
+             __kmp_threads_capacity * sizeof(kmp_root_t *));
+
+  kmp_info_t **temp_threads = __kmp_threads;
+  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
+  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
+  __kmp_free(temp_threads);
+  added += newCapacity - __kmp_threads_capacity;
+  *(volatile int *)&__kmp_threads_capacity = newCapacity;
+
+  if (newCapacity > __kmp_tp_capacity) {
     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
-    if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
-      /* Same check as above, but this time with the lock so we can be sure if
-         we can succeed. */
-      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
-      __kmp_free(newThreads);
-      continue; /* start over and try again */
-    } else {
-      /* success */
-      // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
-      // investigated.
-      *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
-      *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
-      added += newCapacity - __kmp_threads_capacity;
-      *(volatile int *)&__kmp_threads_capacity = newCapacity;
-      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
-      break; /* succeeded, so we can exit the loop */
+    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
+      __kmp_threadprivate_resize_cache(newCapacity);
+    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
+      *(volatile int *)&__kmp_tp_capacity = newCapacity;
     }
+    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
   }
+  return added;
 }
 
@@ -7335,6 +7314,8 @@
     __kmp_init_serial = FALSE;
   }
 
+  __kmp_cleanup_threadprivate_caches();
+
   for (f = 0; f < __kmp_threads_capacity; f++) {
     if (__kmp_root[f] != NULL) {
       __kmp_free(__kmp_root[f]);
Index: runtime/src/kmp_threadprivate.cpp
===================================================================
--- runtime/src/kmp_threadprivate.cpp
+++ runtime/src/kmp_threadprivate.cpp
@@ -594,6 +594,13 @@
   return ret;
 }
 
+static kmp_cached_addr_t *__kmp_find_cache(void *data) {
+  kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+  while (ptr && ptr->data != data)
+    ptr = ptr->next;
+  return ptr;
+}
+
 /*!
 @ingroup THREADPRIVATE
 @param loc source location information
@@ -620,35 +627,40 @@
   if (TCR_PTR(*cache) == 0) {
     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
-    __kmp_tp_cached = 1;
-    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+    // Compiler often passes in NULL cache, even if it's already been created
     void **my_cache;
-    KMP_ITT_IGNORE(
-        my_cache = (void **)__kmp_allocate(
-            sizeof(void *) * __kmp_tp_capacity + sizeof(kmp_cached_addr_t)););
-    // No need to zero the allocated memory; __kmp_allocate does that.
-    KC_TRACE(
-        50,
-        ("__kmpc_threadprivate_cached: T#%d allocated cache at address %p\n",
-         global_tid, my_cache));
-
-    /* TODO: free all this memory in __kmp_common_destroy using
-     * __kmp_threadpriv_cache_list */
-    /* Add address of mycache to linked list for cleanup later */
     kmp_cached_addr_t *tp_cache_addr;
-
-    tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity];
-    tp_cache_addr->addr = my_cache;
-    tp_cache_addr->next = __kmp_threadpriv_cache_list;
-    __kmp_threadpriv_cache_list = tp_cache_addr;
-
+    // Look for an existing cache
+    tp_cache_addr = __kmp_find_cache(data);
+    if (!tp_cache_addr) { // Cache was never created; do it now
+      __kmp_tp_cached = 1;
+      KMP_ITT_IGNORE(my_cache = (void **)__kmp_allocate(
+                         sizeof(void *) * __kmp_tp_capacity +
+                         sizeof(kmp_cached_addr_t)););
+      // No need to zero the allocated memory; __kmp_allocate does that.
+      KC_TRACE(50, ("__kmpc_threadprivate_cached: T#%d allocated cache at "
+                    "address %p\n",
+                    global_tid, my_cache));
+      /* TODO: free all this memory in __kmp_common_destroy using
+       * __kmp_threadpriv_cache_list */
+      /* Add address of mycache to linked list for cleanup later */
+      tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity];
+      tp_cache_addr->addr = my_cache;
+      tp_cache_addr->data = data;
+      tp_cache_addr->compiler_cache = cache;
+      tp_cache_addr->next = __kmp_threadpriv_cache_list;
+      __kmp_threadpriv_cache_list = tp_cache_addr;
+    } else { // A cache was already created; use it
+      my_cache = tp_cache_addr->addr;
+      tp_cache_addr->compiler_cache = cache;
+    }
     KMP_MB();
 
     TCW_PTR(*cache, my_cache);
+    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
 
     KMP_MB();
   }
-
   __kmp_release_lock(&__kmp_global_lock, global_tid);
 }
@@ -661,10 +673,66 @@
   KC_TRACE(10,
            ("__kmpc_threadprivate_cached: T#%d exiting; return value = %p\n",
             global_tid, ret));
-
   return ret;
 }
 
+// This function should only be called when both __kmp_tp_cached_lock and
+// kmp_forkjoin_lock are held.
+void __kmp_threadprivate_resize_cache(int newCapacity) {
+  KC_TRACE(10, ("__kmp_threadprivate_resize_cache: called with size: %d\n",
+                newCapacity));
+
+  kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+
+  while (ptr) {
+    if (ptr->data) { // this location has an active cache; resize it
+      void **my_cache;
+      KMP_ITT_IGNORE(my_cache =
+                         (void **)__kmp_allocate(sizeof(void *) * newCapacity +
+                                                 sizeof(kmp_cached_addr_t)););
+      // No need to zero the allocated memory; __kmp_allocate does that.
+      KC_TRACE(50, ("__kmp_threadprivate_resize_cache: allocated cache at %p\n",
+                    my_cache));
+      // Now copy old cache into new cache
+      void **old_cache = ptr->addr;
+      for (int i = 0; i < __kmp_tp_capacity; ++i) {
+        my_cache[i] = old_cache[i];
+      }
+
+      // Add address of new my_cache to linked list for cleanup later
+      kmp_cached_addr_t *tp_cache_addr;
+      tp_cache_addr = (kmp_cached_addr_t *)&my_cache[newCapacity];
+      tp_cache_addr->addr = my_cache;
+      tp_cache_addr->data = ptr->data;
+      tp_cache_addr->compiler_cache = ptr->compiler_cache;
+      tp_cache_addr->next = __kmp_threadpriv_cache_list;
+      __kmp_threadpriv_cache_list = tp_cache_addr;
+
+      // Copy new cache to compiler's location:
+      // We can only copy directly to (*compiler_cache) after the compiler
+      // guarantees it will keep using the same location for the cache.
+      // Assuming that change makes it into the 19.0 compiler, we can switch to
+      // the following line when 18.0 is no longer supported:
+      // TCW_PTR(*(tp_cache_addr->compiler_cache), my_cache);
+      // Meanwhile, check if compiler_cache is still pointing at the old cache,
+      // and if so, point it at the new cache with an atomic compare&swap
+      // operation.
+      (void)KMP_COMPARE_AND_STORE_PTR(tp_cache_addr->compiler_cache, old_cache,
+                                      my_cache);
+      // If the store doesn't happen here, the compiler's old behavior will
+      // inevitably call __kmpc_threadprivate_cached with a new location for
+      // the cache, and that function will store the resized cache there at
+      // that point.
+
+      // Nullify old cache's data pointer so we skip it next time
+      ptr->data = NULL;
+    }
+    ptr = ptr->next;
+  }
+  // After all caches are resized, update __kmp_tp_capacity to the new size
+  *(volatile int *)&__kmp_tp_capacity = newCapacity;
+}
+
 /*!
 @ingroup THREADPRIVATE
 @param loc source location information
@@ -701,14 +769,30 @@
     d_tn->dt.dtorv = dtor;
     d_tn->is_vec = TRUE;
     d_tn->vec_len = (size_t)vector_length;
-    /*
-            d_tn->obj_init = 0;  // AC: commented out because __kmp_allocate
-       zeroes the memory
-            d_tn->pod_init = 0;
-    */
+    // d_tn->obj_init = 0; // AC: __kmp_allocate zeroes the memory
+    // d_tn->pod_init = 0;
     lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(data)]);
 
     d_tn->next = *lnk_tn;
     *lnk_tn = d_tn;
   }
 }
+
+void __kmp_cleanup_threadprivate_caches() {
+  kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+
+  while (ptr) {
+    void **cache = ptr->addr;
+    __kmp_threadpriv_cache_list = ptr->next;
+    if (*ptr->compiler_cache)
+      *ptr->compiler_cache = NULL;
+    ptr->compiler_cache = NULL;
+    ptr->data = NULL;
+    ptr->addr = NULL;
+    ptr->next = NULL;
+    // Threadprivate data pointed at by cache entries are destroyed at end of
+    // __kmp_launch_thread with __kmp_common_destroy_gtid.
+    __kmp_free(cache); // implicitly frees ptr too
+    ptr = __kmp_threadpriv_cache_list;
+  }
+}
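
Note: the growth policy in __kmp_expand_threads and the hand-off of a resized cache to the compiler-visible cache pointer in __kmp_threadprivate_resize_cache can be exercised in isolation. The sketch below is illustrative only and not part of the patch; grow_capacity, publish_cache, kmp_sys_max_nth_sim, and the std::atomic stand-in for KMP_COMPARE_AND_STORE_PTR are names invented for the example, not runtime symbols.

    #include <atomic>
    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for __kmp_sys_max_nth.
    static const int kmp_sys_max_nth_sim = 1 << 16;

    // Mirrors the doubling loop in __kmp_expand_threads: grow the capacity by
    // powers of two until the minimum requirement is met, clamped at the
    // system maximum.
    static int grow_capacity(int capacity, int minimumRequiredCapacity) {
      do {
        capacity = capacity <= (kmp_sys_max_nth_sim >> 1) ? (capacity << 1)
                                                          : kmp_sys_max_nth_sim;
      } while (capacity < minimumRequiredCapacity);
      return capacity;
    }

    // Mirrors the hand-off in __kmp_threadprivate_resize_cache: publish the
    // resized cache only if the compiler-visible slot still holds the old one;
    // otherwise leave it alone and let the next cache-creation call reseed it.
    static void publish_cache(std::atomic<void **> &compiler_cache,
                              void **old_cache, void **new_cache) {
      void **expected = old_cache;
      compiler_cache.compare_exchange_strong(expected, new_cache);
    }

    int main() {
      int capacity = 32;
      capacity = grow_capacity(capacity, capacity + 5); // 32 -> 64
      std::printf("new capacity: %d\n", capacity);

      std::vector<void *> old_cache(32, nullptr), new_cache(64, nullptr);
      std::atomic<void **> compiler_cache(old_cache.data());
      publish_cache(compiler_cache, old_cache.data(), new_cache.data());
      assert(compiler_cache.load() == new_cache.data());
      return 0;
    }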