Index: openmp/runtime/src/dllexports =================================================================== --- openmp/runtime/src/dllexports +++ openmp/runtime/src/dllexports @@ -390,6 +390,8 @@ __kmpc_taskred_init 277 __kmpc_taskred_modifier_init 278 __kmpc_omp_target_task_alloc 279 + __kmpc_masked 282 + __kmpc_end_masked 283 %endif # User API entry points that have both lower- and upper- case versions for Fortran. Index: openmp/runtime/src/i18n/en_US.txt =================================================================== --- openmp/runtime/src/i18n/en_US.txt +++ openmp/runtime/src/i18n/en_US.txt @@ -138,7 +138,7 @@ Hint "OMP: Hint %1$s\n" Pragma "%1$s pragma (at %2$s:%3$s():%4$s)" - # %1 is pragma name (like "parallel" or "master", + # %1 is pragma name (like "parallel" or "masked", # %2 is file name, # %3 is function (routine) name, # %4 is the line number (as string, so "s" type specifier should be used). Index: openmp/runtime/src/kmp.h =================================================================== --- openmp/runtime/src/kmp.h +++ openmp/runtime/src/kmp.h @@ -845,7 +845,7 @@ typedef enum kmp_proc_bind_t { proc_bind_false = 0, proc_bind_true, - proc_bind_master, + proc_bind_primary, proc_bind_close, proc_bind_spread, proc_bind_intel, // use KMP_AFFINITY interface @@ -1451,7 +1451,8 @@ ct_ordered_in_pdo, ct_master, ct_reduce, - ct_barrier + ct_barrier, + ct_masked }; #define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered) @@ -1599,7 +1600,7 @@ struct private_common *next; struct private_common *link; void *gbl_addr; - void *par_addr; /* par_addr == gbl_addr for MASTER thread */ + void *par_addr; /* par_addr == gbl_addr for PRIMARY thread */ size_t cmn_size; }; @@ -1998,9 +1999,9 @@ kmp_uint64 b_arrived; /* STATE => task reached synch point. */ #if USE_DEBUGGER // The following two fields are indended for the debugger solely. Only - // master of the team accesses these fields: the first one is increased by - // 1 when master arrives to a barrier, the second one is increased by one - // when all the threads arrived. + // primary thread of the team accesses these fields: the first one is + // increased by 1 when the primary thread arrives to a barrier, the second + // one is increased by one when all the threads arrived. kmp_uint b_master_arrived; kmp_uint b_team_arrived; #endif @@ -2530,7 +2531,7 @@ // This struct stores a thread that acts as a "root" for a contention // group. Contention groups are rooted at kmp_root threads, but also at -// each master thread of each team created in the teams construct. +// each primary thread of each team created in the teams construct. // This struct therefore also stores a thread_limit associated with // that contention group, and a counter to track the number of threads // active in that contention group. Each thread has a list of these: CG @@ -2542,7 +2543,7 @@ typedef struct kmp_cg_root { kmp_info_p *cg_root; // "root" thread for a contention group // The CG root's limit comes from OMP_THREAD_LIMIT for root threads, or - // thread_limit clause for teams masters + // thread_limit clause for teams primary threads kmp_int32 cg_thread_limit; kmp_int32 cg_nthreads; // Count of active threads in CG rooted at cg_root struct kmp_cg_root *up; // pointer to higher level CG root in list @@ -2552,8 +2553,9 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { /* Start with the readonly data which is cache aligned and padded. This is - written before the thread starts working by the master. Uber masters may - update themselves later. 
Usage does not consider serialized regions. */ + written before the thread starts working by the primary thread. Uber + masters may update themselves later. Usage does not consider serialized + regions. */ kmp_desc_t th_info; kmp_team_p *th_team; /* team we belong to */ kmp_root_p *th_root; /* pointer to root of task hierarchy */ @@ -2564,7 +2566,7 @@ /* The following are cached from the team info structure */ /* TODO use these in more places as determined to be needed via profiling */ int th_team_nproc; /* number of threads in a team */ - kmp_info_p *th_team_master; /* the team's master thread */ + kmp_info_p *th_team_master; /* the team's primary thread */ int th_team_serialized; /* team is serialized */ microtask_t th_teams_microtask; /* save entry address for teams construct */ int th_teams_level; /* save initial level of teams construct */ @@ -2585,7 +2587,7 @@ kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */ #endif omp_allocator_handle_t th_def_allocator; /* default allocator */ - /* The data set by the master at reinit, then R/W by the worker */ + /* The data set by the primary thread at reinit, then R/W by the worker */ KMP_ALIGN_CACHE int th_set_nproc; /* if > 0, then only use this request for the next fork */ #if KMP_NESTED_HOT_TEAMS @@ -2621,7 +2623,7 @@ ompt_thread_info_t ompt_thread_info; #endif - /* The following are also read by the master during reinit */ + /* The following are also read by the primary thread during reinit */ struct common_table *th_pri_common; volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */ @@ -2719,7 +2721,7 @@ // Set up how many argv pointers will fit in cache lines containing // t_inline_argv. Historically, we have supported at least 96 bytes. Using a -// larger value for more space between the master write/worker read section and +// larger value for more space between the primary write/worker read section and // read/write by all section seems to buy more performance on EPCC PARALLEL. #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #define KMP_INLINE_ARGV_BYTES \ @@ -2745,11 +2747,11 @@ std::atomic t_tg_reduce_data[2]; // to support task modifier std::atomic t_tg_fini_counter[2]; // sync end of task reductions - // Master only + // Primary thread only // --------------------------------------------------------------------------- - KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team - int t_master_this_cons; // "this_construct" single counter of master in parent - // team + KMP_ALIGN_CACHE int t_master_tid; // tid of primary thread in parent team + int t_master_this_cons; // "this_construct" single counter of primary thread + // in parent team ident_t *t_ident; // if volatile, have to change too much other crud to // volatile too kmp_team_p *t_parent; // parent team @@ -2761,7 +2763,7 @@ kmp_uint64 t_region_time; // region begin timestamp #endif /* USE_ITT_BUILD */ - // Master write, workers read + // Primary thread write, workers read // -------------------------------------------------------------------------- KMP_ALIGN_CACHE void **t_argv; int t_argc; @@ -2797,7 +2799,7 @@ kmp_r_sched_t t_sched; // run-time schedule for the team #if KMP_AFFINITY_SUPPORTED int t_first_place; // first & last place in parent thread's partition. - int t_last_place; // Restore these values to master after par region. + int t_last_place; // Restore these values to primary thread after par region. 
#endif // KMP_AFFINITY_SUPPORTED int t_display_affinity; int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via @@ -3735,6 +3737,9 @@ KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid); KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid); +KMP_EXPORT kmp_int32 __kmpc_masked(ident_t *, kmp_int32 global_tid, + kmp_int32 filter); +KMP_EXPORT void __kmpc_end_masked(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid, Index: openmp/runtime/src/kmp_barrier.cpp =================================================================== --- openmp/runtime/src/kmp_barrier.cpp +++ openmp/runtime/src/kmp_barrier.cpp @@ -73,9 +73,9 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); - // Mark arrival to master thread + // Mark arrival to primary thread /* After performing this write, a worker thread may not assume that the team - is valid any more - it could be deallocated by the master thread at any + is valid any more - it could be deallocated by the primary thread at any time. */ ANNOTATE_BARRIER_BEGIN(this_thr); kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); @@ -166,7 +166,7 @@ KMP_DEBUG_ASSERT(team != NULL); other_threads = team->t.t_threads; - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for " + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for " "barrier type %d\n", gtid, team->t.t_id, tid, bt)); @@ -208,7 +208,7 @@ flag.release(); } } - } else { // Wait for the MASTER thread to release us + } else { // Wait for the PRIMARY thread to release us KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); if (cancellable) { @@ -379,13 +379,13 @@ // Mark arrival to parent thread /* After performing this write, a worker thread may not assume that the team - is valid any more - it could be deallocated by the master thread at any + is valid any more - it could be deallocated by the primary thread at any time. */ ANNOTATE_BARRIER_BEGIN(this_thr); kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); flag.release(); } else { - // Need to update the team arrived pointer if we are the master thread + // Need to update the team arrived pointer if we are the primary thread if (nproc > 1) // New value was already computed above team->t.t_bar[bt].b_arrived = new_state; else @@ -455,7 +455,7 @@ } else { team = __kmp_threads[gtid]->th.th_team; KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for " + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for " "barrier type %d\n", gtid, team->t.t_id, tid, bt)); } @@ -558,7 +558,7 @@ // Mark arrival to parent thread /* After performing this write (in the last iteration of the enclosing for loop), a worker thread may not assume that the team is valid any more - - it could be deallocated by the master thread at any time. */ + - it could be deallocated by the primary thread at any time. 
*/ ANNOTATE_BARRIER_BEGIN(this_thr); p_flag.set_waiter(other_threads[parent_tid]); p_flag.release(); @@ -616,7 +616,7 @@ } if (KMP_MASTER_TID(tid)) { - // Need to update the team arrived pointer if we are the master thread + // Need to update the team arrived pointer if we are the primary thread if (new_state == KMP_BARRIER_UNUSED_STATE) team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; else @@ -652,14 +652,14 @@ been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse order of the corresponding gather, otherwise threads are released in the same order. */ - if (KMP_MASTER_TID(tid)) { // master + if (KMP_MASTER_TID(tid)) { // primary thread team = __kmp_threads[gtid]->th.th_team; KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for " + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " "barrier type %d\n", gtid, team->t.t_id, tid, bt)); #if KMP_BARRIER_ICV_PUSH - if (propagate_icvs) { // master already has ICVs in final destination; copy + if (propagate_icvs) { // primary already has ICVs in final destination; copy copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); } @@ -816,15 +816,15 @@ } if (uninitialized || team_sz_changed || tid_changed) { - thr_bar->my_level = thr_bar->depth - 1; // default for master - thr_bar->parent_tid = -1; // default for master - if (!KMP_MASTER_TID( - tid)) { // if not master, find parent thread in hierarchy + thr_bar->my_level = thr_bar->depth - 1; // default for primary thread + thr_bar->parent_tid = -1; // default for primary thread + if (!KMP_MASTER_TID(tid)) { + // if not primary thread, find parent thread in hierarchy kmp_uint32 d = 0; while (d < thr_bar->depth) { // find parent based on level of thread in // hierarchy, and note level kmp_uint32 rem; - if (d == thr_bar->depth - 2) { // reached level right below the master + if (d == thr_bar->depth - 2) { // reached level right below the primary thr_bar->parent_tid = 0; thr_bar->my_level = d; break; @@ -1009,7 +1009,7 @@ } } } - // All subordinates are gathered; now release parent if not master thread + // All subordinates are gathered; now release parent if not primary thread if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" @@ -1020,7 +1020,7 @@ thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); /* Mark arrival to parent: After performing this write, a worker thread may not assume that the team is valid any more - it could be deallocated by - the master thread at any time. */ + the primary thread at any time. 
*/ if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived // flag; release it @@ -1036,7 +1036,7 @@ flag.set_waiter(other_threads[thr_bar->parent_tid]); flag.release(); } - } else { // Master thread needs to update the team's b_arrived value + } else { // Primary thread needs to update the team's b_arrived value team->t.t_bar[bt].b_arrived = new_state; KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " "arrived(%p) = %llu\n", @@ -1061,7 +1061,7 @@ if (KMP_MASTER_TID(tid)) { team = __kmp_threads[gtid]->th.th_team; KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master " + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " "entered barrier type %d\n", gtid, team->t.t_id, tid, bt)); } else { // Worker threads @@ -1139,7 +1139,7 @@ __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); if (KMP_MASTER_TID( - tid)) { // master already has copy in final destination; copy + tid)) { // primary already has copy in final destination; copy copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && @@ -1289,7 +1289,7 @@ If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier When cancellable = false, - Returns 0 if master thread, 1 if worker thread. + Returns 0 if primary thread, 1 if worker thread. When cancellable = true Returns 0 if not cancelled, 1 if cancelled. */ template @@ -1376,7 +1376,7 @@ #endif /* USE_ITT_BUILD */ #if USE_DEBUGGER // Let the debugger know: the thread arrived to the barrier and waiting. - if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure. + if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct team->t.t_bar[bt].b_master_arrived += 1; } else { this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; @@ -1444,7 +1444,7 @@ } } #if USE_ITT_BUILD - /* TODO: In case of split reduction barrier, master thread may send + /* TODO: In case of split reduction barrier, primary thread may send acquired event early, before the final summation into the shared variable is done (final summation can be a long operation for array reductions). */ @@ -1476,7 +1476,7 @@ break; case 3: if (__itt_metadata_add_ptr) { - // Initialize with master's wait time + // Initialize with primary thread's wait time kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; // Set arrive time to zero to be able to check it in // __kmp_invoke_task(); the same is done inside the loop below @@ -1596,7 +1596,7 @@ return status; } -// Returns 0 if master thread, 1 if worker thread. +// Returns 0 if primary thread, 1 if worker thread. int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *)) { @@ -1614,7 +1614,7 @@ int tid = __kmp_tid_from_gtid(gtid); kmp_info_t *this_thr = __kmp_threads[gtid]; if (KMP_MASTER_TID(tid)) { - // Master does not need to revert anything + // Primary thread does not need to revert anything } else { // Workers need to revert their private b_arrived flag this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= @@ -1809,7 +1809,7 @@ } /* From this point on, the team data structure may be deallocated at any time - by the master thread - it is unsafe to reference it in any of the worker + by the primary thread - it is unsafe to reference it in any of the worker threads. 
Any per-team data items that need to be referenced before the end of the barrier should be moved to the kmp_task_team_t structs. */ if (KMP_MASTER_TID(tid)) { @@ -1820,7 +1820,7 @@ KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); } #if KMP_STATS_ENABLED - // Have master thread flag the workers to indicate they are now waiting for + // Have primary thread flag the workers to indicate they are now waiting for // next parallel region, Also wake them up so they switch their timers to // idle. for (int i = 0; i < team->t.t_nproc; ++i) { @@ -1862,7 +1862,7 @@ break; case 3: if (__itt_metadata_add_ptr) { - // Initialize with master's wait time + // Initialize with primary thread's wait time kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; // Set arrive time to zero to be able to check it in // __kmp_invoke_task(); the same is done inside the loop below @@ -1922,7 +1922,7 @@ KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, (team != NULL) ? team->t.t_id : -1, tid)); - // th_team pointer only valid for master thread here + // th_team pointer only valid for primary thread here if (KMP_MASTER_TID(tid)) { #if USE_ITT_BUILD && USE_ITT_NOTIFY if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { @@ -1958,8 +1958,8 @@ __kmp_task_team_setup(this_thr, team, 0); } - /* The master thread may have changed its blocktime between the join barrier - and the fork barrier. Copy the blocktime info to the thread, where + /* The primary thread may have changed its blocktime between join barrier + and fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the team struct is not guaranteed to exist. */ // See note about the corresponding code in __kmp_join_barrier() being @@ -1974,7 +1974,7 @@ this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); #endif } - } // master + } // primary thread switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { case bp_hyper_bar: { @@ -2050,25 +2050,25 @@ } /* We can now assume that a valid team structure has been allocated by the - master and propagated to all worker threads. The current thread, however, - may not be part of the team, so we can't blindly assume that the team - pointer is non-null. */ + primary thread and propagated to all worker threads. The current thread, + however, may not be part of the team, so we can't blindly assume that the + team pointer is non-null. */ team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); KMP_DEBUG_ASSERT(team != NULL); tid = __kmp_tid_from_gtid(gtid); #if KMP_BARRIER_ICV_PULL - /* Master thread's copy of the ICVs was set up on the implicit taskdata in - __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's + /* Primary thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's implicit task has this data before this function is called. We cannot - modify __kmp_fork_call() to look at the fixed ICVs in the master's thread - struct, because it is not always the case that the threads arrays have - been allocated when __kmp_fork_call() is executed. */ + modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's + thread struct, because it is not always the case that the threads arrays + have been allocated when __kmp_fork_call() is executed. 
*/ { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); - if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs - // Copy the initial ICVs from the master's thread struct to the implicit - // task for this tid. + if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs + // Copy the initial ICVs from the primary thread's thread struct to the + // implicit task for this tid. KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, @@ -2139,13 +2139,13 @@ KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); -/* Master thread's copy of the ICVs was set up on the implicit taskdata in - __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's +/* Primary thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's implicit task has this data before this function is called. */ #if KMP_BARRIER_ICV_PULL - /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains - untouched), where all of the worker threads can access them and make their - own copies after the barrier. */ + /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which + remains untouched), where all of the worker threads can access them and + make their own copies after the barrier. */ KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be // allocated at this point copy_icvs( @@ -2159,12 +2159,12 @@ KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); #else - // Copy the ICVs to each of the non-master threads. This takes O(nthreads) + // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) // time. ngo_load(new_icvs); KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be // allocated at this point - for (int f = 1; f < new_nproc; ++f) { // Skip the master thread + for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread // TODO: GEH - pass in better source location info since usually NULL here KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", f, team->t.t_threads[f], team)); Index: openmp/runtime/src/kmp_csupport.cpp =================================================================== --- openmp/runtime/src/kmp_csupport.cpp +++ openmp/runtime/src/kmp_csupport.cpp @@ -88,7 +88,7 @@ single (necessarily non-OpenMP*) thread, then the thread number is that which would be returned by omp_get_thread_num() in the outermost active parallel construct. (Or zero if there is no active parallel -construct, since the master thread is necessarily thread zero). +construct, since the primary thread is necessarily thread zero). If multiple non-OpenMP threads all enter an OpenMP construct then this will be a unique thread identifier among all the threads created by @@ -850,6 +850,92 @@ /*! @ingroup WORK_SHARING @param loc source location information. +@param global_tid global thread number. +@param filter result of evaluating filter clause on thread global_tid, or zero +if no filter clause present +@return 1 if this thread should execute the masked block, 0 otherwise. 
+*/ +kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) { + int status = 0; + int tid; + KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + + tid = __kmp_tid_from_gtid(global_tid); + if (tid == filter) { + KMP_COUNT_BLOCK(OMP_MASKED); + KMP_PUSH_PARTITIONED_TIMER(OMP_masked); + status = 1; + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (status) { + if (ompt_enabled.ompt_callback_masked) { + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + ompt_callbacks.ompt_callback(ompt_callback_masked)( + ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + OMPT_GET_RETURN_ADDRESS(0)); + } + } +#endif + + if (__kmp_env_consistency_check) { +#if KMP_USE_DYNAMIC_LOCK + if (status) + __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0); + else + __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0); +#else + if (status) + __kmp_push_sync(global_tid, ct_masked, loc, NULL); + else + __kmp_check_sync(global_tid, ct_masked, loc, NULL); +#endif + } + + return status; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param global_tid global thread number . + +Mark the end of a masked region. This should only be called by the +thread that executes the masked region. +*/ +void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) { + KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + KMP_POP_PARTITIONED_TIMER(); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + if (ompt_enabled.ompt_callback_masked) { + int tid = __kmp_tid_from_gtid(global_tid); + ompt_callbacks.ompt_callback(ompt_callback_masked)( + ompt_scope_end, &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + + if (__kmp_env_consistency_check) { + __kmp_pop_sync(global_tid, ct_masked, loc); + } +} + +/*! +@ingroup WORK_SHARING +@param loc source location information. @param gtid global thread number. Start execution of an ordered construct. @@ -3373,7 +3459,7 @@ @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data @param lck pointer to the unique lock data structure -@result 1 for the master thread, 0 for all other team threads, 2 for all team +@result 1 for the primary thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed The nowait version is used for a reduce clause with the nowait argument. 
@@ -3469,11 +3555,11 @@ tree_reduce_block)) { // AT: performance issue: a real barrier here -// AT: (if master goes slow, other threads are blocked here waiting for the -// master to come and release them) -// AT: (it's not what a customer might expect specifying NOWAIT clause) -// AT: (specifying NOWAIT won't result in improvement of performance, it'll -// be confusing to a customer) +// AT: (if primary thread is slow, other threads are blocked here waiting for +// the primary thread to come and release them) +// AT: (it's not what a customer might expect specifying NOWAIT clause) +// AT: (specifying NOWAIT won't result in improvement of performance, it'll +// be confusing to a customer) // AT: another implementation of *barrier_gather*nowait() (or some other design) // might go faster and be more in line with sense of NOWAIT // AT: TO DO: do epcc test and compare times @@ -3507,7 +3593,7 @@ } #endif - // all other workers except master should do this pop here + // all other workers except primary thread should do this pop here // ( none of other workers will get to __kmpc_end_reduce_nowait() ) if (__kmp_env_consistency_check) { if (retval == 0) { @@ -3565,7 +3651,7 @@ } else if (packed_reduction_method == atomic_reduce_block) { - // neither master nor other workers should get here + // neither primary thread nor other workers should get here // (code gen does not generate this call in case 2: atomic reduce block) // actually it's better to remove this elseif at all; // after removal this value will checked by the 'else' and will assert @@ -3573,7 +3659,7 @@ } else if (TEST_REDUCTION_METHOD(packed_reduction_method, tree_reduce_block)) { - // only master gets here + // only primary thread gets here // OMPT: tree reduction is annotated in the barrier code } else { @@ -3603,7 +3689,7 @@ @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data @param lck pointer to the unique lock data structure -@result 1 for the master thread, 0 for all other team threads, 2 for all team +@result 1 for the primary thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed A blocking reduce that includes an implicit barrier. 
@@ -3697,10 +3783,10 @@ } #endif - // all other workers except master should do this pop here - // ( none of other workers except master will enter __kmpc_end_reduce() ) + // all other workers except primary thread should do this pop here + // (none of other workers except primary will enter __kmpc_end_reduce()) if (__kmp_env_consistency_check) { - if (retval == 0) { // 0: all other workers; 1: master + if (retval == 0) { // 0: all other workers; 1: primary thread __kmp_pop_sync(global_tid, ct_reduce, loc); } } @@ -3826,7 +3912,7 @@ } else if (TEST_REDUCTION_METHOD(packed_reduction_method, tree_reduce_block)) { - // only master executes here (master releases all other workers) + // only primary thread executes here (primary releases all other workers) __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), global_tid); Index: openmp/runtime/src/kmp_dispatch.cpp =================================================================== --- openmp/runtime/src/kmp_dispatch.cpp +++ openmp/runtime/src/kmp_dispatch.cpp @@ -917,7 +917,7 @@ } // Report loop metadata if (itt_need_metadata_reporting) { - // Only report metadata by master of active team at level 1 + // Only report metadata by primary thread of active team at level 1 kmp_uint64 schedtype = 0; switch (schedule) { case kmp_sch_static_chunked: Index: openmp/runtime/src/kmp_dispatch_hier.h =================================================================== --- openmp/runtime/src/kmp_dispatch_hier.h +++ openmp/runtime/src/kmp_dispatch_hier.h @@ -496,7 +496,7 @@ T hier_id = (T)current->get_hier_id(); // Attempt to grab next iteration range for this level if (previous_id == 0) { - KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n", + KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n", gtid, hier_level)); kmp_int32 contains_last; T my_lb, my_ub; @@ -590,7 +590,7 @@ } if (p_last) *p_last = contains_last; - } // if master thread of this unit + } // if primary thread of this unit if (hier_level > 0 || !__kmp_dispatch_hand_threading) { KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n", @@ -740,7 +740,7 @@ gtid)); if (unit_id == 0) { // For hand threading, the sh buffer on the lowest level is only ever - // modified and read by the master thread on that level. Because of + // modified and read by the primary thread on that level. Because of // this, we can always use the first sh buffer. 
auto sh = &(parent->hier_barrier.sh[0]); KMP_DEBUG_ASSERT(sh); @@ -784,7 +784,7 @@ } } parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index); - } // if master thread of lowest unit level + } // if primary thread of lowest unit level parent->barrier(pr->get_hier_id(), tdata); if (unit_id != 0) { *p_lb = parent->get_curr_lb(tdata->index); @@ -975,7 +975,7 @@ KMP_DEBUG_ASSERT(sh); pr->flags.use_hier = TRUE; pr->u.p.tc = 0; - // Have master allocate the hierarchy + // Have primary thread allocate the hierarchy if (__kmp_tid_from_gtid(gtid) == 0) { KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating " "hierarchy\n", @@ -1071,7 +1071,7 @@ break; int index = __kmp_dispatch_get_index(tid, hier->get_type(i)); kmp_hier_top_unit_t *my_unit = hier->get_unit(i, index); - // Only master threads of this unit within the hierarchy do initialization + // Only primary threads of this unit within the hierarchy do initialization KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n", gtid, i)); my_unit->reset_shared_barrier(); Index: openmp/runtime/src/kmp_error.cpp =================================================================== --- openmp/runtime/src/kmp_error.cpp +++ openmp/runtime/src/kmp_error.cpp @@ -30,7 +30,7 @@ "sections" pragmas */ "\"critical\"", "\"ordered\"", /* in PARALLEL */ "\"ordered\"", /* in PDO */ - "\"master\"", "\"reduce\"", "\"barrier\""}; + "\"master\"", "\"reduce\"", "\"barrier\"", "\"masked\""}; #define get_src(ident) ((ident) == NULL ? NULL : (ident)->psource) @@ -311,7 +311,7 @@ /* we are in CRITICAL which is inside a CRITICAL construct of same name */ __kmp_error_construct2(kmp_i18n_msg_CnsNestingSameName, ct, ident, &cons); } - } else if (ct == ct_master || ct == ct_reduce) { + } else if (ct == ct_master || ct == ct_masked || ct == ct_reduce) { if (p->w_top > p->p_top) { /* inside a WORKSHARING construct for this PARALLEL region */ __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, Index: openmp/runtime/src/kmp_global.cpp =================================================================== --- openmp/runtime/src/kmp_global.cpp +++ openmp/runtime/src/kmp_global.cpp @@ -424,7 +424,7 @@ /* ------------------------------------------------------ */ /* STATE mostly syncronized with global lock */ -/* data written to rarely by masters, read often by workers */ +/* data written to rarely by primary threads, read often by workers */ /* TODO: None of this global padding stuff works consistently because the order of declaration is not necessarily correlated to storage order. To fix this, all the important globals must be put in a big structure instead. 
*/ @@ -432,7 +432,7 @@ kmp_info_t **__kmp_threads = NULL; kmp_root_t **__kmp_root = NULL; -/* data read/written to often by masters */ +/* data read/written to often by primary threads */ KMP_ALIGN_CACHE volatile int __kmp_nth = 0; volatile int __kmp_all_nth = 0; Index: openmp/runtime/src/kmp_gsupport.cpp =================================================================== --- openmp/runtime/src/kmp_gsupport.cpp +++ openmp/runtime/src/kmp_gsupport.cpp @@ -1875,7 +1875,7 @@ va_end(args); } -// fn: the function each master thread of new team will call +// fn: the function each primary thread of new team will call // data: argument to fn // num_teams, thread_limit: max bounds on respective ICV // flags: unused Index: openmp/runtime/src/kmp_itt.h =================================================================== --- openmp/runtime/src/kmp_itt.h +++ openmp/runtime/src/kmp_itt.h @@ -53,9 +53,9 @@ // --- Parallel region reporting --- __kmp_inline void __kmp_itt_region_forking(int gtid, int team_size, - int barriers); // Master only, before forking threads. + int barriers); // Primary only, before forking threads. __kmp_inline void -__kmp_itt_region_joined(int gtid); // Master only, after joining threads. +__kmp_itt_region_joined(int gtid); // Primary only, after joining threads. // (*) Note: A thread may execute tasks after this point, though. // --- Frame reporting --- @@ -191,7 +191,7 @@ #define SSC_MARK_SPIN_END() INSERT_SSC_MARK(0x4377) // Markers for architecture simulation. -// FORKING : Before the master thread forks. +// FORKING : Before the primary thread forks. // JOINING : At the start of the join. // INVOKING : Before the threads invoke microtasks. // DISPATCH_INIT: At the start of dynamically scheduled loop. Index: openmp/runtime/src/kmp_itt.inl =================================================================== --- openmp/runtime/src/kmp_itt.inl +++ openmp/runtime/src/kmp_itt.inl @@ -64,14 +64,14 @@ KMP_BOOTSTRAP_LOCK_INITIALIZER(metadata_lock); /* Parallel region reporting. - * __kmp_itt_region_forking should be called by master thread of a team. + * __kmp_itt_region_forking should be called by primary thread of a team. Exact moment of call does not matter, but it should be completed before any thread of this team calls __kmp_itt_region_starting. * __kmp_itt_region_starting should be called by each thread of a team just before entering parallel region body. * __kmp_itt_region_finished should be called by each thread of a team right after returning from parallel region body. - * __kmp_itt_region_joined should be called by master thread of a team, after + * __kmp_itt_region_joined should be called by primary thread of a team, after all threads called __kmp_itt_region_finished. Note: Thread waiting at join barrier (after __kmp_itt_region_finished) can @@ -448,10 +448,10 @@ /* Barriers reporting. A barrier consists of two phases: - 1. Gather -- master waits for arriving of all the worker threads; each + 1. Gather -- primary thread waits for all worker threads to arrive; each worker thread registers arrival and goes further. - 2. Release -- each worker threads waits until master lets it go; master lets - worker threads go. + 2. Release -- each worker thread waits until primary thread lets it go; + primary thread lets worker threads go. Function should be called by each thread: * __kmp_itt_barrier_starting() -- before arriving to the gather phase. @@ -487,7 +487,7 @@ // solution, and reporting fork/join barriers to ITT should be revisited. 
if (team != NULL) { - // Master thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time. + // Primary thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time. // Divide b_arrived by KMP_BARRIER_STATE_BUMP to get plain barrier counter. kmp_uint64 counter = team->t.t_bar[bt].b_arrived / KMP_BARRIER_STATE_BUMP + delta; @@ -550,12 +550,13 @@ case bs_forkjoin_barrier: { // In case of fork/join barrier we can read thr->th.th_ident, because it // contains location of last passed construct (while join barrier is not - // such one). Use th_ident of master thread instead -- __kmp_join_call() - // called by the master thread saves location. + // such one). Use th_ident of primary thread instead -- + // __kmp_join_call() called by the primary thread saves location. // - // AC: cannot read from master because __kmp_join_call may be not called - // yet, so we read the location from team. This is the same location. - // And team is valid at the enter to join barrier where this happens. + // AC: cannot read from primary thread because __kmp_join_call may not + // be called yet, so we read the location from team. This is the + // same location. Team is valid on entry to join barrier where this + // happens. loc = team->t.t_ident; if (loc != NULL) { src = loc->psource; @@ -958,7 +959,7 @@ kmp_str_buf_t name; __kmp_str_buf_init(&name); if (KMP_MASTER_GTID(gtid)) { - __kmp_str_buf_print(&name, "OMP Master Thread #%d", gtid); + __kmp_str_buf_print(&name, "OMP Primary Thread #%d", gtid); } else { __kmp_str_buf_print(&name, "OMP Worker Thread #%d", gtid); } @@ -986,9 +987,9 @@ } // __kmp_itt_system_object_created /* Stack stitching api. - Master calls "create" and put the stitching id into team structure. + Primary thread calls "create" and put the stitching id into team structure. Workers read the stitching id and call "enter" / "leave" api. - Master calls "destroy" at the end of the parallel region. */ + Primary thread calls "destroy" at the end of the parallel region. 
*/ __itt_caller __kmp_itt_stack_caller_create() { #if USE_ITT_NOTIFY Index: openmp/runtime/src/kmp_omp.h =================================================================== --- openmp/runtime/src/kmp_omp.h +++ openmp/runtime/src/kmp_omp.h @@ -123,7 +123,7 @@ /* team structure information */ kmp_int32 t_sizeof_struct; - offset_and_size_t t_master_tid; // tid of master in parent team + offset_and_size_t t_master_tid; // tid of primary thread in parent team offset_and_size_t t_ident; // location of parallel region offset_and_size_t t_parent; // parent team offset_and_size_t t_nproc; // # team threads @@ -136,7 +136,7 @@ offset_and_size_t t_cancel_request; offset_and_size_t t_bar; offset_and_size_t - t_b_master_arrived; // increased by 1 when master arrives to a barrier + t_b_master_arrived; // incremented when primary thread reaches barrier offset_and_size_t t_b_team_arrived; // increased by one when all the threads arrived Index: openmp/runtime/src/kmp_runtime.cpp =================================================================== --- openmp/runtime/src/kmp_runtime.cpp +++ openmp/runtime/src/kmp_runtime.cpp @@ -750,8 +750,8 @@ #if USE_ITT_BUILD if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && - team->t.t_active_level == - 1) { // Only report metadata by master of active team at level 1 + team->t.t_active_level == 1) { + // Only report metadata by primary thread of active team at level 1 __kmp_itt_metadata_single(id_ref); } #endif /* USE_ITT_BUILD */ @@ -977,7 +977,7 @@ KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); KMP_MB(); - /* first, let's setup the master thread */ + /* first, let's setup the primary thread */ master_th->th.th_info.ds.ds_tid = 0; master_th->th.th_team = team; master_th->th.th_team_nproc = team->t.t_nproc; @@ -1022,7 +1022,7 @@ #endif if (!use_hot_team) { - /* install the master thread */ + /* install the primary thread */ team->t.t_threads[0] = master_th; __kmp_initialize_info(master_th, team, 0, master_gtid); @@ -1085,7 +1085,7 @@ kmp_int16 x87_fpu_control_word; kmp_uint32 mxcsr; - // Get master values of FPU control flags (both X87 and vector) + // Get primary thread's values of FPU control flags (both X87 and vector) __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); __kmp_store_mxcsr(&mxcsr); mxcsr &= KMP_X86_MXCSR_MASK; @@ -1142,7 +1142,7 @@ int realloc); // forward declaration /* Run a parallel region that has been serialized, so runs only in a team of the - single master thread. */ + single primary thread. 
*/ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { kmp_info_t *this_thr; kmp_team_t *serial_team; @@ -1662,12 +1662,12 @@ if (call_context == fork_context_gnu) return TRUE; - /* Invoke microtask for MASTER thread */ + /* Invoke microtask for PRIMARY thread */ KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn)); if (!parent_team->t.t_invoke(gtid)) { - KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); + KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); } KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn)); @@ -2067,7 +2067,7 @@ KMP_CHECK_UPDATE(team->t.t_active_level, new_level); } kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); - // set master's schedule as new run-time schedule + // set primary thread's schedule as new run-time schedule KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); @@ -2077,18 +2077,18 @@ propagateFPControl(team); if (__kmp_tasking_mode != tskm_immediate_exec) { - // Set master's task team to team's task team. Unless this is hot team, it - // should be NULL. + // Set primary thread's task team to team's task team. Unless this is hot + // team, it should be NULL. KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]); - KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " + KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " "%p, new task_team %p / team %p\n", __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, parent_team, team->t.t_task_team[master_th->th.th_task_state], team)); if (active_level || master_th->th.th_task_team) { - // Take a memo of master's task_state + // Take a memo of primary thread's task_state KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size @@ -2108,7 +2108,7 @@ master_th->th.th_task_state_stack_sz = new_size; __kmp_free(old_stack); } - // Store master's task_state on stack + // Store primary thread's task_state on stack master_th->th .th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state; @@ -2117,7 +2117,7 @@ if (master_th->th.th_hot_teams && active_level < __kmp_hot_teams_max_level && team == master_th->th.th_hot_teams[active_level].hot_team) { - // Restore master's nested state if nested hot team + // Restore primary thread's nested state if nested hot team master_th->th.th_task_state = master_th->th .th_task_state_memo_stack[master_th->th.th_task_state_top]; @@ -2215,7 +2215,7 @@ } #endif /* USE_ITT_BUILD */ - // AC: skip __kmp_internal_fork at teams construct, let only master + // AC: skip __kmp_internal_fork at teams construct, let only primary // threads execute if (ap) { __kmp_internal_fork(loc, gtid, team); @@ -2229,7 +2229,7 @@ return TRUE; } - /* Invoke microtask for MASTER thread */ + /* Invoke microtask for PRIMARY thread */ KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, team->t.t_id, team->t.t_pkfn)); } // END of timer KMP_fork_call block @@ -2243,7 +2243,7 @@ #endif if (!team->t.t_invoke(gtid)) { - KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); + KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); } #if KMP_STATS_ENABLED @@ -2435,7 +2435,7 @@ // Restore number of threads in the team if 
needed. This code relies on // the proper adjustment of th_teams_size.nth after the fork in - // __kmp_teams_master on each teams master in the case that + // __kmp_teams_master on each teams primary thread in the case that // __kmp_reserve_threads reduced it. if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { int old_num = master_th->th.th_team_nproc; @@ -2551,7 +2551,7 @@ if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); - // Remember master's state if we re-use this nested hot team + // Remember primary thread's state if we re-use this nested hot team master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state; --master_th->th.th_task_state_top; // pop @@ -2560,11 +2560,11 @@ master_th->th .th_task_state_memo_stack[master_th->th.th_task_state_top]; } - // Copy the task team from the parent team to the master thread + // Copy the task team from the parent team to the primary thread master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state]; KA_TRACE(20, - ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", + ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, parent_team)); } @@ -3199,7 +3199,7 @@ #endif __kmp_nested_proc_bind.bind_types[0], &r_icvs, 0 // argc - USE_NESTED_HOT_ARG(NULL) // master thread is unknown + USE_NESTED_HOT_ARG(NULL) // primary thread is unknown ); #if USE_DEBUGGER // Non-NULL value should be assigned to make the debugger display the root @@ -3236,7 +3236,7 @@ #endif __kmp_nested_proc_bind.bind_types[0], &r_icvs, 0 // argc - USE_NESTED_HOT_ARG(NULL) // master thread is unknown + USE_NESTED_HOT_ARG(NULL) // primary thread is unknown ); KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); @@ -3371,7 +3371,7 @@ __kmp_print_structure_team(" Serial Team: ", thread->th.th_serial_team); __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); - __kmp_print_structure_thread(" Master: ", + __kmp_print_structure_thread(" Primary: ", thread->th.th_team_master); __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); @@ -3419,7 +3419,7 @@ int i; __kmp_printf("Team %2x %p:\n", team->t.t_id, team); __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); - __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); + __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); @@ -3788,7 +3788,7 @@ __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); TCW_4(__kmp_init_gtid, TRUE); - /* prepare the master thread for get_gtid() */ + /* prepare the primary thread for get_gtid() */ __kmp_gtid_set_specific(gtid); #if USE_ITT_BUILD @@ -3883,7 +3883,7 @@ KMP_DEBUG_ASSERT(level < max_level); kmp_team_t *team = hot_teams[level].hot_team; nth = hot_teams[level].hot_team_nth; - n = nth - 1; // master is not freed + n = nth - 1; // primary thread is not freed if (level < max_level - 1) { for (i = 0; i < nth; ++i) { kmp_info_t *th = team->t.t_threads[i]; @@ -4140,9 +4140,9 @@ this_thr->th.th_pri_head = NULL; } - if (this_thr != master && // Master's CG root is initialized elsewhere + if (this_thr != master && // Primary thread's CG root is 
initialized elsewhere this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set - // Make new thread's CG root same as master's + // Make new thread's CG root same as primary thread's KMP_DEBUG_ASSERT(master->th.th_cg_roots); kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; if (tmp) { @@ -4322,11 +4322,11 @@ // The reason is that if the library is loaded/unloaded in a loop with // small (parallel) work in between, then there is high probability that // monitor thread started after the library shutdown. At shutdown it is - // too late to cope with the problem, because when the master is in - // DllMain (process detach) the monitor has no chances to start (it is - // blocked), and master has no means to inform the monitor that the - // library has gone, because all the memory which the monitor can access - // is going to be released/reset. + // too late to cope with the problem, because when the primary thread is + // in DllMain (process detach) the monitor has no chances to start (it is + // blocked), and primary thread has no means to inform the monitor that + // the library has gone, because all the memory which the monitor can + // access is going to be released/reset. while (TCR_4(__kmp_init_monitor) < 2) { KMP_YIELD(TRUE); } @@ -4396,7 +4396,7 @@ __kmp_print_thread_storage_map(new_thr, new_gtid); } - // add the reserve serialized team, initialized from the team's master thread + // add the reserve serialized team, initialized from the team's primary thread { kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); @@ -4520,7 +4520,7 @@ KMP_CHECK_UPDATE(team->t.t_ident, loc); KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); - // Copy ICVs to the master thread's implicit taskdata + // Copy ICVs to the primary thread's implicit taskdata __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); @@ -4606,11 +4606,11 @@ #if KMP_AFFINITY_SUPPORTED // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. -// It calculates the worker + master thread's partition based upon the parent +// It calculates the worker + primary thread's partition based upon the parent // thread's partition, and binds each worker to a thread in their partition. -// The master thread's partition should already include its current binding. +// The primary thread's partition should already include its current binding. static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { - // Copy the master thread's place partition to the team struct + // Copy the primary thread's place partition to the team struct kmp_info_t *master_th = team->t.t_threads[0]; KMP_DEBUG_ASSERT(master_th != NULL); kmp_proc_bind_t proc_bind = team->t.t_proc_bind; @@ -4628,12 +4628,12 @@ switch (proc_bind) { case proc_bind_default: - // serial teams might have the proc_bind policy set to proc_bind_default. It - // doesn't matter, as we don't rebind master thread for any proc_bind policy + // Serial teams might have the proc_bind policy set to proc_bind_default. + // Not an issue -- we don't rebind primary thread for any proc_bind policy. 
KMP_DEBUG_ASSERT(team->t.t_nproc == 1); break; - case proc_bind_master: { + case proc_bind_primary: { int f; int n_th = team->t.t_nproc; for (f = 1; f < n_th; f++) { @@ -4647,7 +4647,7 @@ team->t.t_display_affinity = 1; } - KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " + KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " "partition = [%d,%d]\n", __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, masters_place, first_place, last_place)); @@ -5035,7 +5035,7 @@ // TODO???: team->t.t_max_active_levels = new_max_active_levels; kmp_r_sched_t new_sched = new_icvs->sched; - // set master's schedule as new run-time schedule + // set primary thread's schedule as new run-time schedule KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); __kmp_reinitialize_team(team, new_icvs, @@ -5115,7 +5115,7 @@ team->t.t_threads[f]->th.th_team_nproc = new_nproc; } - // restore the current task state of the master thread: should be the + // restore the current task state of the primary thread: should be the // implicit task KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); @@ -5185,10 +5185,11 @@ } #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED - /* Temporarily set full mask for master thread before creation of - workers. The reason is that workers inherit the affinity from master, - so if a lot of workers are created on the single core quickly, they - don't get a chance to set their own affinity for a long time. */ + /* Temporarily set full mask for primary thread before creation of + workers. The reason is that workers inherit the affinity from the + primary thread, so if a lot of workers are created on the single + core quickly, they don't get a chance to set their own affinity for + a long time. */ __kmp_set_thread_affinity_mask_full_tmp(old_mask); #endif @@ -5221,7 +5222,7 @@ #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED if (KMP_AFFINITY_CAPABLE()) { - /* Restore initial master thread's affinity mask */ + /* Restore initial primary thread's affinity mask */ __kmp_set_system_affinity(old_mask, TRUE); KMP_CPU_FREE(old_mask); } @@ -5244,15 +5245,15 @@ if (level) { // set th_task_state for new threads in nested hot team // __kmp_initialize_info() no longer zeroes th_task_state, so we should // only need to set the th_task_state for the new threads. th_task_state - // for master thread will not be accurate until after this in - // __kmp_fork_call(), so we look to the master's memo_stack to get the - // correct value. + // for primary thread will not be accurate until after this in + // __kmp_fork_call(), so we look to the primary thread's memo_stack to + // get the correct value. 
for (f = old_nproc; f < team->t.t_nproc; ++f) team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level]; } else { // set th_task_state for new threads in non-nested hot team - kmp_uint8 old_state = - team->t.t_threads[0]->th.th_task_state; // copy master's state + // copy primary thread's state + kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; for (f = old_nproc; f < team->t.t_nproc; ++f) team->t.t_threads[f]->th.th_task_state = old_state; } @@ -5553,7 +5554,7 @@ /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); __kmp_team_pool = (volatile kmp_team_t *)team; - } else { // Check if team was created for the masters in a teams construct + } else { // Check if team was created for primary threads in teams construct // See if first worker is a CG root KMP_DEBUG_ASSERT(team->t.t_threads[1] && team->t.t_threads[1]->th.th_cg_roots); @@ -7337,7 +7338,7 @@ } void __kmp_teams_master(int gtid) { - // This routine is called by all master threads in teams construct + // This routine is called by all primary threads in teams construct kmp_info_t *thr = __kmp_threads[gtid]; kmp_team_t *team = thr->th.th_team; ident_t *loc = team->t.t_ident; @@ -7350,7 +7351,7 @@ // This thread is a new CG root. Set up the proper variables. kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); tmp->cg_root = thr; // Make thr the CG root - // Init to thread limit that was stored when league masters were forked + // Init to thread limit stored when league primary threads were forked tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; tmp->cg_nthreads = 1; // Init counter to one active thread, this one KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" @@ -7456,7 +7457,7 @@ num_threads = 1; } } else { - // This thread will be the master of the league masters + // This thread will be the primary thread of the league primary threads // Store new thread limit; old limit is saved in th_cg_roots list thr->th.th_current_task->td_icvs.thread_limit = num_threads; // num_threads = min(num_threads, nthreads-var) @@ -7698,10 +7699,10 @@ } hot_team = root->r.r_hot_team; if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { - return hot_team->t.t_nproc - 1; // Don't count master thread + return hot_team->t.t_nproc - 1; // Don't count primary thread } - // Skip the master thread - it is accounted for elsewhere. + // Skip the primary thread - it is accounted for elsewhere. retval = 0; for (i = 1; i < hot_team->t.t_nproc; i++) { if (hot_team->t.t_threads[i]->th.th_active) { @@ -7734,8 +7735,8 @@ // Threads that are active in the thread pool, active in the hot team for this // particular root (if we are at the outer par level), and the currently - // executing thread (to become the master) are available to add to the new - // team, but are currently contributing to the system load, and must be + // executing thread (to become the primary thread) are available to add to the + // new team, but are currently contributing to the system load, and must be // accounted for. 
pool_active = __kmp_thread_pool_active_nth; hot_team_active = __kmp_active_hot_team_nproc(root); Index: openmp/runtime/src/kmp_sched.cpp =================================================================== --- openmp/runtime/src/kmp_sched.cpp +++ openmp/runtime/src/kmp_sched.cpp @@ -508,8 +508,8 @@ __kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced); // Unknown static scheduling type. - // only masters of some teams get single iteration, other threads get - // nothing + // only primary threads of some teams get single iteration, other threads + // get nothing if (team_id < trip_count && tid == 0) { *pupper = *pupperDist = *plower = *plower + team_id * incr; } else { Index: openmp/runtime/src/kmp_settings.cpp =================================================================== --- openmp/runtime/src/kmp_settings.cpp +++ openmp/runtime/src/kmp_settings.cpp @@ -3236,11 +3236,12 @@ for (;;) { enum kmp_proc_bind_t bind; - if ((num == (int)proc_bind_master) || - __kmp_match_str("master", buf, &next)) { + if ((num == (int)proc_bind_primary) || + __kmp_match_str("master", buf, &next) || + __kmp_match_str("primary", buf, &next)) { buf = next; SKIP_WS(buf); - bind = proc_bind_master; + bind = proc_bind_primary; } else if ((num == (int)proc_bind_close) || __kmp_match_str("close", buf, &next)) { buf = next; @@ -3308,8 +3309,8 @@ __kmp_str_buf_print(buffer, "true"); break; - case proc_bind_master: - __kmp_str_buf_print(buffer, "master"); + case proc_bind_primary: + __kmp_str_buf_print(buffer, "primary"); break; case proc_bind_close: Index: openmp/runtime/src/kmp_stats.h =================================================================== --- openmp/runtime/src/kmp_stats.h +++ openmp/runtime/src/kmp_stats.h @@ -48,9 +48,9 @@ */ enum stats_flags_e { noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic - onlyInMaster = 1 << 1, //!< statistic is valid only for master + onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread noUnits = 1 << 2, //!< statistic doesn't need units printed next to it - notInMaster = 1 << 3, //!< statistic is valid only for non-master threads + notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads logEvent = 1 << 4 //!< statistic can be logged on the event timeline when //! 
KMP_STATS_EVENTS is on (valid only for timers) }; @@ -103,6 +103,7 @@ macro(OMP_CRITICAL, 0, arg) \ macro(OMP_SINGLE, 0, arg) \ macro(OMP_MASTER, 0, arg) \ + macro(OMP_MASKED, 0, arg) \ macro(OMP_TEAMS, 0, arg) \ macro(OMP_set_lock, 0, arg) \ macro(OMP_test_lock, 0, arg) \ @@ -150,6 +151,7 @@ macro (OMP_critical_wait, 0, arg) \ macro (OMP_single, 0, arg) \ macro (OMP_master, 0, arg) \ + macro (OMP_masked, 0, arg) \ macro (OMP_task_immediate, 0, arg) \ macro (OMP_task_taskwait, 0, arg) \ macro (OMP_task_taskyield, 0, arg) \ @@ -180,8 +182,8 @@ // clang-format on // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either -// initializing OpenMP or being created by a master) -// until the thread is destroyed +// initializing OpenMP or being created by a primary +// thread) until the thread is destroyed // OMP_parallel -- Time thread spends executing work directly // within a #pragma omp parallel // OMP_parallel_overhead -- Time thread spends setting up a parallel region @@ -198,6 +200,7 @@ // a critical section // OMP_single -- Time spent executing a "single" region // OMP_master -- Time spent executing a "master" region +// OMP_masked -- Time spent executing a "masked" region // OMP_task_immediate -- Time spent executing non-deferred tasks // OMP_task_taskwait -- Time spent executing tasks inside a taskwait // construct @@ -710,7 +713,7 @@ to the bar width in the timeline graph. Every thread will have a thread local pointer to its node in - the list. The sentinel node is used by the master thread to + the list. The sentinel node is used by the primary thread to store "dummy" statistics before __kmp_create_worker() is called. **************************************************************** */ class kmp_stats_list { Index: openmp/runtime/src/kmp_stats.cpp =================================================================== --- openmp/runtime/src/kmp_stats.cpp +++ openmp/runtime/src/kmp_stats.cpp @@ -832,10 +832,10 @@ // Accumulate timers. for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) { // See if we should ignore this timer when aggregating - if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on master - // and this thread is worker + if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on + // primary thread and this thread is worker (timeStat::workerOnly(s) && (t == 0)) // Timer only valid on worker - // and this thread is the master + // and this thread is the primary thread ) { continue; } Index: openmp/runtime/src/kmp_tasking.cpp =================================================================== --- openmp/runtime/src/kmp_tasking.cpp +++ openmp/runtime/src/kmp_tasking.cpp @@ -2835,7 +2835,7 @@ if (*thread_finished) { // We need to un-mark this victim as a finished victim. This must be done // before releasing the lock, or else other threads (starting with the - // master victim) might be prematurely released from the barrier!!! + // primary thread victim) might be prematurely released from the barrier!!! kmp_int32 count; count = KMP_ATOMIC_INC(unfinished_threads); @@ -3047,7 +3047,7 @@ } // It is now unsafe to reference thread->th.th_team !!! - // Decrementing task_team->tt.tt_unfinished_threads can allow the master + // Decrementing task_team->tt.tt_unfinished_threads can allow the primary // thread to pass through the barrier, where it might reset each thread's // th.th_team field for the next parallel region. If we can steal more // work, we know that this has not happened yet. 
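/* Aside for context, not part of this patch: a minimal sketch of the
   construct that the new OMP_MASKED counter and OMP_masked timer above are
   meant to account for. masked and its filter clause are OpenMP 5.1
   features; without a filter clause the construct behaves like the older
   master construct. */
#include <omp.h>
#include <stdio.h>

int main(void) {
  #pragma omp parallel num_threads(4)
  {
    // Only the thread selected by the filter expression (thread 0 here,
    // which is also the default) executes the region; in a stats build the
    // time spent inside would be charged to OMP_MASKED / OMP_masked.
    #pragma omp masked filter(0)
    printf("masked region executed by thread %d\n", omp_get_thread_num());
  }
  return 0;
}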
@@ -3060,8 +3060,8 @@ } } - // If this thread's task team is NULL, master has recognized that there are - // no more tasks; bail out + // If this thread's task team is NULL, primary thread has recognized that + // there are no more tasks; bail out if (thread->th.th_task_team == NULL) { KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid)); @@ -3201,7 +3201,7 @@ * After a child * thread checks into a barrier and calls __kmp_release() from * the particular variant of __kmp__barrier_gather(), it can no * longer assume that the kmp_team_t structure is intact (at any moment, the - * master thread may exit the barrier code and free the team data structure, + * primary thread may exit the barrier code and free the team data structure, * and return the threads to the thread pool). * * This does not work with the tasking code, as the thread is still @@ -3210,11 +3210,11 @@ * to each thread in the team, so that it can steal work from it. * * Enter the existence of the kmp_task_team_t struct. It employs a reference - * counting mechanism, and is allocated by the master thread before calling + * counting mechanism, and is allocated by the primary thread before calling * __kmp__release, and then is release by the last thread to * exit __kmp__release at the next barrier. I.e. the lifetimes * of the kmp_task_team_t structs for consecutive barriers can overlap - * (and will, unless the master thread is the last thread to exit the barrier + * (and will, unless the primary thread is the last thread to exit the barrier * release phase, which is not typical). The existence of such a struct is * useful outside the context of tasking. * @@ -3586,8 +3586,8 @@ (always || team->t.t_nproc > 1)) { team->t.t_task_team[this_thr->th.th_task_state] = __kmp_allocate_task_team(this_thr, team); - KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p " - "for team %d at parity=%d\n", + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p" + " for team %d at parity=%d\n", __kmp_gtid_from_thread(this_thr), team->t.t_task_team[this_thr->th.th_task_state], ((team != NULL) ? team->t.t_id : -1), @@ -3599,14 +3599,14 @@ // threads spin in the barrier release phase, they will continue to use the // previous task_team struct(above), until they receive the signal to stop // checking for tasks (they can't safely reference the kmp_team_t struct, - // which could be reallocated by the master thread). No task teams are formed + // which could be reallocated by the primary thread). No task teams are formed // for serialized teams. if (team->t.t_nproc > 1) { int other_team = 1 - this_thr->th.th_task_state; if (team->t.t_task_team[other_team] == NULL) { // setup other team as well team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team); - KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new " + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new " "task_team %p for team %d at parity=%d\n", __kmp_gtid_from_thread(this_thr), team->t.t_task_team[other_team], @@ -3625,7 +3625,7 @@ } // if team size has changed, the first thread to enable tasking will // realloc threads_data if necessary - KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team " + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team " "%p for team %d at parity=%d\n", __kmp_gtid_from_thread(this_thr), team->t.t_task_team[other_team], @@ -3675,12 +3675,12 @@ ((team != NULL) ? 
                team->t.t_id : -1), this_thr->th.th_task_state));
 }
 
-// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
-// barrier gather phase. Only called by master thread if #threads in team > 1 or
-// if proxy tasks were created.
+// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
+// barrier gather phase. Only called by primary thread if #threads in team > 1
+// or if proxy tasks were created.
 //
 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
-// by passing in 0 optionally as the last argument. When wait is zero, master
+// by passing in 0 optionally as the last argument. When wait is zero, primary
 // thread does not wait for unfinished_threads to reach 0.
 void __kmp_task_team_wait(
     kmp_info_t *this_thr,
@@ -3692,12 +3692,12 @@
 
   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
     if (wait) {
-      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
+      KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
                     "(for unfinished_threads to reach 0) on task_team = %p\n",
                     __kmp_gtid_from_thread(this_thr), task_team));
       // Worker threads may have dropped through to release phase, but could
       // still be executing tasks. Wait here for tasks to complete. To avoid
-      // memory contention, only master thread checks termination condition.
+      // memory contention, only primary thread checks termination condition.
       kmp_flag_32 flag(
           RCAST(std::atomic<kmp_uint32> *,
                 &task_team->tt.tt_unfinished_threads),
@@ -3708,7 +3708,7 @@
       // referencing it while spinning.
       KA_TRACE(
           20,
-          ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
+          ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
           "setting active to false, setting local and team's pointer to NULL\n",
           __kmp_gtid_from_thread(this_thr), task_team));
       KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
Index: openmp/runtime/src/kmp_threadprivate.cpp
===================================================================
--- openmp/runtime/src/kmp_threadprivate.cpp
+++ openmp/runtime/src/kmp_threadprivate.cpp
@@ -169,7 +169,7 @@
   struct shared_common *d_tn;
 
   /* C++ destructors need to be called once per thread before exiting.
-     Don't call destructors for master thread though unless we used copy
+     Don't call destructors for primary thread though unless we used copy
      constructor */
 
   for (d_tn = __kmp_threadprivate_d_table.data[q]; d_tn;
@@ -451,15 +451,16 @@
     return tn;
 
   /* if C++ object with copy constructor, use it;
-   * else if C++ object with constructor, use it for the non-master copies only;
+   * else if C++ object with constructor, use it for the non-primary thread
+     copies only;
    * else use pod_init and memcpy
    *
-   * C++ constructors need to be called once for each non-master thread on
+   * C++ constructors need to be called once for each non-primary thread on
    * allocate
    * C++ copy constructors need to be called once for each thread on allocate */
 
   /* C++ object with constructors/destructors; don't call constructors for
-     master thread though */
+     primary thread though */
   if (d_tn->is_vec) {
     if (d_tn->ct.ctorv != 0) {
       (void)(*d_tn->ct.ctorv)(tn->par_addr, d_tn->vec_len);
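/* Aside for context, not part of this patch: a small sketch of the
   constructor/destructor rules the threadprivate code above implements. The
   primary thread keeps using the original object, each worker gets a private
   copy, the registered constructor runs once for every non-primary copy, and
   the registered destructor runs for the copies the runtime created. */
#include <omp.h>
#include <cstdio>

struct Counter {
  int hits;
  Counter() : hits(0) {
    std::printf("ctor on thread %d\n", omp_get_thread_num());
  }
  ~Counter() {
    std::printf("dtor on thread %d\n", omp_get_thread_num());
  }
};

static Counter tp_counter;
#pragma omp threadprivate(tp_counter)

int main() {
  #pragma omp parallel num_threads(4)
  tp_counter.hits++; // each thread increments its own private copy
  return 0;
}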