Index: runtime/CMakeLists.txt
===================================================================
--- runtime/CMakeLists.txt
+++ runtime/CMakeLists.txt
@@ -321,12 +321,11 @@
 # OMPT-support
 set(LIBOMP_OMPT_DEBUG FALSE CACHE BOOL
   "Trace OMPT initialization?")
+# After testing, turn OMPT support on by default for OpenMP 5.0 and higher.
 set(LIBOMP_OMPT_SUPPORT FALSE CACHE BOOL
   "OMPT-support?")
-set(LIBOMP_OMPT_BLAME TRUE CACHE BOOL
-  "OMPT-blame?")
-set(LIBOMP_OMPT_TRACE TRUE CACHE BOOL
-  "OMPT-trace?")
+set(LIBOMP_OMPT_OPTIONAL TRUE CACHE BOOL
+  "OMPT-optional?")
 if(LIBOMP_OMPT_SUPPORT AND (NOT LIBOMP_HAVE_OMPT_SUPPORT))
   libomp_error_say("OpenMP Tools Interface requested but not available in this implementation")
 endif()
@@ -396,8 +395,7 @@
   libomp_say("Use ITT notify       -- ${LIBOMP_USE_ITT_NOTIFY}")
   libomp_say("Use OMPT-support     -- ${LIBOMP_OMPT_SUPPORT}")
   if(${LIBOMP_OMPT_SUPPORT})
-    libomp_say("Use OMPT-blame       -- ${LIBOMP_OMPT_BLAME}")
-    libomp_say("Use OMPT-trace       -- ${LIBOMP_OMPT_TRACE}")
+    libomp_say("Use OMPT-optional    -- ${LIBOMP_OMPT_OPTIONAL}")
   endif()
   libomp_say("Use Adaptive locks   -- ${LIBOMP_USE_ADAPTIVE_LOCKS}")
   libomp_say("Use quad precision   -- ${LIBOMP_USE_QUAD_PRECISION}")
Index: runtime/src/exports_so.txt
===================================================================
--- runtime/src/exports_so.txt
+++ runtime/src/exports_so.txt
@@ -25,8 +25,7 @@
         #
         # OMPT API
         #
-        ompt_tool;           # OMPT initialization interface
-        ompt_control;        # OMPT control interface
+        ompt_start_tool;     # OMPT start interface
 
         # icc drops weak attribute at linking step without the following line:
         Annotate*;           # TSAN annotation
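
A note on the tool side: with this change a tool no longer provides ompt_tool()/ompt_control(); it provides a single ompt_start_tool entry point that the runtime resolves at startup. A minimal sketch of the declaration a tool translation unit would carry (the authoritative signature and the ompt_fns_t type appear in the ompt.h.var hunks below; the visibility attribute is only an assumption about how a shared-library tool would export the symbol):

  /* Tool-side entry point replacing ompt_tool()/ompt_control(). */
  #ifdef _WIN32
  __declspec(dllexport)
  #else
  __attribute__((visibility("default")))
  #endif
  ompt_fns_t *ompt_start_tool(unsigned int omp_version,
                              const char *runtime_version);

A fuller skeleton that also registers callbacks is sketched after the initialization hunk in ompt.h.var.
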
Index: runtime/src/include/50/omp.h.var
===================================================================
--- runtime/src/include/50/omp.h.var
+++ runtime/src/include/50/omp.h.var
@@ -182,6 +182,23 @@
     extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
     extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
 
+    /* OpenMP 5.0 Tool Control */
+    typedef enum omp_control_tool_result_t {
+        omp_control_tool_notool = -2,
+        omp_control_tool_nocallback = -1,
+        omp_control_tool_success = 0,
+        omp_control_tool_ignored = 1
+    } omp_control_tool_result_t;
+
+    typedef enum omp_control_tool_t {
+        omp_control_tool_start = 1,
+        omp_control_tool_pause = 2,
+        omp_control_tool_flush = 3,
+        omp_control_tool_end = 4
+    } omp_control_tool_t;
+
+    extern int __KAI_KMPC_CONVENTION omp_control_tool(int, int, void*);
+
 #   undef __KAI_KMPC_CONVENTION
 
     /* Warning:
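
For reference, application code can drive an attached tool through the new entry point declared above; this is a minimal hedged sketch using only the enums added in this hunk:

  #include <omp.h>
  #include <stdio.h>

  int main(void) {
    /* Ask an attached tool to pause; the last argument is tool-defined and
       may be NULL. */
    int rc = omp_control_tool(omp_control_tool_pause, 0, NULL);
    if (rc == omp_control_tool_notool)
      printf("no tool attached\n");
    else if (rc == omp_control_tool_success)
      printf("tool acknowledged the pause request\n");
    return 0;
  }
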
Index: runtime/src/include/50/omp_lib.h.var
===================================================================
--- runtime/src/include/50/omp_lib.h.var
+++ runtime/src/include/50/omp_lib.h.var
@@ -29,6 +29,8 @@
       integer, parameter :: kmp_size_t_kind        = int_ptr_kind()
       integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
       integer, parameter :: omp_lock_hint_kind     = omp_integer_kind
+      integer, parameter :: omp_control_tool_kind  = omp_integer_kind
+      integer, parameter :: omp_control_tool_result_kind = omp_integer_kind
 
       integer (kind=omp_integer_kind), parameter :: openmp_version    = @LIBOMP_OMP_YEAR_MONTH@
       integer (kind=omp_integer_kind), parameter :: kmp_version_major = @LIBOMP_VERSION_MAJOR@
@@ -57,6 +59,16 @@
       integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_rtm            = 131072
       integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 262144
 
+      integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_start = 1
+      integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_pause = 2
+      integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_flush = 3
+      integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_end = 4
+
+      integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_notool = -2
+      integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_nocallback = -1
+      integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_success = 0
+      integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_ignored = 1
+
       interface
 
 !       ***
@@ -494,6 +506,13 @@
           integer (kind=omp_lock_hint_kind), value :: hint
         end subroutine omp_init_nest_lock_with_hint
 
+        function omp_control_tool(command, modifier) bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_control_tool
+          integer (kind=omp_control_tool_kind), value :: command
+          integer (kind=omp_control_tool_kind), value :: modifier
+        end function omp_control_tool
+
       end interface
 
 !DIR$ IF DEFINED (__INTEL_OFFLOAD)
Index: runtime/src/include/50/omp_lib.f.var
===================================================================
--- runtime/src/include/50/omp_lib.f.var
+++ runtime/src/include/50/omp_lib.f.var
@@ -32,6 +32,8 @@
         integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
         integer, parameter :: kmp_cancel_kind        = omp_integer_kind
         integer, parameter :: omp_lock_hint_kind     = omp_integer_kind
+        integer, parameter :: omp_control_tool_kind  = omp_integer_kind
+        integer, parameter :: omp_control_tool_result_kind = omp_integer_kind
 
       end module omp_lib_kinds
 
@@ -518,6 +520,13 @@
             integer (kind=omp_lock_hint_kind) hint
           end subroutine omp_init_nest_lock_with_hint
 
+          function omp_control_tool(command, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_control_tool
+            integer (kind=omp_control_tool_kind) command
+            integer (kind=omp_control_tool_kind) modifier
+          end function omp_control_tool
+
         end interface
 
 !dec$ if defined(_WIN32)
@@ -563,6 +572,7 @@
 !dec$ attributes alias:'OMP_GET_CANCELLATION' :: omp_get_cancellation
 !dec$ attributes alias:'OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
 !dec$ attributes alias:'OMP_GET_MAX_TASK_PRIORITY' :: omp_get_max_task_priority
+!dec$ attributes alias:'OMP_CONTROL_TOOL' :: omp_control_tool
 
 !dec$ attributes alias:'omp_init_lock' :: omp_init_lock
 !dec$ attributes alias:'omp_init_lock_with_hint' :: omp_init_lock_with_hint
@@ -643,6 +653,7 @@
 !dec$ attributes alias:'_OMP_GET_CANCELLATION' :: omp_get_cancellation
 !dec$ attributes alias:'_OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
 !dec$ attributes alias:'_OMP_GET_MAX_TASK_PRIORTY' :: omp_get_max_task_priority
+!dec$ attributes alias:'_OMP_CONTROL_TOOL' :: omp_control_tool
 
 !dec$ attributes alias:'_omp_init_lock' :: omp_init_lock
 !dec$ attributes alias:'_omp_init_lock_with_hint' :: omp_init_lock_with_hint
@@ -739,6 +750,7 @@
 !dec$ attributes alias:'omp_set_nest_lock_'::omp_set_nest_lock
 !dec$ attributes alias:'omp_unset_nest_lock_'::omp_unset_nest_lock
 !dec$ attributes alias:'omp_test_nest_lock_'::omp_test_nest_lock
+!dec$ attributes alias:'omp_control_tool_'::omp_control_tool
 
 !dec$ attributes alias:'kmp_set_stacksize_'::kmp_set_stacksize
 !dec$ attributes alias:'kmp_set_stacksize_s_'::kmp_set_stacksize_s
@@ -818,6 +830,7 @@
 !dec$ attributes alias:'_omp_set_nest_lock_'::omp_set_nest_lock
 !dec$ attributes alias:'_omp_unset_nest_lock_'::omp_unset_nest_lock
 !dec$ attributes alias:'_omp_test_nest_lock_'::omp_test_nest_lock
+!dec$ attributes alias:'_omp_control_tool_'::omp_control_tool
 
 !dec$ attributes alias:'_kmp_set_stacksize_'::kmp_set_stacksize
 !dec$ attributes alias:'_kmp_set_stacksize_s_'::kmp_set_stacksize_s
Index: runtime/src/include/50/omp_lib.f90.var
===================================================================
--- runtime/src/include/50/omp_lib.f90.var
+++ runtime/src/include/50/omp_lib.f90.var
@@ -28,6 +28,8 @@
         integer, parameter :: kmp_affinity_mask_kind = c_intptr_t
         integer, parameter :: kmp_cancel_kind        = omp_integer_kind
         integer, parameter :: omp_lock_hint_kind     = omp_integer_kind
+        integer, parameter :: omp_control_tool_kind  = omp_integer_kind
+        integer, parameter :: omp_control_tool_result_kind = omp_integer_kind
 
       end module omp_lib_kinds
 
@@ -68,6 +70,16 @@
         integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_rtm            = 131072
         integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 262144
 
+        integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_start = 1
+        integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_pause = 2
+        integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_flush = 3
+        integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_end = 4
+
+        integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_notool = -2
+        integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_nocallback = -1
+        integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_success = 0
+        integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_ignored = 1
+
         interface
 
 !         ***
@@ -519,6 +531,13 @@
             integer (kind=omp_lock_hint_kind), value :: hint
           end subroutine omp_init_nest_lock_with_hint
 
+          function omp_control_tool(command, modifier) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_control_tool
+            integer (kind=omp_control_tool_kind), value :: command
+            integer (kind=omp_control_tool_kind), value :: modifier
+          end function omp_control_tool
+
         end interface
 
       end module omp_lib
Index: runtime/src/include/50/ompt.h.var
===================================================================
--- runtime/src/include/50/ompt.h.var
+++ runtime/src/include/50/ompt.h.var
@@ -10,6 +10,7 @@
  *****************************************************************************/
 
 #include <stdint.h>
+#include <stddef.h>
 
 
 
@@ -17,21 +18,28 @@
  * iteration macros
  *****************************************************************************/
 
-#define FOREACH_OMPT_INQUIRY_FN(macro)  \
-    macro (ompt_enumerate_state)        \
-                                        \
-    macro (ompt_set_callback)           \
-    macro (ompt_get_callback)           \
-                                        \
-    macro (ompt_get_idle_frame)         \
-    macro (ompt_get_task_frame)         \
-                                        \
-    macro (ompt_get_state)              \
-                                        \
-    macro (ompt_get_parallel_id)        \
-    macro (ompt_get_parallel_team_size) \
-    macro (ompt_get_task_id)            \
-    macro (ompt_get_thread_id)
+#define FOREACH_OMPT_INQUIRY_FN(macro)      \
+    macro (ompt_enumerate_states)           \
+    macro (ompt_enumerate_mutex_impls)      \
+                                            \
+    macro (ompt_set_callback)               \
+    macro (ompt_get_callback)               \
+                                            \
+    macro (ompt_get_state)                  \
+                                            \
+    macro (ompt_get_parallel_info)          \
+    macro (ompt_get_task_info)              \
+    macro (ompt_get_thread_data)            \
+    macro (ompt_get_unique_id)              \
+                                            \
+    macro(ompt_get_num_places)              \
+    macro(ompt_get_place_proc_ids)          \
+    macro(ompt_get_place_num)               \
+    macro(ompt_get_partition_place_nums)    \
+    macro(ompt_get_proc_id)                 \
+                                            \
+    macro(ompt_get_target_info)             \
+    macro(ompt_get_num_devices)
 
 #define FOREACH_OMPT_PLACEHOLDER_FN(macro)  \
     macro (ompt_idle)                       \
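
The inquiry-function list above is meant to be consumed with the usual X-macro pattern. A hedged sketch of how a tool could bind every inquiry entry point through the lookup function it receives at initialization; it assumes, as in previous revisions of this header, that OMPT_API_FUNCTION (defined elsewhere in ompt.h.var) generates a matching <name>_t pointer typedef for each entry point, and bind_inquiry_fns is a hypothetical helper:

  /* One function pointer per inquiry entry point. */
  #define ompt_decl_inquiry_fn(fn) static fn##_t fn;
  FOREACH_OMPT_INQUIRY_FN(ompt_decl_inquiry_fn)
  #undef ompt_decl_inquiry_fn

  /* Resolve all of them through the lookup callback passed to ompt_initialize. */
  static void bind_inquiry_fns(ompt_function_lookup_t lookup) {
  #define ompt_bind_inquiry_fn(fn) fn = (fn##_t)lookup(#fn);
    FOREACH_OMPT_INQUIRY_FN(ompt_bind_inquiry_fn)
  #undef ompt_bind_inquiry_fn
  }
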
@@ -40,141 +48,107 @@
     macro (ompt_task_wait)                  \
     macro (ompt_mutex_wait)
 
-#define FOREACH_OMPT_STATE(macro)                                                               \
+#define FOREACH_OMP_STATE(macro)                                                                \
                                                                                                 \
-    /* first */                                                                                 \
-    macro (ompt_state_first, 0x71)          /* initial enumeration state */                     \
+    /* first available state */                                                                 \
+    macro (omp_state_undefined, 0x102)      /* undefined thread state */                        \
                                                                                                 \
     /* work states (0..15) */                                                                   \
-    macro (ompt_state_work_serial, 0x00)    /* working outside parallel */                      \
-    macro (ompt_state_work_parallel, 0x01)  /* working within parallel */                       \
-    macro (ompt_state_work_reduction, 0x02) /* performing a reduction */                        \
+    macro (omp_state_work_serial, 0x000)    /* working outside parallel */                      \
+    macro (omp_state_work_parallel, 0x001)  /* working within parallel */                       \
+    macro (omp_state_work_reduction, 0x002) /* performing a reduction */                        \
                                                                                                 \
-    /* idle (16..31) */                                                                         \
-    macro (ompt_state_idle, 0x10)            /* waiting for work */                             \
+    /* barrier wait states (16..31) */                                                          \
+    macro (omp_state_wait_barrier, 0x010)   /* waiting at a barrier */                          \
+    macro (omp_state_wait_barrier_implicit_parallel, 0x011)                                     \
+                                            /* implicit barrier at the end of parallel region */\
+    macro (omp_state_wait_barrier_implicit_workshare, 0x012)                                    \
+                                            /* implicit barrier at the end of worksharing */    \
+    macro (omp_state_wait_barrier_implicit, 0x013)  /* implicit barrier */                      \
+    macro (omp_state_wait_barrier_explicit, 0x014)  /* explicit barrier */                      \
                                                                                                 \
-    /* overhead states (32..63) */                                                              \
-    macro (ompt_state_overhead, 0x20)        /* overhead excluding wait states */               \
+    /* task wait states (32..63) */                                                             \
+    macro (omp_state_wait_taskwait, 0x020)  /* waiting at a taskwait */                         \
+    macro (omp_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */                        \
                                                                                                 \
-    /* barrier wait states (64..79) */                                                          \
-    macro (ompt_state_wait_barrier, 0x40)    /* waiting at a barrier */                         \
-    macro (ompt_state_wait_barrier_implicit, 0x41)    /* implicit barrier */                    \
-    macro (ompt_state_wait_barrier_explicit, 0x42)    /* explicit barrier */                    \
+    /* mutex wait states (64..127) */                                                           \
+    macro (omp_state_wait_mutex, 0x040)                                                         \
+    macro (omp_state_wait_lock, 0x041)      /* waiting for lock */                              \
+    macro (omp_state_wait_critical, 0x042)  /* waiting for critical */                          \
+    macro (omp_state_wait_atomic, 0x043)    /* waiting for atomic */                            \
+    macro (omp_state_wait_ordered, 0x044)   /* waiting for ordered */                           \
                                                                                                 \
-    /* task wait states (80..95) */                                                             \
-    macro (ompt_state_wait_taskwait, 0x50)   /* waiting at a taskwait */                        \
-    macro (ompt_state_wait_taskgroup, 0x51)  /* waiting at a taskgroup */                       \
+    /* target wait states (128..255) */                                                         \
+    macro (omp_state_wait_target, 0x080)        /* waiting for target region */                 \
+    macro (omp_state_wait_target_map, 0x081)    /* waiting for target data mapping operation */ \
+    macro (omp_state_wait_target_update, 0x082) /* waiting for target update operation */       \
                                                                                                 \
-    /* mutex wait states (96..111) */                                                           \
-    macro (ompt_state_wait_lock, 0x60)       /* waiting for lock */                             \
-    macro (ompt_state_wait_nest_lock, 0x61)  /* waiting for nest lock */                        \
-    macro (ompt_state_wait_critical, 0x62)   /* waiting for critical */                         \
-    macro (ompt_state_wait_atomic, 0x63)     /* waiting for atomic */                           \
-    macro (ompt_state_wait_ordered, 0x64)    /* waiting for ordered */                          \
-    macro (ompt_state_wait_single, 0x6F)     /* waiting for single region (non-standard!) */    \
+    /* misc (256..511) */                                                                       \
+    macro (omp_state_idle, 0x100)           /* waiting for work */                              \
+    macro (omp_state_overhead, 0x101)       /* overhead excluding wait states */                \
                                                                                                 \
-    /* misc (112..127) */                                                                       \
-    macro (ompt_state_undefined, 0x70)       /* undefined thread state */
+    /* implementation-specific states (512..) */
 
 
+#define FOREACH_OMPT_MUTEX_IMPL(macro)                                                \
+    macro (ompt_mutex_impl_unknown, 0)      /* unknown implementation */             \
+    macro (ompt_mutex_impl_spin, 1)         /* based on spin */                       \
+    macro (ompt_mutex_impl_queuing, 2)      /* based on some fair policy */           \
+    macro (ompt_mutex_impl_speculative, 3)  /* based on HW-supported speculation */
+
 #define FOREACH_OMPT_EVENT(macro)                                                                               \
                                                                                                                 \
     /*--- Mandatory Events ---*/                                                                                \
-    macro (ompt_event_parallel_begin,           ompt_new_parallel_callback_t,   1) /* parallel begin */         \
-    macro (ompt_event_parallel_end,             ompt_end_parallel_callback_t,   2) /* parallel end */           \
-                                                                                                                \
-    macro (ompt_event_task_begin,               ompt_new_task_callback_t,       3) /* task begin */             \
-    macro (ompt_event_task_end,                 ompt_task_callback_t,           4) /* task destroy */           \
-                                                                                                                \
-    macro (ompt_event_thread_begin,             ompt_thread_type_callback_t,    5) /* thread begin */           \
-    macro (ompt_event_thread_end,               ompt_thread_type_callback_t,    6) /* thread end */             \
+    macro (ompt_callback_thread_begin,          ompt_callback_thread_begin_t,   1) /* thread begin */           \
+    macro (ompt_callback_thread_end,            ompt_callback_thread_end_t,     2) /* thread end */             \
                                                                                                                 \
-    macro (ompt_event_control,                  ompt_control_callback_t,        7) /* support control calls */  \
+    macro (ompt_callback_parallel_begin,        ompt_callback_parallel_begin_t, 3) /* parallel begin */         \
+    macro (ompt_callback_parallel_end,          ompt_callback_parallel_end_t,   4) /* parallel end */           \
                                                                                                                 \
-    macro (ompt_event_runtime_shutdown,         ompt_callback_t,                8) /* runtime shutdown */       \
-                                                                                                                \
-    /*--- Optional Events (blame shifting, ompt_event_unimplemented) ---*/                                      \
-    macro (ompt_event_idle_begin,               ompt_thread_callback_t,         9) /* begin idle state */       \
-    macro (ompt_event_idle_end,                 ompt_thread_callback_t,        10) /* end idle state */         \
+    macro (ompt_callback_task_create,           ompt_callback_task_create_t,    5) /* task begin */             \
+    macro (ompt_callback_task_schedule,         ompt_callback_task_schedule_t,  6) /* task schedule */          \
+    macro (ompt_callback_implicit_task,         ompt_callback_implicit_task_t,  7) /* implicit task   */        \
                                                                                                                 \
-    macro (ompt_event_wait_barrier_begin,       ompt_parallel_callback_t,      11) /* begin wait at barrier */  \
-    macro (ompt_event_wait_barrier_end,         ompt_parallel_callback_t,      12) /* end wait at barrier */    \
+    macro (ompt_callback_target,                ompt_callback_target_t,         8) /* target */                 \
+    macro (ompt_callback_target_data_op,        ompt_callback_target_data_op_t, 9) /* target data op */         \
+    macro (ompt_callback_target_submit,         ompt_callback_target_submit_t, 10) /* target submit */          \
                                                                                                                 \
-    macro (ompt_event_wait_taskwait_begin,      ompt_parallel_callback_t,      13) /* begin wait at taskwait */ \
-    macro (ompt_event_wait_taskwait_end,        ompt_parallel_callback_t,      14) /* end wait at taskwait */   \
+    macro (ompt_callback_control_tool,          ompt_callback_control_tool_t,  11) /* control tool */           \
                                                                                                                 \
-    macro (ompt_event_wait_taskgroup_begin,     ompt_parallel_callback_t,      15) /* begin wait at taskgroup */\
-    macro (ompt_event_wait_taskgroup_end,       ompt_parallel_callback_t,      16) /* end wait at taskgroup */  \
+    macro (ompt_callback_device_initialize,     ompt_callback_device_initialize_t, 12) /* device initialize */  \
+    macro (ompt_callback_device_finalize,       ompt_callback_device_finalize_t, 13)   /* device finalize   */  \
                                                                                                                 \
-    macro (ompt_event_release_lock,             ompt_wait_callback_t,          17) /* lock release */           \
-    macro (ompt_event_release_nest_lock_last,   ompt_wait_callback_t,          18) /* last nest lock release */ \
-    macro (ompt_event_release_critical,         ompt_wait_callback_t,          19) /* critical release */       \
+    /*--- Optional Events (blame shifting, ompt_event_unimplemented) ---*/                                      \
                                                                                                                 \
-    macro (ompt_event_release_atomic,           ompt_wait_callback_t,          20) /* atomic release */         \
+    macro (ompt_callback_sync_region_wait,      ompt_callback_sync_region_t,   14) /* sync region wait begin or end */ \
                                                                                                                 \
-    macro (ompt_event_release_ordered,          ompt_wait_callback_t,          21) /* ordered release */        \
+    macro (ompt_callback_mutex_released,        ompt_callback_mutex_t,         15) /* mutex released */         \
                                                                                                                 \
     /*--- Optional Events (synchronous events, ompt_event_unimplemented) --- */                                 \
-    macro (ompt_event_implicit_task_begin,      ompt_parallel_callback_t,      22) /* implicit task begin   */  \
-    macro (ompt_event_implicit_task_end,        ompt_parallel_callback_t,      23) /* implicit task end  */     \
-                                                                                                                \
-    macro (ompt_event_initial_task_begin,       ompt_parallel_callback_t,      24) /* initial task begin   */   \
-    macro (ompt_event_initial_task_end,         ompt_parallel_callback_t,      25) /* initial task end  */      \
-                                                                                                                \
-    macro (ompt_event_task_switch,              ompt_task_pair_callback_t,     26) /* task switch */            \
-                                                                                                                \
-    macro (ompt_event_loop_begin,               ompt_new_workshare_callback_t, 27) /* task at loop begin */     \
-    macro (ompt_event_loop_end,                 ompt_parallel_callback_t,      28) /* task at loop end */       \
-                                                                                                                \
-    macro (ompt_event_sections_begin,           ompt_new_workshare_callback_t, 29) /* task at sections begin  */\
-    macro (ompt_event_sections_end,             ompt_parallel_callback_t,      30) /* task at sections end */   \
                                                                                                                 \
-    macro (ompt_event_single_in_block_begin,    ompt_new_workshare_callback_t, 31) /* task at single begin*/    \
-    macro (ompt_event_single_in_block_end,      ompt_parallel_callback_t,      32) /* task at single end */     \
+    macro (ompt_callback_task_dependences,      ompt_callback_task_dependences_t, 16) /* report task dependences */\
+    macro (ompt_callback_task_dependence,       ompt_callback_task_dependence_t, 17) /* report task dependence */\
                                                                                                                 \
-    macro (ompt_event_single_others_begin,      ompt_parallel_callback_t,      33) /* task at single begin */   \
-    macro (ompt_event_single_others_end,        ompt_parallel_callback_t,      34) /* task at single end */     \
+    macro (ompt_callback_work,                  ompt_callback_work_t,          18) /* task at work begin or end */\
                                                                                                                 \
-    macro (ompt_event_workshare_begin,          ompt_new_workshare_callback_t, 35) /* task at workshare begin */\
-    macro (ompt_event_workshare_end,            ompt_parallel_callback_t,      36) /* task at workshare end */  \
+    macro (ompt_callback_master,                ompt_callback_master_t,        19) /* task at master begin or end */\
                                                                                                                 \
-    macro (ompt_event_master_begin,             ompt_parallel_callback_t,      37) /* task at master begin */   \
-    macro (ompt_event_master_end,               ompt_parallel_callback_t,      38) /* task at master end */     \
+    macro (ompt_callback_target_map,            ompt_callback_target_map_t,    20) /* target map */             \
                                                                                                                 \
-    macro (ompt_event_barrier_begin,            ompt_parallel_callback_t,      39) /* task at barrier begin  */ \
-    macro (ompt_event_barrier_end,              ompt_parallel_callback_t,      40) /* task at barrier end */    \
+    macro (ompt_callback_sync_region,           ompt_callback_sync_region_t,   21) /* sync region begin or end */ \
                                                                                                                 \
-    macro (ompt_event_taskwait_begin,           ompt_parallel_callback_t,      41) /* task at taskwait begin */ \
-    macro (ompt_event_taskwait_end,             ompt_parallel_callback_t,      42) /* task at task wait end */  \
+    macro (ompt_callback_lock_init,             ompt_callback_mutex_acquire_t, 22) /* lock init */              \
+    macro (ompt_callback_lock_destroy,          ompt_callback_mutex_t,         23) /* lock destroy */           \
                                                                                                                 \
-    macro (ompt_event_taskgroup_begin,          ompt_parallel_callback_t,      43) /* task at taskgroup begin */\
-    macro (ompt_event_taskgroup_end,            ompt_parallel_callback_t,      44) /* task at taskgroup end */  \
+    macro (ompt_callback_mutex_acquire,         ompt_callback_mutex_acquire_t, 24) /* mutex acquire */          \
+    macro (ompt_callback_mutex_acquired,        ompt_callback_mutex_t,         25) /* mutex acquired */         \
                                                                                                                 \
-    macro (ompt_event_release_nest_lock_prev,   ompt_wait_callback_t,          45) /* prev nest lock release */ \
+    macro (ompt_callback_nest_lock,             ompt_callback_nest_lock_t,     26) /* nest lock */              \
                                                                                                                 \
-    macro (ompt_event_wait_lock,                ompt_wait_callback_t,          46) /* lock wait */              \
-    macro (ompt_event_wait_nest_lock,           ompt_wait_callback_t,          47) /* nest lock wait */         \
-    macro (ompt_event_wait_critical,            ompt_wait_callback_t,          48) /* critical wait */          \
-    macro (ompt_event_wait_atomic,              ompt_wait_callback_t,          49) /* atomic wait */            \
-    macro (ompt_event_wait_ordered,             ompt_wait_callback_t,          50) /* ordered wait */           \
+    macro (ompt_callback_flush,                 ompt_callback_flush_t,         27) /* after executing flush */  \
                                                                                                                 \
-    macro (ompt_event_acquired_lock,            ompt_wait_callback_t,          51) /* lock acquired */          \
-    macro (ompt_event_acquired_nest_lock_first, ompt_wait_callback_t,          52) /* 1st nest lock acquired */ \
-    macro (ompt_event_acquired_nest_lock_next,  ompt_wait_callback_t,          53) /* next nest lock acquired*/ \
-    macro (ompt_event_acquired_critical,        ompt_wait_callback_t,          54) /* critical acquired */      \
-    macro (ompt_event_acquired_atomic,          ompt_wait_callback_t,          55) /* atomic acquired */        \
-    macro (ompt_event_acquired_ordered,         ompt_wait_callback_t,          56) /* ordered acquired */       \
-                                                                                                                \
-    macro (ompt_event_init_lock,                ompt_wait_callback_t,          57) /* lock init */              \
-    macro (ompt_event_init_nest_lock,           ompt_wait_callback_t,          58) /* nest lock init */         \
-                                                                                                                \
-    macro (ompt_event_destroy_lock,             ompt_wait_callback_t,          59) /* lock destruction */       \
-    macro (ompt_event_destroy_nest_lock,        ompt_wait_callback_t,          60) /* nest lock destruction */  \
-                                                                                                                \
-    macro (ompt_event_flush,                    ompt_callback_t,               61) /* after executing flush */  \
-                                                                                                                \
-    macro (ompt_event_task_dependences,         ompt_task_dependences_callback_t, 69) /* report task dependences  */\
-    macro (ompt_event_task_dependence_pair,     ompt_task_pair_callback_t,     70) /* report task dependence pair */
+    macro (ompt_callback_cancel,                ompt_callback_cancel_t,        28) /* cancel innermost binding region */\
+    macro (ompt_callback_idle,                  ompt_callback_idle_t,          29) /* begin or end idle state */\
 
 
 
@@ -186,18 +160,20 @@
  * identifiers
  *---------------------*/
 
-typedef uint64_t ompt_thread_id_t;
-#define ompt_thread_id_none ((ompt_thread_id_t) 0)     /* non-standard */
+typedef uint64_t ompt_id_t;
+#define ompt_id_none 0
 
-typedef uint64_t ompt_task_id_t;
-#define ompt_task_id_none ((ompt_task_id_t) 0)         /* non-standard */
+typedef union ompt_data_u {
+  uint64_t value; /* data initialized by runtime to unique id */
+  void *ptr;      /* pointer under tool control */
+} ompt_data_t;
 
-typedef uint64_t ompt_parallel_id_t;
-#define ompt_parallel_id_none ((ompt_parallel_id_t) 0) /* non-standard */
+static const ompt_data_t ompt_data_none = {0};
 
 typedef uint64_t ompt_wait_id_t;
-#define ompt_wait_id_none ((ompt_wait_id_t) 0)         /* non-standard */
+static const ompt_wait_id_t ompt_wait_id_none = 0;
 
+typedef void ompt_device_t;
 
 /*---------------------
  * ompt_frame_t
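
The ompt_data_t slots replace the old integer ids throughout the interface; per the union comments above, the runtime may pre-fill .value with a unique id and the tool may overwrite the slot with its own pointer. A small hedged sketch of both uses inside a thread-begin callback (tool_thread_state_t is a hypothetical tool-side structure; the callback signature comes from the hunks below):

  #include <stdint.h>
  #include <stdlib.h>

  typedef struct tool_thread_state {  /* hypothetical per-thread bookkeeping */
    uint64_t runtime_id;
    uint64_t events_seen;
  } tool_thread_state_t;

  static void on_thread_begin(ompt_thread_type_t thread_type,
                              ompt_data_t *thread_data) {
    tool_thread_state_t *state = malloc(sizeof(*state));
    state->runtime_id = thread_data->value; /* id chosen by the runtime */
    state->events_seen = 0;
    thread_data->ptr = state;               /* slot is now under tool control */
  }
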
@@ -235,35 +211,44 @@
  *---------------------*/
 
 typedef enum {
-#define ompt_state_macro(state, code) state = code,
-    FOREACH_OMPT_STATE(ompt_state_macro)
-#undef ompt_state_macro
-} ompt_state_t;
+#define omp_state_macro(state, code) state = code,
+    FOREACH_OMP_STATE(omp_state_macro)
+#undef omp_state_macro
+} omp_state_t;
 
 
 /*---------------------
  * runtime events
  *---------------------*/
 
-typedef enum {
+typedef enum ompt_callbacks_e{
 #define ompt_event_macro(event, callback, eventid) event = eventid,
     FOREACH_OMPT_EVENT(ompt_event_macro)
 #undef ompt_event_macro
-} ompt_event_t;
+} ompt_callbacks_t;
 
 
 /*---------------------
  * set callback results
  *---------------------*/
-typedef enum {
-    ompt_set_result_registration_error              = 0,
-    ompt_set_result_event_may_occur_no_callback     = 1,
-    ompt_set_result_event_never_occurs              = 2,
-    ompt_set_result_event_may_occur_callback_some   = 3,
-    ompt_set_result_event_may_occur_callback_always = 4,
+typedef enum ompt_set_result_e {
+    ompt_set_error = 0,
+    ompt_set_never = 1,
+    ompt_set_sometimes = 2,
+    ompt_set_sometimes_paired = 3,
+    ompt_set_always = 4
 } ompt_set_result_t;
 
 
+/*----------------------
+ * mutex implementations
+ *----------------------*/
+typedef enum ompt_mutex_impl_e {
+#define ompt_mutex_impl_macro(impl, code) impl = code,
+    FOREACH_OMPT_MUTEX_IMPL(ompt_mutex_impl_macro)
+#undef ompt_mutex_impl_macro
+} ompt_mutex_impl_t;
+
 
 /*****************************************************************************
  * callback signatures
@@ -273,14 +258,10 @@
 typedef void (*ompt_interface_fn_t)(void);
 
 typedef ompt_interface_fn_t (*ompt_function_lookup_t)(
-    const char *                      /* entry point to look up       */
+    const char *                          /* entry point to look up              */
 );
 
 /* threads */
-typedef void (*ompt_thread_callback_t) (
-    ompt_thread_id_t thread_id        /* ID of thread                 */
-);
-
 typedef enum {
     ompt_thread_initial = 1, // start the enumeration at 1
     ompt_thread_worker  = 2,
@@ -288,78 +269,262 @@
 } ompt_thread_type_t;
 
 typedef enum {
-    ompt_invoker_program = 0,         /* program invokes master task  */
-    ompt_invoker_runtime = 1          /* runtime invokes master task  */
+    ompt_invoker_program = 1,             /* program invokes master task         */
+    ompt_invoker_runtime = 2              /* runtime invokes master task         */
 } ompt_invoker_t;
 
-typedef void (*ompt_thread_type_callback_t) (
-    ompt_thread_type_t thread_type,   /* type of thread               */
-    ompt_thread_id_t thread_id        /* ID of thread                 */
+typedef void (*ompt_callback_thread_begin_t) (
+    ompt_thread_type_t thread_type,       /* type of thread                      */
+    ompt_data_t *thread_data              /* data of thread                      */
+);
+
+typedef void (*ompt_callback_thread_end_t) (
+    ompt_data_t *thread_data              /* data of thread                      */
 );
 
 typedef void (*ompt_wait_callback_t) (
-    ompt_wait_id_t wait_id            /* wait id                      */
+    ompt_wait_id_t wait_id                /* wait data                           */
 );
 
 /* parallel and workshares */
-typedef void (*ompt_parallel_callback_t) (
-    ompt_parallel_id_t parallel_id,    /* id of parallel region       */
-    ompt_task_id_t task_id             /* id of task                  */
+typedef enum ompt_scope_endpoint_e {
+    ompt_scope_begin = 1,
+    ompt_scope_end = 2
+} ompt_scope_endpoint_t;
+
+
+/* implicit task */
+typedef void (*ompt_callback_implicit_task_t) (
+    ompt_scope_endpoint_t endpoint,       /* endpoint of implicit task           */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of implicit task               */
+    unsigned int team_size,               /* team size                           */
+    unsigned int thread_num               /* thread number of calling thread     */
 );
 
-typedef void (*ompt_new_workshare_callback_t) (
-    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
-    ompt_task_id_t parent_task_id,    /* id of parent task            */
-    void *workshare_function          /* pointer to outlined function */
+typedef void (*ompt_callback_parallel_begin_t) (
+    ompt_data_t *parent_task_data,        /* data of parent task                 */
+    const ompt_frame_t *parent_frame,     /* frame data of parent task           */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    unsigned int requested_team_size,     /* requested number of threads in team */
+    ompt_invoker_t invoker,               /* invoker of master task              */
+    const void *codeptr_ra                /* return address of runtime call      */
 );
 
-typedef void (*ompt_new_parallel_callback_t) (
-    ompt_task_id_t parent_task_id,    /* id of parent task            */
-    ompt_frame_t *parent_task_frame,  /* frame data of parent task    */
-    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
-    uint32_t requested_team_size,     /* number of threads in team    */
-    void *parallel_function,          /* pointer to outlined function */
-    ompt_invoker_t invoker            /* who invokes master task?     */
+typedef void (*ompt_callback_parallel_end_t) (
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of task                        */
+    ompt_invoker_t invoker,               /* invoker of master task              */
+    const void *codeptr_ra                /* return address of runtime call      */
 );
 
-typedef void (*ompt_end_parallel_callback_t) (
-    ompt_parallel_id_t parallel_id,   /* id of parallel region       */
-    ompt_task_id_t task_id,           /* id of task                  */
-    ompt_invoker_t invoker            /* who invokes master task?    */
+/* tasks */
+typedef enum ompt_task_type_e {
+    ompt_task_initial    = 0x1,
+    ompt_task_implicit   = 0x2,
+    ompt_task_explicit   = 0x4,
+    ompt_task_target     = 0x8,
+    ompt_task_undeferred = 0x8000000,
+    ompt_task_untied     = 0x10000000,
+    ompt_task_final      = 0x20000000,
+    ompt_task_mergeable  = 0x40000000,
+    ompt_task_merged     = 0x80000000
+} ompt_task_type_t;
+
+typedef enum ompt_task_status_e {
+    ompt_task_complete = 1,
+    ompt_task_yield    = 2,
+    ompt_task_cancel   = 3,
+    ompt_task_others   = 4
+} ompt_task_status_t;
+
+typedef void (*ompt_callback_task_schedule_t) (
+    ompt_data_t *prior_task_data,         /* data of prior task                  */
+    ompt_task_status_t prior_task_status, /* status of prior task                */
+    ompt_data_t *next_task_data           /* data of next task                   */
 );
 
-/* tasks */
-typedef void (*ompt_task_callback_t) (
-    ompt_task_id_t task_id            /* id of task                   */
+typedef void (*ompt_callback_task_create_t) (
+    ompt_data_t *parent_task_data,        /* data of parent task                 */
+    const ompt_frame_t *parent_frame,     /* frame data for parent task          */
+    ompt_data_t *new_task_data,           /* data of created task                */
+    int type,                             /* type of created task                */
+    int has_dependences,                  /* created task has dependences        */
+    const void *codeptr_ra                /* return address of runtime call      */
 );
 
-typedef void (*ompt_task_pair_callback_t) (
-    ompt_task_id_t first_task_id,
-    ompt_task_id_t second_task_id
+/* task dependences */
+typedef void (*ompt_callback_task_dependences_t) (
+    ompt_data_t *task_data,               /* data of task                        */
+    const ompt_task_dependence_t *deps,   /* dependences of task                 */
+    int ndeps                             /* dependences count of task           */
 );
 
-typedef void (*ompt_new_task_callback_t) (
-    ompt_task_id_t parent_task_id,    /* id of parent task            */
-    ompt_frame_t *parent_task_frame,  /* frame data for parent task   */
-    ompt_task_id_t  new_task_id,      /* id of created task           */
-    void *task_function               /* pointer to outlined function */
+typedef void (*ompt_callback_task_dependence_t) (
+    ompt_data_t *src_task_data,           /* data of source task                 */
+    ompt_data_t *sink_task_data           /* data of sink task                   */
 );
 
-/* task dependences */
-typedef void (*ompt_task_dependences_callback_t) (
-    ompt_task_id_t task_id,            /* ID of task with dependences */
-    const ompt_task_dependence_t *deps,/* vector of task dependences  */
-    int ndeps                          /* number of dependences       */
+/* target and device */
+typedef enum ompt_target_type_e {
+    ompt_target = 1,
+    ompt_target_enter_data = 2,
+    ompt_target_exit_data = 3,
+    ompt_target_update = 4
+} ompt_target_type_t;
+
+typedef void (*ompt_callback_target_t) (
+    ompt_target_type_t kind,
+    ompt_scope_endpoint_t endpoint,
+    uint64_t device_num,
+    ompt_data_t *task_data,
+    ompt_id_t target_id,
+    const void *codeptr_ra
 );
 
-/* program */
-typedef void (*ompt_control_callback_t) (
-    uint64_t command,                 /* command of control call      */
-    uint64_t modifier                 /* modifier of control call     */
+typedef enum ompt_target_data_op_e {
+    ompt_target_data_alloc = 1,
+    ompt_target_data_transfer_to_dev = 2,
+    ompt_target_data_transfer_from_dev = 3,
+    ompt_target_data_delete = 4
+} ompt_target_data_op_t;
+
+typedef void (*ompt_callback_target_data_op_t) (
+    ompt_id_t target_id,
+    ompt_id_t host_op_id,
+    ompt_target_data_op_t optype,
+    void *host_addr,
+    void *device_addr,
+    size_t bytes
 );
 
-typedef void (*ompt_callback_t)(void);
+typedef void (*ompt_callback_target_submit_t) (
+    ompt_id_t target_id,
+    ompt_id_t host_op_id
+);
 
+typedef void (*ompt_callback_target_map_t) (
+    ompt_id_t target_id,
+    unsigned int nitems,
+    void **host_addr,
+    void **device_addr,
+    size_t *bytes,
+    unsigned int *mapping_flags
+);
+
+typedef void (*ompt_callback_device_initialize_t) (
+    uint64_t device_num,
+    const char *type,
+    ompt_device_t *device,
+    ompt_function_lookup_t lookup,
+    const char *documentation
+);
+
+typedef void (*ompt_callback_device_finalize_t) (
+    uint64_t device_num
+);
+
+/* control_tool */
+typedef int (*ompt_callback_control_tool_t) (
+    uint64_t command,                     /* command of control call             */
+    uint64_t modifier,                    /* modifier of control call            */
+    void *arg,                            /* argument of control call            */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef enum ompt_mutex_kind_e {
+    ompt_mutex = 0x10,
+    ompt_mutex_lock = 0x11,
+    ompt_mutex_nest_lock = 0x12,
+    ompt_mutex_critical = 0x13,
+    ompt_mutex_atomic = 0x14,
+    ompt_mutex_ordered = 0x20
+} ompt_mutex_kind_t;
+
+typedef void (*ompt_callback_mutex_acquire_t) (
+    ompt_mutex_kind_t kind,               /* mutex kind                          */
+    unsigned int hint,                    /* mutex hint                          */
+    unsigned int impl,                    /* mutex implementation                */
+    ompt_wait_id_t wait_id,               /* id of object being awaited          */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_mutex_t) (
+    ompt_mutex_kind_t kind,               /* mutex kind                          */
+    ompt_wait_id_t wait_id,               /* id of object being awaited          */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_nest_lock_t) (
+    ompt_scope_endpoint_t endpoint,       /* endpoint of nested lock             */
+    ompt_wait_id_t wait_id,               /* id of object being awaited          */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_master_t) (
+    ompt_scope_endpoint_t endpoint,       /* endpoint of master region           */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of task                        */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_idle_t) (
+    ompt_scope_endpoint_t endpoint        /* endpoint of idle time               */
+);
+
+typedef enum ompt_work_type_e {
+    ompt_work_loop = 1,
+    ompt_work_sections = 2,
+    ompt_work_single_executor = 3,
+    ompt_work_single_other = 4,
+    ompt_work_workshare = 5,
+    ompt_work_distribute = 6,
+    ompt_work_taskloop = 7
+} ompt_work_type_t;
+
+typedef void (*ompt_callback_work_t) (
+    ompt_work_type_t wstype,              /* type of work region                 */
+    ompt_scope_endpoint_t endpoint,       /* endpoint of work region             */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of task                        */
+    uint64_t count,                       /* quantity of work                    */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef enum ompt_sync_region_kind_e {
+    ompt_sync_region_barrier = 1,
+    ompt_sync_region_taskwait = 2,
+    ompt_sync_region_taskgroup = 3
+} ompt_sync_region_kind_t;
+
+typedef void (*ompt_callback_sync_region_t) (
+    ompt_sync_region_kind_t kind,         /* kind of sync region                 */
+    ompt_scope_endpoint_t endpoint,       /* endpoint of sync region             */
+    ompt_data_t *parallel_data,           /* data of parallel region             */
+    ompt_data_t *task_data,               /* data of task                        */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef enum ompt_cancel_flag_e {
+    ompt_cancel_parallel       = 0x1,
+    ompt_cancel_sections       = 0x2,
+    ompt_cancel_do             = 0x4,
+    ompt_cancel_taskgroup      = 0x8,
+    ompt_cancel_activated      = 0x10,
+    ompt_cancel_detected       = 0x20,
+    ompt_cancel_discarded_task = 0x40
+} ompt_cancel_flag_t;
+
+typedef void (*ompt_callback_cancel_t) (
+    ompt_data_t *task_data,               /* data of task                        */
+    int flags,                            /* cancel flags                        */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
+
+typedef void (*ompt_callback_flush_t) (
+    ompt_data_t *thread_data,             /* data of thread                      */
+    const void *codeptr_ra                /* return address of runtime call      */
+);
 
 /****************************************************************************
  * ompt API
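
Taken together, the typedefs above replace the old one-callback-per-event types with scope-based callbacks that carry ompt_data_t slots and the runtime-call return address. A hedged sketch of two tool-side callbacks matching the new signatures (the printf calls are purely illustrative):

  #include <stdio.h>

  /* Matches ompt_callback_parallel_begin_t. */
  static void on_parallel_begin(ompt_data_t *parent_task_data,
                                const ompt_frame_t *parent_frame,
                                ompt_data_t *parallel_data,
                                unsigned int requested_team_size,
                                ompt_invoker_t invoker, const void *codeptr_ra) {
    parallel_data->value = 42; /* any tool-chosen tag; the same slot is handed
                                  back at ompt_callback_parallel_end */
    printf("parallel begin: %u threads requested, ra=%p\n",
           requested_team_size, (void *)codeptr_ra);
  }

  /* Matches ompt_callback_mutex_t; usable for ompt_callback_mutex_acquired
     and ompt_callback_mutex_released. */
  static void on_mutex_acquired(ompt_mutex_kind_t kind, ompt_wait_id_t wait_id,
                                const void *codeptr_ra) {
    printf("mutex kind %d acquired, wait id %llu\n", (int)kind,
           (unsigned long long)wait_id);
  }
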
@@ -381,33 +546,48 @@
  ***************************************************************************/
 
 /* state */
-OMPT_API_FUNCTION(ompt_state_t, ompt_get_state, (
-    ompt_wait_id_t *ompt_wait_id
+OMPT_API_FUNCTION(omp_state_t, ompt_get_state, (
+    ompt_wait_id_t *wait_id
 ));
 
 /* thread */
-OMPT_API_FUNCTION(ompt_thread_id_t, ompt_get_thread_id, (void));
-
-OMPT_API_FUNCTION(void *, ompt_get_idle_frame, (void));
+OMPT_API_FUNCTION(ompt_data_t*, ompt_get_thread_data, (void));
 
 /* parallel region */
-OMPT_API_FUNCTION(ompt_parallel_id_t, ompt_get_parallel_id, (
-    int ancestor_level
+OMPT_API_FUNCTION(int, ompt_get_parallel_info, (
+    int ancestor_level,
+    ompt_data_t **parallel_data,
+    int *team_size
 ));
 
-OMPT_API_FUNCTION(int, ompt_get_parallel_team_size, (
-    int ancestor_level
+/* task */
+OMPT_API_FUNCTION(int, ompt_get_task_info, (
+    int ancestor_level,
+    int *type,
+    ompt_data_t **task_data,
+    ompt_frame_t **task_frame,
+    ompt_data_t **parallel_data,
+    int *thread_num
 ));
 
-/* task */
-OMPT_API_FUNCTION(ompt_task_id_t, ompt_get_task_id, (
-    int depth
+/* places */
+OMPT_API_FUNCTION(int, ompt_get_num_places, (void));
+
+OMPT_API_FUNCTION(int, ompt_get_place_proc_ids, (
+    int place_num,
+    int ids_size,
+    int *ids
 ));
 
-OMPT_API_FUNCTION(ompt_frame_t *, ompt_get_task_frame, (
-    int depth
+OMPT_API_FUNCTION(int, ompt_get_place_num, (void));
+
+OMPT_API_FUNCTION(int, ompt_get_partition_place_nums, (
+    int place_nums_size,
+    int *place_nums
 ));
 
+/* proc_id */
+OMPT_API_FUNCTION(int, ompt_get_proc_id, (void));
 
 
 /****************************************************************************
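
The inquiry functions now return an int status and report through out-parameters instead of returning bare ids. A hedged sketch of querying the innermost parallel region from inside a callback, assuming my_get_parallel_info was resolved through the lookup function at initialization (the <name>_t typedef assumption from the X-macro sketch above applies here as well):

  static ompt_get_parallel_info_t my_get_parallel_info; /* set in initialize */

  static void report_enclosing_team(void) {
    ompt_data_t *parallel_data = NULL;
    int team_size = 0;
    /* Ancestor level 0 is the innermost parallel region; a nonzero return is
       assumed to mean the requested level exists. */
    if (my_get_parallel_info(0, &parallel_data, &team_size) && parallel_data) {
      /* parallel_data->value or ->ptr holds whatever the tool stored earlier. */
    }
  }
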
@@ -445,25 +625,35 @@
  * INITIALIZATION FUNCTIONS
  ***************************************************************************/
 
-OMPT_API_FUNCTION(void, ompt_initialize, (
+typedef struct ompt_fns_t ompt_fns_t;
+
+OMPT_API_FUNCTION(int, ompt_initialize, (
     ompt_function_lookup_t ompt_fn_lookup,
-    const char *runtime_version,
-    unsigned int ompt_version
+    ompt_fns_t *fns
 ));
 
+OMPT_API_FUNCTION(void, ompt_finalize, (
+    ompt_fns_t *fns
+));
+
+struct ompt_fns_t {
+    ompt_initialize_t initialize;
+    ompt_finalize_t finalize;
+};
 
 /* initialization interface to be defined by tool */
-ompt_initialize_t ompt_tool(void);
+#ifdef _WIN32
+__declspec(dllexport)
+#endif
+ompt_fns_t * ompt_start_tool(
+    unsigned int omp_version,
+    const char * runtime_version
+);
 
-typedef enum opt_init_mode_e {
-    ompt_init_mode_never  = 0,
-    ompt_init_mode_false  = 1,
-    ompt_init_mode_true   = 2,
-    ompt_init_mode_always = 3
-} ompt_init_mode_t;
+typedef void (*ompt_callback_t)(void);
 
 OMPT_API_FUNCTION(int, ompt_set_callback, (
-    ompt_event_t event,
+    ompt_callbacks_t which,
     ompt_callback_t callback
 ));
 
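
This hunk is the core of the new startup protocol: rather than calling a weak ompt_tool(), the runtime calls the tool's exported ompt_start_tool and then drives the returned ompt_fns_t through its initialize/finalize members. A minimal hedged end-to-end sketch (ompt_set_callback_t is assumed to be the pointer typedef generated by OMPT_API_FUNCTION; on_thread_begin is the callback sketched earlier; the nonzero return from initialize is assumed to keep the tool active):

  static int my_initialize(ompt_function_lookup_t lookup, ompt_fns_t *fns) {
    ompt_set_callback_t set_callback =
        (ompt_set_callback_t)lookup("ompt_set_callback");
    /* Register a mandatory event; the return value is an ompt_set_result_t. */
    set_callback(ompt_callback_thread_begin, (ompt_callback_t)on_thread_begin);
    return 1; /* nonzero return is assumed to keep the tool active */
  }

  static void my_finalize(ompt_fns_t *fns) {
    /* Flush and close any tool output here. */
  }

  static ompt_fns_t my_fns = {my_initialize, my_finalize};

  ompt_fns_t *ompt_start_tool(unsigned int omp_version,
                              const char *runtime_version) {
    return &my_fns; /* returning NULL declines to attach a tool */
  }
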
@@ -477,7 +667,7 @@
 
 
 OMPT_API_FUNCTION(int, ompt_get_callback, (
-    ompt_event_t event,
+    ompt_callbacks_t which,
     ompt_callback_t *callback
 ));
 
@@ -487,29 +677,37 @@
  * MISCELLANEOUS FUNCTIONS
  ***************************************************************************/
 
-/* control */
-// FIXME: remove workaround for clang
-#if !defined(__clang__) && defined(_OPENMP) && (_OPENMP >= 201307)
-#pragma omp declare target
-#endif
-void ompt_control(
-    uint64_t command,
-    uint64_t modifier
-);
-#if !defined(__clang__) && defined(_OPENMP) && (_OPENMP >= 201307)
-#pragma omp end declare target
-#endif
-
 /* state enumeration */
-OMPT_API_FUNCTION(int, ompt_enumerate_state, (
+OMPT_API_FUNCTION(int, ompt_enumerate_states, (
     int current_state,
     int *next_state,
     const char **next_state_name
 ));
 
+/* mutex implementation enumeration */
+OMPT_API_FUNCTION(int, ompt_enumerate_mutex_impls, (
+    int current_impl,
+    int *next_impl,
+    const char **next_impl_name
+));
+
+/* get_unique_id */
+OMPT_API_FUNCTION(uint64_t, ompt_get_unique_id, (void));
+
 #ifdef  __cplusplus
 };
 #endif
 
-#endif
+/****************************************************************************
+ * TARGET
+ ***************************************************************************/
+
+OMPT_API_FUNCTION(int, ompt_get_target_info, (
+    uint64_t *device_num,
+    ompt_id_t *target_id,
+    ompt_id_t *host_op_id
+));
+
+OMPT_API_FUNCTION(int, ompt_get_num_devices, (void));
 
+#endif /* __OMPT__ */
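
The renamed enumerators keep the old iterator style. A hedged sketch that dumps every thread state the runtime knows about, seeding the walk with omp_state_undefined (documented above as the first available state) and assuming a zero return ends the enumeration; my_enumerate_states is a hypothetical pointer resolved through lookup():

  #include <stdio.h>

  static ompt_enumerate_states_t my_enumerate_states; /* resolved via lookup() */

  static void dump_states(void) {
    int state = omp_state_undefined;
    const char *name = NULL;
    while (my_enumerate_states(state, &state, &name))
      printf("state 0x%x: %s\n", (unsigned)state, name);
  }
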
Index: runtime/src/kmp.h
===================================================================
--- runtime/src/kmp.h
+++ runtime/src/kmp.h
@@ -200,6 +200,10 @@
 #define KMP_IDENT_BARRIER_IMPL_SINGLE 0x0140
 #define KMP_IDENT_BARRIER_IMPL_WORKSHARE 0x01C0
 
+#define KMP_IDENT_WORK_LOOP 0x200 // static loop
+#define KMP_IDENT_WORK_SECTIONS 0x400 // sections
+#define KMP_IDENT_WORK_DISTRIBUTE 0x800 // distribute
+
 /*!
  * The ident structure that describes a source location.
  */
@@ -798,6 +802,10 @@
 extern int __kmp_hws_requested;
 extern int __kmp_hws_abs_flag; // absolute or per-item number requested
 
+#if OMP_50_ENABLED && LIBOMP_OMPT_SUPPORT
+extern char const *__kmp_tool_libraries;
+#endif // OMP_50_ENABLED && LIBOMP_OMPT_SUPPORT
+
 /* ------------------------------------------------------------------------ */
 
 #define KMP_PAD(type, sz)                                                      \
@@ -3314,7 +3322,7 @@
 extern kmp_team_t *
 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 #if OMPT_SUPPORT
-                    ompt_parallel_id_t ompt_parallel_id,
+                    ompt_data_t ompt_parallel_data,
 #endif
                     kmp_proc_bind_t proc_bind, kmp_internal_control_t *new_icvs,
                     int argc USE_NESTED_HOT_ARG(kmp_info_t *thr));
@@ -3322,7 +3330,7 @@
 extern kmp_team_t *
 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 #if OMPT_SUPPORT
-                    ompt_parallel_id_t ompt_parallel_id,
+                    ompt_id_t ompt_parallel_id,
 #endif
                     kmp_internal_control_t *new_icvs,
                     int argc USE_NESTED_HOT_ARG(kmp_info_t *thr));
@@ -3362,9 +3370,6 @@
 };
 extern int __kmp_fork_call(ident_t *loc, int gtid,
                            enum fork_context_e fork_context, kmp_int32 argc,
-#if OMPT_SUPPORT
-                           void *unwrapped_task,
-#endif
                            microtask_t microtask, launch_t invoker,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX
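
The three new KMP_IDENT_WORK_* bits let the compiler-facing worksharing entry points tell the OMPT layer which ompt_work_type_t to report. A hedged sketch of the kind of mapping the runtime could apply (ident_t and its flags field come from kmp.h; the helper name and the fallback choice are hypothetical):

  /* Hypothetical helper: translate ident_t work flags into an OMPT work type. */
  static ompt_work_type_t work_type_from_ident(const ident_t *loc) {
    if (loc->flags & KMP_IDENT_WORK_LOOP)
      return ompt_work_loop;
    if (loc->flags & KMP_IDENT_WORK_SECTIONS)
      return ompt_work_sections;
    if (loc->flags & KMP_IDENT_WORK_DISTRIBUTE)
      return ompt_work_distribute;
    return ompt_work_loop; /* fallback when no bit is set; choice is a guess */
  }
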
Index: runtime/src/kmp_atomic.h
===================================================================
--- runtime/src/kmp_atomic.h
+++ runtime/src/kmp_atomic.h
@@ -361,19 +361,20 @@
 
 static inline void __kmp_acquire_atomic_lock(kmp_atomic_lock_t *lck,
                                              kmp_int32 gtid) {
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_wait_atomic)) {
-    ompt_callbacks.ompt_callback(ompt_event_wait_atomic)((ompt_wait_id_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_atomic, 0, ompt_mutex_impl_queuing, (ompt_wait_id_t)lck,
+        OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
 
   __kmp_acquire_queuing_lock(lck, gtid);
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled &&
-      ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)) {
-    ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)(
-        (ompt_wait_id_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_atomic, (ompt_wait_id_t)lck, OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
 }
@@ -386,10 +387,10 @@
 static inline void __kmp_release_atomic_lock(kmp_atomic_lock_t *lck,
                                              kmp_int32 gtid) {
   __kmp_release_queuing_lock(lck, gtid);
-#if OMPT_SUPPORT && OMPT_BLAME
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_release_atomic)) {
-    ompt_callbacks.ompt_callback(ompt_event_release_atomic)(
-        (ompt_wait_id_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_atomic, (ompt_wait_id_t)lck, OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
 }
Index: runtime/src/kmp_barrier.cpp
===================================================================
--- runtime/src/kmp_barrier.cpp
+++ runtime/src/kmp_barrier.cpp
@@ -16,6 +16,9 @@
 #include "kmp_itt.h"
 #include "kmp_os.h"
 #include "kmp_stats.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
 
 #if KMP_MIC
 #include <immintrin.h>
@@ -1224,8 +1227,9 @@
   int status = 0;
   ident_t *loc = __kmp_threads[gtid]->th.th_ident;
 #if OMPT_SUPPORT
-  ompt_task_id_t my_task_id;
-  ompt_parallel_id_t my_parallel_id;
+  ompt_data_t *my_task_data;
+  ompt_data_t *my_parallel_data;
+  void *return_address;
 #endif
 
   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
@@ -1233,28 +1237,26 @@
 
   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-#if OMPT_BLAME
-    my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
-    my_parallel_id = team->t.ompt_team_info.parallel_id;
-
-#if OMPT_TRACE
-    if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
-      if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
-        ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
-            my_parallel_id, my_task_id);
-      }
+  if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
+          my_task_data, return_address);
     }
-#endif
-    if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
-      ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(my_parallel_id,
-                                                             my_task_id);
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
+          my_task_data, return_address);
     }
 #endif
     // It is OK to report the barrier state after the barrier begin callback.
     // According to the OMPT specification, a compliant implementation may
     // even delay reporting this state until the barrier begins to wait.
-    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+    this_thr->th.ompt_thread_info.state = omp_state_wait_barrier;
   }
 #endif
 
@@ -1489,14 +1491,20 @@
                 __kmp_tid_from_gtid(gtid), status));
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-#if OMPT_BLAME
-    if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
-      ompt_callbacks.ompt_callback(ompt_event_barrier_end)(my_parallel_id,
-                                                           my_task_id);
+  if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
+          my_task_data, return_address);
+    }
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
+          my_task_data, return_address);
     }
 #endif
-    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
   }
 #endif
   ANNOTATE_BARRIER_END(&team->t.t_bar);
@@ -1593,14 +1601,31 @@
 
   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
 #if OMPT_SUPPORT
-#if OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
-    ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
-        team->t.ompt_team_info.parallel_id,
-        team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
-  }
+  ompt_data_t *my_task_data;
+  ompt_data_t *my_parallel_data;
+  if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+    void *codeptr = NULL;
+    int ds_tid = this_thr->th.th_info.ds.ds_tid;
+    if (KMP_MASTER_TID(ds_tid) &&
+        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
+         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
+      codeptr = team->t.ompt_team_info.master_return_address;
+    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
+          my_task_data, codeptr);
+    }
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
+          my_task_data, codeptr);
+    }
 #endif
-  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+    this_thr->th.ompt_thread_info.state = omp_state_wait_barrier_implicit;
+  }
 #endif
 
   if (__kmp_tasking_mode == tskm_extra_barrier) {
@@ -1758,20 +1783,6 @@
   KA_TRACE(10,
            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
 
-#if OMPT_SUPPORT
-  if (ompt_enabled) {
-#if OMPT_BLAME
-    if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
-      ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
-          team->t.ompt_team_info.parallel_id,
-          team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
-    }
-#endif
-
-    // return to default state
-    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
-  }
-#endif
   ANNOTATE_BARRIER_END(&team->t.t_bar);
 }
 
@@ -1869,6 +1880,39 @@
   }
   }
 
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    if (this_thr->th.ompt_thread_info.state ==
+        omp_state_wait_barrier_implicit) {
+      int ds_tid = this_thr->th.th_info.ds.ds_tid;
+      ompt_data_t *tId = (team) ? OMPT_CUR_TASK_DATA(this_thr)
+                                : &(this_thr->th.ompt_thread_info.task_data);
+      this_thr->th.ompt_thread_info.state = omp_state_overhead;
+#if OMPT_OPTIONAL
+      void *codeptr = NULL;
+      if (KMP_MASTER_TID(ds_tid) &&
+          (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
+           ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
+        codeptr = team->t.ompt_team_info.master_return_address;
+      if (ompt_enabled.ompt_callback_sync_region_wait) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+            ompt_sync_region_barrier, ompt_scope_end, NULL, tId, codeptr);
+      }
+      if (ompt_enabled.ompt_callback_sync_region) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+            ompt_sync_region_barrier, ompt_scope_end, NULL, tId, codeptr);
+      }
+#endif
+      if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
+        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+            ompt_scope_end, NULL, tId, 0, ds_tid);
+      }
+      // return to idle state
+      this_thr->th.ompt_thread_info.state = omp_state_overhead;
+    }
+  }
+#endif
+
   // Early exit for reaping threads releasing forkjoin barrier
   if (TCR_4(__kmp_global.g.g_done)) {
     this_thr->th.th_task_team = NULL;
Index: runtime/src/kmp_cancel.cpp
===================================================================
--- runtime/src/kmp_cancel.cpp
+++ runtime/src/kmp_cancel.cpp
@@ -12,6 +12,9 @@
 #include "kmp_i18n.h"
 #include "kmp_io.h"
 #include "kmp_str.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
 
 #if OMP_40_ENABLED
 
@@ -51,11 +54,25 @@
         kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(
             &(this_team->t.t_cancel_request), cancel_noreq, cncl_kind);
         if (old == cancel_noreq || old == cncl_kind) {
-          // printf("__kmpc_cancel: this_team->t.t_cancel_request=%d @ %p\n",
-          //       this_team->t.t_cancel_request,
-          //       &(this_team->t.t_cancel_request));
-          // we do not have a cancellation request in this team or we do have
-          // one that matches the current request -> cancel
+// we do not have a cancellation request in this team or we do have
+// one that matches the current request -> cancel
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+          if (ompt_enabled.ompt_callback_cancel) {
+            ompt_data_t *task_data;
+            __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                          NULL);
+            ompt_cancel_flag_t type = ompt_cancel_parallel;
+            if (cncl_kind == cancel_parallel)
+              type = ompt_cancel_parallel;
+            else if (cncl_kind == cancel_loop)
+              type = ompt_cancel_do;
+            else if (cncl_kind == cancel_sections)
+              type = ompt_cancel_sections;
+            ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                task_data, type | ompt_cancel_activated,
+                OMPT_GET_RETURN_ADDRESS(0));
+          }
+#endif
           return 1 /* true */;
         }
         break;
@@ -75,8 +92,18 @@
           kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(
               &(taskgroup->cancel_request), cancel_noreq, cncl_kind);
           if (old == cancel_noreq || old == cncl_kind) {
-            // we do not have a cancellation request in this taskgroup or we do
-            // have one that matches the current request -> cancel
+// we do not have a cancellation request in this taskgroup or we do
+// have one that matches the current request -> cancel
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+            if (ompt_enabled.ompt_callback_cancel) {
+              ompt_data_t *task_data;
+              __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                            NULL);
+              ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                  task_data, ompt_cancel_taskgroup | ompt_cancel_activated,
+                  OMPT_GET_RETURN_ADDRESS(0));
+            }
+#endif
             return 1 /* true */;
           }
         } else {
@@ -134,8 +161,25 @@
         KMP_DEBUG_ASSERT(this_team);
         if (this_team->t.t_cancel_request) {
           if (cncl_kind == this_team->t.t_cancel_request) {
-            // the request in the team structure matches the type of
-            // cancellation point so we can cancel
+// the request in the team structure matches the type of
+// cancellation point so we can cancel
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+            if (ompt_enabled.ompt_callback_cancel) {
+              ompt_data_t *task_data;
+              __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                            NULL);
+              ompt_cancel_flag_t type = ompt_cancel_parallel;
+              if (cncl_kind == cancel_parallel)
+                type = ompt_cancel_parallel;
+              else if (cncl_kind == cancel_loop)
+                type = ompt_cancel_do;
+              else if (cncl_kind == cancel_sections)
+                type = ompt_cancel_sections;
+              ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                  task_data, type | ompt_cancel_detected,
+                  OMPT_GET_RETURN_ADDRESS(0));
+            }
+#endif
             return 1 /* true */;
           }
           KMP_ASSERT(0 /* false */);
@@ -158,7 +202,18 @@
 
         taskgroup = task->td_taskgroup;
         if (taskgroup) {
-          // return the current status of cancellation for the taskgroup
+// return the current status of cancellation for the taskgroup
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+          if (ompt_enabled.ompt_callback_cancel &&
+              !!taskgroup->cancel_request) {
+            ompt_data_t *task_data;
+            __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                          NULL);
+            ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                task_data, ompt_cancel_taskgroup | ompt_cancel_detected,
+                OMPT_GET_RETURN_ADDRESS(0));
+          }
+#endif
           return !!taskgroup->cancel_request;
         } else {
           // if a cancellation point is encountered by a task that does not
Index: runtime/src/kmp_config.h.cmake
===================================================================
--- runtime/src/kmp_config.h.cmake
+++ runtime/src/kmp_config.h.cmake
@@ -45,10 +45,8 @@
 #define OMPT_DEBUG LIBOMP_OMPT_DEBUG
 #cmakedefine01 LIBOMP_OMPT_SUPPORT
 #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
-#cmakedefine01 LIBOMP_OMPT_BLAME
-#define OMPT_BLAME LIBOMP_OMPT_BLAME
-#cmakedefine01 LIBOMP_OMPT_TRACE
-#define OMPT_TRACE LIBOMP_OMPT_TRACE
+#cmakedefine01 LIBOMP_OMPT_OPTIONAL
+#define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
 #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
 #define KMP_USE_ADAPTIVE_LOCKS LIBOMP_USE_ADAPTIVE_LOCKS
 #define KMP_DEBUG_ADAPTIVE_LOCKS 0
Index: runtime/src/kmp_csupport.cpp
===================================================================
--- runtime/src/kmp_csupport.cpp
+++ runtime/src/kmp_csupport.cpp
@@ -278,7 +278,7 @@
 
 #if OMPT_SUPPORT
     ompt_frame_t *ompt_frame;
-    if (ompt_enabled) {
+    if (ompt_enabled.enabled) {
       kmp_info_t *master_th = __kmp_threads[gtid];
       kmp_team_t *parent_team = master_th->th.th_team;
       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
@@ -289,7 +289,8 @@
         ompt_frame = &(
             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
       }
-      ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
+      ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+      OMPT_STORE_RETURN_ADDRESS(gtid);
     }
 #endif
 
@@ -297,9 +298,6 @@
     SSC_MARK_FORKING();
 #endif
     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
-#if OMPT_SUPPORT
-                    VOLATILE_CAST(void *) microtask, // "unwrapped" task
-#endif
                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
@@ -371,11 +369,11 @@
 #if OMPT_SUPPORT
   kmp_team_t *parent_team = this_thr->th.th_team;
   int tid = __kmp_tid_from_gtid(gtid);
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     parent_team->t.t_implicit_task_taskdata[tid]
-        .ompt_task_info.frame.reenter_runtime_frame =
-        __builtin_frame_address(1);
+        .ompt_task_info.frame.reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
   }
+  OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
 
   // check if __kmpc_push_num_teams called, set default number of teams
@@ -388,9 +386,6 @@
   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
 
   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
-#if OMPT_SUPPORT
-                  VOLATILE_CAST(void *) microtask, // "unwrapped" task
-#endif
                   VOLATILE_CAST(microtask_t)
                       __kmp_teams_master, // "wrapped" task
                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
@@ -433,9 +428,12 @@
 when the condition is false.
 */
 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
-  // The implementation is now in kmp_runtime.cpp so that it can share static
-  // functions with kmp_fork_call since the tasks to be done are similar in
-  // each case.
+// The implementation is now in kmp_runtime.cpp so that it can share static
+// functions with kmp_fork_call since the tasks to be done are similar in
+// each case.
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(global_tid);
+#endif
   __kmp_serialized_parallel(loc, global_tid);
 }
 
@@ -482,6 +480,30 @@
   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
 
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled &&
+      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
+    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_runtime_frame = NULL;
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
+          __kmp_tid_from_gtid(global_tid));
+    }
+
+    // reset/clear the task id only after unlinking the task
+    ompt_data_t *parent_task_data;
+    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
+
+    if (ompt_enabled.ompt_callback_parallel_end) {
+      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
+          ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
+    }
+    __ompt_lw_taskteam_unlink(this_thr);
+    this_thr->th.ompt_thread_info.state = omp_state_overhead;
+  }
+#endif
+
   /* If necessary, pop the internal control stack values and replace the team
    * values */
   top = serial_team->t.t_control_stack_top;
@@ -554,6 +576,12 @@
 
   if (__kmp_env_consistency_check)
     __kmp_pop_parallel(global_tid, NULL);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled)
+    this_thr->th.ompt_thread_info.state =
+        ((this_thr->th.th_team_serialized) ? omp_state_work_serial
+                                           : omp_state_work_parallel);
+#endif
 }
 
 /*!
@@ -617,6 +645,13 @@
 #else
 #error Unknown or unsupported architecture
 #endif
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_flush) {
+    ompt_callbacks.ompt_callback(ompt_callback_flush)(
+        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
 }
 
 /* -------------------------------------------------------------------------- */
@@ -642,12 +677,13 @@
     __kmp_check_barrier(global_tid, ct_barrier, loc);
   }
 
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT
   ompt_frame_t *ompt_frame;
-  if (ompt_enabled) {
-    ompt_frame = __ompt_get_task_frame_internal(0);
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
     if (ompt_frame->reenter_runtime_frame == NULL)
-      ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
+      ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(global_tid);
   }
 #endif
   __kmp_threads[global_tid]->th.th_ident = loc;
@@ -659,8 +695,8 @@
   // 4) no sync is required
 
   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
     ompt_frame->reenter_runtime_frame = NULL;
   }
 #endif
@@ -687,16 +723,17 @@
     status = 1;
   }
 
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT && OMPT_OPTIONAL
   if (status) {
-    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
+    if (ompt_enabled.ompt_callback_master) {
       kmp_info_t *this_thr = __kmp_threads[global_tid];
       kmp_team_t *team = this_thr->th.th_team;
 
       int tid = __kmp_tid_from_gtid(global_tid);
-      ompt_callbacks.ompt_callback(ompt_event_master_begin)(
-          team->t.ompt_team_info.parallel_id,
-          team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+      ompt_callbacks.ompt_callback(ompt_callback_master)(
+          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
+          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+          OMPT_GET_RETURN_ADDRESS(0));
     }
   }
 #endif
@@ -732,14 +769,15 @@
   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
   KMP_POP_PARTITIONED_TIMER();
 
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT && OMPT_OPTIONAL
   kmp_info_t *this_thr = __kmp_threads[global_tid];
   kmp_team_t *team = this_thr->th.th_team;
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_end)) {
+  if (ompt_enabled.ompt_callback_master) {
     int tid = __kmp_tid_from_gtid(global_tid);
-    ompt_callbacks.ompt_callback(ompt_event_master_end)(
-        team->t.ompt_team_info.parallel_id,
-        team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    ompt_callbacks.ompt_callback(ompt_callback_master)(
+        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
+        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+        OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
 
@@ -776,16 +814,24 @@
 
   th = __kmp_threads[gtid];
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_team_t *team;
+  ompt_wait_id_t lck;
+  void *codeptr_ra;
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    team = __kmp_team_from_gtid(gtid);
+    lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
     /* OMPT state update */
-    th->th.ompt_thread_info.wait_id = (uint64_t)loc;
-    th->th.ompt_thread_info.state = ompt_state_wait_ordered;
+    th->th.ompt_thread_info.wait_id = lck;
+    th->th.ompt_thread_info.state = omp_state_wait_ordered;
 
     /* OMPT event callback */
-    if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
-      ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
-          th->th.ompt_thread_info.wait_id);
+    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (ompt_enabled.ompt_callback_mutex_acquire) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+          ompt_mutex_ordered, omp_lock_hint_none, ompt_mutex_impl_spin,
+          (ompt_wait_id_t)lck, codeptr_ra);
     }
   }
 #endif
@@ -795,16 +841,16 @@
   else
     __kmp_parallel_deo(&gtid, &cid, loc);
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
     /* OMPT state update */
-    th->th.ompt_thread_info.state = ompt_state_work_parallel;
+    th->th.ompt_thread_info.state = omp_state_work_parallel;
     th->th.ompt_thread_info.wait_id = 0;
 
     /* OMPT event callback */
-    if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
-      ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
-          th->th.ompt_thread_info.wait_id);
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
     }
   }
 #endif
@@ -839,11 +885,13 @@
   else
     __kmp_parallel_dxo(&gtid, &cid, loc);
 
-#if OMPT_SUPPORT && OMPT_BLAME
-  if (ompt_enabled &&
-      ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
-    ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
-        th->th.ompt_thread_info.wait_id);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_ordered,
+        (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
+        OMPT_LOAD_RETURN_ADDRESS(gtid));
   }
 #endif
 }
@@ -1063,11 +1111,18 @@
 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                      kmp_critical_name *crit) {
 #if KMP_USE_DYNAMIC_LOCK
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(global_tid);
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
 #else
   KMP_COUNT_BLOCK(OMP_CRITICAL);
   KMP_TIME_PARTITIONED_BLOCK(
       OMP_critical_wait); /* Time spent waiting to enter the critical section */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  omp_state_t prev_state = omp_state_undefined;
+  ompt_thread_info_t ti;
+#endif
   kmp_user_lock_p lck;
 
   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
@@ -1101,6 +1156,25 @@
 #if USE_ITT_BUILD
   __kmp_itt_critical_acquiring(lck);
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(global_tid);
+  void *codeptr_ra = NULL;
+  if (ompt_enabled.enabled) {
+    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
+    /* OMPT state update */
+    prev_state = ti.state;
+    ti.wait_id = (ompt_wait_id_t)lck;
+    ti.state = omp_state_wait_critical;
+
+    /* OMPT event callback */
+    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+    if (ompt_enabled.ompt_callback_mutex_acquire) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+          (ompt_wait_id_t)crit, codeptr_ra);
+    }
+  }
+#endif
   // Value of 'crit' should be good for using as a critical_id of the critical
   // section directive.
   __kmp_acquire_user_lock_with_checks(lck, global_tid);
@@ -1108,6 +1182,19 @@
 #if USE_ITT_BUILD
   __kmp_itt_critical_acquired(lck);
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    /* OMPT state update */
+    ti.state = prev_state;
+    ti.wait_id = 0;
+
+    /* OMPT event callback */
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
+    }
+  }
+#endif
 
   KMP_START_EXPLICIT_TIMER(OMP_critical);
   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
@@ -1160,6 +1247,76 @@
   return __kmp_user_lock_seq;
 }
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+static ompt_mutex_impl_t
+__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
+  if (user_lock) {
+    switch (KMP_EXTRACT_D_TAG(user_lock)) {
+    case 0:
+      break;
+#if KMP_USE_FUTEX
+    case locktag_futex:
+      return ompt_mutex_impl_queuing;
+#endif
+    case locktag_tas:
+      return ompt_mutex_impl_spin;
+#if KMP_USE_TSX
+    case locktag_hle:
+      return ompt_mutex_impl_speculative;
+#endif
+    default:
+      return ompt_mutex_impl_unknown;
+    }
+    ilock = KMP_LOOKUP_I_LOCK(user_lock);
+  }
+  KMP_ASSERT(ilock);
+  switch (ilock->type) {
+#if KMP_USE_TSX
+  case locktag_adaptive:
+  case locktag_rtm:
+    return ompt_mutex_impl_speculative;
+#endif
+  case locktag_nested_tas:
+    return ompt_mutex_impl_spin;
+#if KMP_USE_FUTEX
+  case locktag_nested_futex:
+#endif
+  case locktag_ticket:
+  case locktag_queuing:
+  case locktag_drdpa:
+  case locktag_nested_ticket:
+  case locktag_nested_queuing:
+  case locktag_nested_drdpa:
+    return ompt_mutex_impl_queuing;
+  default:
+    return ompt_mutex_impl_unknown;
+  }
+}
+
+// For locks without dynamic binding
+static ompt_mutex_impl_t __ompt_get_mutex_impl_type() {
+  switch (__kmp_user_lock_kind) {
+  case lk_tas:
+    return ompt_mutex_impl_spin;
+#if KMP_USE_FUTEX
+  case lk_futex:
+#endif
+  case lk_ticket:
+  case lk_queuing:
+  case lk_drdpa:
+    return ompt_mutex_impl_queuing;
+#if KMP_USE_TSX
+  case lk_hle:
+  case lk_rtm:
+  case lk_adaptive:
+    return ompt_mutex_impl_speculative;
+#endif
+  default:
+    return ompt_mutex_impl_unknown;
+  }
+}
+#endif
+
 /*!
 @ingroup WORK_SHARING
 @param loc  source location information.
@@ -1177,6 +1334,14 @@
                                kmp_critical_name *crit, uintptr_t hint) {
   KMP_COUNT_BLOCK(OMP_CRITICAL);
   kmp_user_lock_p lck;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  omp_state_t prev_state = omp_state_undefined;
+  ompt_thread_info_t ti;
+  // This is the case, if called from __kmpc_critical:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+#endif
 
   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
 
@@ -1203,6 +1368,22 @@
 #if USE_ITT_BUILD
     __kmp_itt_critical_acquiring(lck);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
+      /* OMPT state update */
+      prev_state = ti.state;
+      ti.wait_id = (ompt_wait_id_t)lck;
+      ti.state = omp_state_wait_critical;
+
+      /* OMPT event callback */
+      if (ompt_enabled.ompt_callback_mutex_acquire) {
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+            ompt_mutex_critical, (unsigned int)hint,
+            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
+      }
+    }
+#endif
 #if KMP_USE_INLINED_TAS
     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
@@ -1225,12 +1406,41 @@
 #if USE_ITT_BUILD
     __kmp_itt_critical_acquiring(lck);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
+      /* OMPT state update */
+      prev_state = ti.state;
+      ti.wait_id = (ompt_wait_id_t)lck;
+      ti.state = omp_state_wait_critical;
+
+      /* OMPT event callback */
+      if (ompt_enabled.ompt_callback_mutex_acquire) {
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+            ompt_mutex_critical, (unsigned int)hint,
+            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
+      }
+    }
+#endif
     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
   }
 
 #if USE_ITT_BUILD
   __kmp_itt_critical_acquired(lck);
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    /* OMPT state update */
+    ti.state = prev_state;
+    ti.wait_id = 0;
+
+    /* OMPT event callback */
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
+    }
+  }
+#endif
 
   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
@@ -1317,14 +1527,18 @@
   // section directive.
   __kmp_release_user_lock_with_checks(lck, global_tid);
 
-#if OMPT_SUPPORT && OMPT_BLAME
-  if (ompt_enabled &&
-      ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
-    ompt_callbacks.ompt_callback(ompt_event_release_critical)((uint64_t)lck);
+#endif // KMP_USE_DYNAMIC_LOCK
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  /* OMPT release event triggers after lock is released; place here to trigger
+   * for all #if branches */
+  OMPT_STORE_RETURN_ADDRESS(global_tid);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(global_tid));
   }
 #endif
 
-#endif // KMP_USE_DYNAMIC_LOCK
   KMP_POP_PARTITIONED_TIMER();
   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
 }
@@ -1349,10 +1563,24 @@
   if (__kmp_env_consistency_check)
     __kmp_check_barrier(global_tid, ct_barrier, loc);
 
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    if (ompt_frame->reenter_runtime_frame == NULL)
+      ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(global_tid);
+  }
+#endif
 #if USE_ITT_NOTIFY
   __kmp_threads[global_tid]->th.th_ident = loc;
 #endif
   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
 
   return (status != 0) ? 0 : 1;
 }
@@ -1397,10 +1625,24 @@
     __kmp_check_barrier(global_tid, ct_barrier, loc);
   }
 
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    if (ompt_frame->reenter_runtime_frame == NULL)
+      ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(global_tid);
+  }
+#endif
 #if USE_ITT_NOTIFY
   __kmp_threads[global_tid]->th.th_ident = loc;
 #endif
   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
 
   ret = __kmpc_master(loc, global_tid);
 
@@ -1443,26 +1685,33 @@
     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
   }
 
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT && OMPT_OPTIONAL
   kmp_info_t *this_thr = __kmp_threads[global_tid];
   kmp_team_t *team = this_thr->th.th_team;
   int tid = __kmp_tid_from_gtid(global_tid);
 
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     if (rc) {
-      if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
-        ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
-            team->t.ompt_team_info.parallel_id,
-            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
-            team->t.ompt_team_info.microtask);
+      if (ompt_enabled.ompt_callback_work) {
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_executor, ompt_scope_begin,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
       }
     } else {
-      if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
-        ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
-            team->t.ompt_team_info.parallel_id,
-            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+      if (ompt_enabled.ompt_callback_work) {
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_other, ompt_scope_begin,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_other, ompt_scope_end,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
       }
-      this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
     }
   }
 #endif
@@ -1483,16 +1732,17 @@
   __kmp_exit_single(global_tid);
   KMP_POP_PARTITIONED_TIMER();
 
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT && OMPT_OPTIONAL
   kmp_info_t *this_thr = __kmp_threads[global_tid];
   kmp_team_t *team = this_thr->th.th_team;
   int tid = __kmp_tid_from_gtid(global_tid);
 
-  if (ompt_enabled &&
-      ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
-    ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
-        team->t.ompt_team_info.parallel_id,
-        team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_single_executor, ompt_scope_end,
+        &(team->t.ompt_team_info.parallel_data),
+        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
+        OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
 }
@@ -1507,12 +1757,28 @@
 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_work_type_t ompt_work_type;
     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
-    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
-    ompt_callbacks.ompt_callback(ompt_event_loop_end)(team_info->parallel_id,
-                                                      task_info->task_id);
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+    // Determine workshare type
+    if (loc != NULL) {
+      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
+        ompt_work_type = ompt_work_loop;
+      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
+        ompt_work_type = ompt_work_sections;
+      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
+        ompt_work_type = ompt_work_distribute;
+      } else {
+        KMP_ASSERT2(0,
+                    "__kmpc_for_static_fini: can't determine workshare type");
+      }
+      KMP_DEBUG_ASSERT(ompt_work_type);
+    }
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
+        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
 
@@ -1709,6 +1975,15 @@
   if (didit)
     *data_ptr = cpy_data;
 
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    if (ompt_frame->reenter_runtime_frame == NULL)
+      ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
 /* This barrier is not a barrier region boundary */
 #if USE_ITT_NOTIFY
   __kmp_threads[gtid]->th.th_ident = loc;
@@ -1721,11 +1996,21 @@
 // Consider next barrier a user-visible barrier for barrier region boundaries
 // Nesting checks are already handled by the single construct checks
 
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
 #if USE_ITT_NOTIFY
   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
 // tasks can overwrite the location)
 #endif
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
 }
 
 /* -------------------------------------------------------------------------- */
@@ -1812,6 +2097,19 @@
   }
 
   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_lock, (omp_lock_hint_t)hint,
+        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+        codeptr);
+  }
+#endif
 }
 
 /* initialize the lock with a hint */
@@ -1823,6 +2121,19 @@
   }
 
   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
+        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+        codeptr);
+  }
+#endif
 }
 
 #endif // KMP_USE_DYNAMIC_LOCK
@@ -1837,6 +2148,19 @@
   }
   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+        codeptr);
+  }
+#endif
+
 #else // KMP_USE_DYNAMIC_LOCK
 
   static char const *const func = "omp_init_lock";
@@ -1867,9 +2191,15 @@
   INIT_LOCK(lck);
   __kmp_set_user_lock_location(lck, loc);
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
-    ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)user_lock, codeptr);
   }
 #endif
 
@@ -1890,6 +2220,19 @@
   }
   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+        codeptr);
+  }
+#endif
+
 #else // KMP_USE_DYNAMIC_LOCK
 
   static char const *const func = "omp_init_nest_lock";
@@ -1923,9 +2266,15 @@
   INIT_NESTED_LOCK(lck);
   __kmp_set_user_lock_location(lck, loc);
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
-    ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)user_lock, codeptr);
   }
 #endif
 
@@ -1948,6 +2297,22 @@
   }
   __kmp_itt_lock_destroyed(lck);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    kmp_user_lock_p lck;
+    if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
+      lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
+    } else {
+      lck = (kmp_user_lock_p)user_lock;
+    }
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
+  }
+#endif
   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
 #else
   kmp_user_lock_p lck;
@@ -1966,9 +2331,14 @@
     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
   }
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
-    ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
   }
 #endif
 
@@ -2001,6 +2371,16 @@
   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
   __kmp_itt_lock_destroyed(ilk->lock);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
+  }
+#endif
   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
 
 #else // KMP_USE_DYNAMIC_LOCK
@@ -2023,10 +2403,14 @@
     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
   }
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled &&
-      ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
-    ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
   }
 #endif
 
@@ -2063,6 +2447,18 @@
       (kmp_user_lock_p)
           user_lock); // itt function will get to the right lock object.
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+        codeptr);
+  }
+#endif
 #if KMP_USE_INLINED_TAS
   if (tag == locktag_tas && !__kmp_env_consistency_check) {
     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
@@ -2078,6 +2474,12 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
+  }
+#endif
 
 #else // KMP_USE_DYNAMIC_LOCK
 
@@ -2100,6 +2502,17 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquiring(lck);
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)lck, codeptr);
+  }
+#endif
 
   ACQUIRE_LOCK(lck, gtid);
 
@@ -2107,9 +2520,10 @@
   __kmp_itt_lock_acquired(lck);
 #endif /* USE_ITT_BUILD */
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
-    ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
   }
 #endif
 
@@ -2122,14 +2536,41 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
 #endif
-  KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled) {
+    if (ompt_enabled.ompt_callback_mutex_acquire) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+          ompt_mutex_nest_lock, omp_lock_hint_none,
+          __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+          codeptr);
+    }
+  }
+#endif
+  int acquire_status =
+      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
 #endif
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled) {
-    // missing support here: need to know whether acquired first or not
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
+      if (ompt_enabled.ompt_callback_mutex_acquired) {
+        // lock_first
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
+      }
+    } else {
+      if (ompt_enabled.ompt_callback_nest_lock) {
+        // lock_next
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
+      }
+    }
   }
 #endif
 
@@ -2156,6 +2597,19 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquiring(lck);
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled) {
+    if (ompt_enabled.ompt_callback_mutex_acquire) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+          ompt_mutex_nest_lock, omp_lock_hint_none,
+          __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
+    }
+  }
+#endif
 
   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
 
@@ -2163,16 +2617,20 @@
   __kmp_itt_lock_acquired(lck);
 #endif /* USE_ITT_BUILD */
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
-      if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
-        ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)(
-            (uint64_t)lck);
+      if (ompt_enabled.ompt_callback_mutex_acquired) {
+        // lock_first
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
+      }
     } else {
-      if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
-        ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)(
-            (uint64_t)lck);
+      if (ompt_enabled.ompt_callback_nest_lock) {
+        // lock_next
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
+      }
     }
   }
 #endif
@@ -2200,6 +2658,17 @@
     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
   }
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
+  }
+#endif
+
 #else // KMP_USE_DYNAMIC_LOCK
 
   kmp_user_lock_p lck;
@@ -2217,6 +2686,18 @@
 #endif /* USE_ITT_BUILD */
     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
     KMP_MB();
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    // This is the case, if called from omp_init_lock_with_hint:
+    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!codeptr)
+      codeptr = OMPT_GET_RETURN_ADDRESS(0);
+    if (ompt_enabled.ompt_callback_mutex_released) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+          ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
+    }
+#endif
+
     return;
 #else
     lck = (kmp_user_lock_p)user_lock;
@@ -2238,9 +2719,14 @@
 
   RELEASE_LOCK(lck, gtid);
 
-#if OMPT_SUPPORT && OMPT_BLAME
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
-    ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t)lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
   }
 #endif
 
@@ -2254,7 +2740,28 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
 #endif
-  KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
+  int release_status =
+      KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled) {
+    if (release_status == KMP_LOCK_RELEASED) {
+      if (ompt_enabled.ompt_callback_mutex_released) {
+        // release_lock_last
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
+      }
+    } else if (ompt_enabled.ompt_callback_nest_lock) {
+      // release_lock_prev
+      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+          ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
+    }
+  }
+#endif
 
 #else // KMP_USE_DYNAMIC_LOCK
 
@@ -2272,10 +2779,39 @@
 #if USE_ITT_BUILD
     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
 #endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    int release_status = KMP_LOCK_STILL_HELD;
+#endif
+
     if (--(tl->lk.depth_locked) == 0) {
       TCW_4(tl->lk.poll, 0);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+      release_status = KMP_LOCK_RELEASED;
+#endif
     }
     KMP_MB();
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    // This is the case, if called from omp_init_lock_with_hint:
+    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!codeptr)
+      codeptr = OMPT_GET_RETURN_ADDRESS(0);
+    if (ompt_enabled.enabled) {
+      if (release_status == KMP_LOCK_RELEASED) {
+        if (ompt_enabled.ompt_callback_mutex_released) {
+          // release_lock_last
+          ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+              ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
+        }
+      } else if (ompt_enabled.ompt_callback_nest_lock) {
+        // release_lock_previous
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
+      }
+    }
+#endif
+
     return;
 #else
     lck = (kmp_user_lock_p)user_lock;
@@ -2298,17 +2834,22 @@
 
   int release_status;
   release_status = RELEASE_NESTED_LOCK(lck, gtid);
-#if OMPT_SUPPORT && OMPT_BLAME
-  if (ompt_enabled) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled) {
     if (release_status == KMP_LOCK_RELEASED) {
-      if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
-        ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
-            (uint64_t)lck);
+      if (ompt_enabled.ompt_callback_mutex_released) {
+        // release_lock_last
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
       }
-    } else if (ompt_callbacks.ompt_callback(
-                   ompt_event_release_nest_lock_prev)) {
-      ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
-          (uint64_t)lck);
+    } else if (ompt_enabled.ompt_callback_nest_lock) {
+      // release_lock_previous
+      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+          ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
     }
   }
 #endif
@@ -2326,6 +2867,18 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+        codeptr);
+  }
+#endif
 #if KMP_USE_INLINED_TAS
   if (tag == locktag_tas && !__kmp_env_consistency_check) {
     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
@@ -2342,6 +2895,12 @@
 #if USE_ITT_BUILD
     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
+    }
+#endif
     return FTN_TRUE;
   } else {
 #if USE_ITT_BUILD
@@ -2372,6 +2931,17 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquiring(lck);
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)lck, codeptr);
+  }
+#endif
 
   rc = TEST_LOCK(lck, gtid);
 #if USE_ITT_BUILD
@@ -2381,6 +2951,13 @@
     __kmp_itt_lock_cancelled(lck);
   }
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
+  }
+#endif
+
   return (rc ? FTN_TRUE : FTN_FALSE);
 
 /* Can't use serial interval since not block structured */
@@ -2395,6 +2972,18 @@
 #if USE_ITT_BUILD
   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
+        codeptr);
+  }
+#endif
   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
 #if USE_ITT_BUILD
   if (rc) {
@@ -2403,6 +2992,23 @@
     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
   }
 #endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled && rc) {
+    if (rc == 1) {
+      if (ompt_enabled.ompt_callback_mutex_acquired) {
+        // lock_first
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
+      }
+    } else {
+      if (ompt_enabled.ompt_callback_nest_lock) {
+        // lock_next
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
+      }
+    }
+  }
+#endif
   return rc;
 
 #else // KMP_USE_DYNAMIC_LOCK
@@ -2430,6 +3036,19 @@
   __kmp_itt_lock_acquiring(lck);
 #endif /* USE_ITT_BUILD */
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // This is the case, if called from omp_init_lock_with_hint:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
+  }
+#endif
+
   rc = TEST_NESTED_LOCK(lck, gtid);
 #if USE_ITT_BUILD
   if (rc) {
@@ -2438,6 +3057,23 @@
     __kmp_itt_lock_cancelled(lck);
   }
 #endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled && rc) {
+    if (rc == 1) {
+      if (ompt_enabled.ompt_callback_mutex_acquired) {
+        // lock_first
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
+      }
+    } else {
+      if (ompt_enabled.ompt_callback_nest_lock) {
+        // lock_next
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
+      }
+    }
+  }
+#endif
   return rc;
 
 /* Can't use serial interval since not block structured */
@@ -2697,6 +3333,19 @@
 // this barrier should be invisible to a customer and to the threading profile
 // tool (it's neither a terminating barrier nor customer's code, it's
 // used for an internal purpose)
+#if OMPT_SUPPORT
+    // JP: can this barrier potentially lead to task scheduling?
+    // JP: as long as there is a barrier in the implementation, OMPT should
+    // and will provide the barrier events, so we set up the necessary
+    // frame/return addresses.
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->reenter_runtime_frame == NULL)
+        ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
 #if USE_ITT_NOTIFY
     __kmp_threads[global_tid]->th.th_ident = loc;
 #endif
@@ -2704,6 +3353,11 @@
         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
     retval = (retval != 0) ? (0) : (1);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->reenter_runtime_frame = NULL;
+    }
+#endif
 
     // all other workers except master should do this pop here
     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
@@ -2859,6 +3513,15 @@
 // case tree_reduce_block:
 // this barrier should be visible to a customer and to the threading profile
 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
+#if OMPT_SUPPORT
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->reenter_runtime_frame == NULL)
+        ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
 #if USE_ITT_NOTIFY
     __kmp_threads[global_tid]->th.th_ident =
         loc; // needed for correct notification of frames
@@ -2867,6 +3530,11 @@
         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
     retval = (retval != 0) ? (0) : (1);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->reenter_runtime_frame = NULL;
+    }
+#endif
 
     // all other workers except master should do this pop here
     // ( none of other workers except master will enter __kmpc_end_reduce() )
@@ -2916,28 +3584,70 @@
     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
 
 // TODO: implicit barrier: should be exposed
+#if OMPT_SUPPORT
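+    // Publish the reenter frame and return address so OMPT callbacks raised
+    // inside the implicit barrier can be attributed to this construct.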
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->reenter_runtime_frame == NULL)
+        ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
 #if USE_ITT_NOTIFY
     __kmp_threads[global_tid]->th.th_ident = loc;
 #endif
     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->reenter_runtime_frame = NULL;
+    }
+#endif
 
   } else if (packed_reduction_method == empty_reduce_block) {
 
 // usage: if team size==1, no synchronization is required (Intel platforms only)
 
 // TODO: implicit barrier: should be exposed
+#if OMPT_SUPPORT
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->reenter_runtime_frame == NULL)
+        ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
 #if USE_ITT_NOTIFY
     __kmp_threads[global_tid]->th.th_ident = loc;
 #endif
     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->reenter_runtime_frame = NULL;
+    }
+#endif
 
   } else if (packed_reduction_method == atomic_reduce_block) {
 
+#if OMPT_SUPPORT
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->reenter_runtime_frame == NULL)
+        ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
 // TODO: implicit barrier: should be exposed
 #if USE_ITT_NOTIFY
     __kmp_threads[global_tid]->th.th_ident = loc;
 #endif
     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->reenter_runtime_frame = NULL;
+    }
+#endif
 
   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                    tree_reduce_block)) {
Index: runtime/src/kmp_dispatch.cpp
===================================================================
--- runtime/src/kmp_dispatch.cpp
+++ runtime/src/kmp_dispatch.cpp
@@ -1230,12 +1230,14 @@
   }
 #endif // ( KMP_STATIC_STEAL_ENABLED )
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
-    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
-    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
-        team_info->parallel_id, task_info->task_id, team_info->microtask);
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+    kmp_info_t *thr = __kmp_threads[gtid];
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
+        &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
   }
 #endif
 }
@@ -1390,16 +1392,18 @@
 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
    is not called. */
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT && OMPT_OPTIONAL
 #define OMPT_LOOP_END                                                          \
   if (status == 0) {                                                           \
-    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
+    if (ompt_enabled.ompt_callback_work) {                                     \
       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
-      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
-      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
-          team_info->parallel_id, task_info->task_id);                         \
+      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
+      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
+          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
+          &(task_info->task_data), 0, codeptr);                                \
     }                                                                          \
   }
+// TODO: implement count
 #else
 #define OMPT_LOOP_END // no-op
 #endif
@@ -1407,7 +1411,12 @@
 template <typename T>
 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                                T *p_lb, T *p_ub,
-                               typename traits_t<T>::signed_t *p_st) {
+                               typename traits_t<T>::signed_t *p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                               ,
+                               void *codeptr
+#endif
+                               ) {
 
   typedef typename traits_t<T>::unsigned_t UT;
   typedef typename traits_t<T>::signed_t ST;
@@ -2527,6 +2536,9 @@
                             enum sched_type schedule, kmp_int32 lb,
                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
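+  // Save the caller's return address; __kmp_dispatch_init reports it as the
+  // codeptr of the ompt_callback_work event.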
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
 /*!
@@ -2536,6 +2548,9 @@
                              enum sched_type schedule, kmp_uint32 lb,
                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
 
@@ -2546,6 +2561,9 @@
                             enum sched_type schedule, kmp_int64 lb,
                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
 
@@ -2556,6 +2574,9 @@
                              enum sched_type schedule, kmp_uint64 lb,
                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
 
@@ -2573,6 +2594,9 @@
                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
@@ -2582,6 +2606,9 @@
                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                   kmp_int32 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
@@ -2591,6 +2618,9 @@
                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
@@ -2600,6 +2630,9 @@
                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                   kmp_int64 chunk) {
   KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
 }
@@ -2619,7 +2652,15 @@
 */
 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
-  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
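+  // Capture the call site here and forward it so OMPT_LOOP_END can report a
+  // codeptr once the loop runs out of work.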
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                        ,
+                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                            );
 }
 
 /*!
@@ -2628,7 +2669,15 @@
 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                             kmp_int32 *p_st) {
-  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                         ,
+                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                             );
 }
 
 /*!
@@ -2636,7 +2685,15 @@
 */
 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
-  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                        ,
+                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                            );
 }
 
 /*!
@@ -2645,7 +2702,15 @@
 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                             kmp_int64 *p_st) {
-  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                         ,
+                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                             );
 }
 
 /*!
Index: runtime/src/kmp_ftn_entry.h
===================================================================
--- runtime/src/kmp_ftn_entry.h
+++ runtime/src/kmp_ftn_entry.h
@@ -21,6 +21,10 @@
 
 #include "kmp_i18n.h"
 
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif // __cplusplus
@@ -340,6 +344,26 @@
 #endif
 }
 
+#if OMP_50_ENABLED
+int FTN_STDCALL FTN_CONTROL_TOOL(uint64_t command, uint64_t modifier,
+                                 void *arg) {
+#if defined(KMP_STUB) || !OMPT_SUPPORT
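+  // -2 corresponds to omp_control_tool_notool: no tool can be controlled.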
+  return -2;
+#else
+  OMPT_STORE_RETURN_ADDRESS(__kmp_entry_gtid());
+  if (!TCR_4(__kmp_init_middle)) {
+    return -2;
+  }
+  kmp_info_t *this_thr = __kmp_threads[__kmp_entry_gtid()];
+  ompt_task_info_t *parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
+  parent_task_info->frame.reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+  int ret = __kmp_control_tool(command, modifier, arg);
+  parent_task_info->frame.reenter_runtime_frame = 0;
+  return ret;
+#endif
+}
+#endif
+
 int FTN_STDCALL xexpand(FTN_GET_THREAD_NUM)(void) {
 #ifdef KMP_STUB
   return 0;
@@ -873,8 +897,11 @@
 #ifdef KMP_STUB
   *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
 #else
-  __kmpc_init_lock_with_hint(NULL, __kmp_entry_gtid(), user_lock,
-                             KMP_DEREF hint);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
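+  // Record the user's call site; the __kmpc_* lock entry points pick it up
+  // via OMPT_LOAD_RETURN_ADDRESS for their OMPT callbacks.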
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
 #endif
 }
 
@@ -883,8 +910,11 @@
 #ifdef KMP_STUB
   *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
 #else
-  __kmpc_init_nest_lock_with_hint(NULL, __kmp_entry_gtid(), user_lock,
-                                  KMP_DEREF hint);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_nest_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
 #endif
 }
 #endif
@@ -894,7 +924,11 @@
 #ifdef KMP_STUB
   *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
 #else
-  __kmpc_init_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -903,7 +937,11 @@
 #ifdef KMP_STUB
   *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
 #else
-  __kmpc_init_nest_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_nest_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -911,7 +949,11 @@
 #ifdef KMP_STUB
   *((kmp_stub_lock_t *)user_lock) = UNINIT;
 #else
-  __kmpc_destroy_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_destroy_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -919,7 +961,11 @@
 #ifdef KMP_STUB
   *((kmp_stub_lock_t *)user_lock) = UNINIT;
 #else
-  __kmpc_destroy_nest_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_destroy_nest_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -933,7 +979,11 @@
   }
   *((kmp_stub_lock_t *)user_lock) = LOCKED;
 #else
-  __kmpc_set_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_set_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -944,7 +994,11 @@
   }
   (*((int *)user_lock))++;
 #else
-  __kmpc_set_nest_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_set_nest_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -958,7 +1012,11 @@
   }
   *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
 #else
-  __kmpc_unset_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_unset_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -972,7 +1030,11 @@
   }
   (*((int *)user_lock))--;
 #else
-  __kmpc_unset_nest_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_unset_nest_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -987,7 +1049,11 @@
   *((kmp_stub_lock_t *)user_lock) = LOCKED;
   return 1;
 #else
-  return __kmpc_test_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmpc_test_lock(NULL, gtid, user_lock);
 #endif
 }
 
@@ -998,7 +1064,11 @@
   }
   return ++(*((int *)user_lock));
 #else
-  return __kmpc_test_nest_lock(NULL, __kmp_entry_gtid(), user_lock);
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmpc_test_nest_lock(NULL, gtid, user_lock);
 #endif
 }
 
Index: runtime/src/kmp_ftn_os.h
===================================================================
--- runtime/src/kmp_ftn_os.h
+++ runtime/src/kmp_ftn_os.h
@@ -133,6 +133,10 @@
 #endif
 #endif
 
+#if OMP_50_ENABLED
+#define FTN_CONTROL_TOOL omp_control_tool
+#endif
+
 #endif /* KMP_FTN_PLAIN */
 
 /* ------------------------------------------------------------------------ */
@@ -251,6 +255,10 @@
 #endif
 #endif
 
+#if OMP_50_ENABLED
+#define FTN_CONTROL_TOOL OMP_CONTROL_TOOL
+#endif
+
 #endif /* KMP_FTN_APPEND */
 
 /* ------------------------------------------------------------------------ */
@@ -369,6 +377,10 @@
 #endif
 #endif
 
+#if OMP_50_ENABLED
+#define FTN_CONTROL_TOOL OMP_CONTROL_TOOL
+#endif
+
 #endif /* KMP_FTN_UPPER */
 
 /* ------------------------------------------------------------------------ */
@@ -487,6 +499,10 @@
 #endif
 #endif
 
+#if OMP_50_ENABLED
+#define FTN_CONTROL_TOOL OMP_CONTROL_TOOL_
+#endif
+
 #endif /* KMP_FTN_UAPPEND */
 
 /* -------------------------- GOMP API NAMES ------------------------ */
Index: runtime/src/kmp_global.cpp
===================================================================
--- runtime/src/kmp_global.cpp
+++ runtime/src/kmp_global.cpp
@@ -303,6 +303,10 @@
 kmp_uint64 __kmp_taskloop_min_tasks = 0;
 #endif
 
+#if OMP_50_ENABLED && OMPT_SUPPORT
+char const *__kmp_tool_libraries = NULL;
+#endif
+
 /* This check ensures that the compiler is passing the correct data type for the
    flags formal parameter of the function kmpc_omp_task_alloc(). If the type is
    not a 4-byte type, then give an error message about a non-positive length
Index: runtime/src/kmp_gsupport.cpp
===================================================================
--- runtime/src/kmp_gsupport.cpp
+++ runtime/src/kmp_gsupport.cpp
@@ -31,14 +31,20 @@
   int gtid = __kmp_entry_gtid();
   MKLOC(loc, "GOMP_barrier");
   KA_TRACE(20, ("GOMP_barrier: T#%d\n", gtid));
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT && OMPT_OPTIONAL
   ompt_frame_t *ompt_frame;
-  if (ompt_enabled) {
-    ompt_frame = __ompt_get_task_frame_internal(0);
-    ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
   }
 #endif
   __kmpc_barrier(&loc, gtid);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
 }
 
 // Mutual exclusion
@@ -56,6 +62,9 @@
   int gtid = __kmp_entry_gtid();
   MKLOC(loc, "GOMP_critical_start");
   KA_TRACE(20, ("GOMP_critical_start: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmpc_critical(&loc, gtid, __kmp_unnamed_critical_addr);
 }
 
@@ -63,6 +72,9 @@
   int gtid = __kmp_get_gtid();
   MKLOC(loc, "GOMP_critical_end");
   KA_TRACE(20, ("GOMP_critical_end: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmpc_end_critical(&loc, gtid, __kmp_unnamed_critical_addr);
 }
 
@@ -111,7 +123,40 @@
   // 3rd parameter == FALSE prevents kmp_enter_single from pushing a
   // workshare when USE_CHECKS is defined.  We need to avoid the push,
   // as there is no corresponding GOMP_single_end() call.
-  return __kmp_enter_single(gtid, &loc, FALSE);
+  kmp_int32 rc = __kmp_enter_single(gtid, &loc, FALSE);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team = this_thr->th.th_team;
+  int tid = __kmp_tid_from_gtid(gtid);
+
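+  // The winner of the single region reports ompt_work_single_executor with
+  // scope_begin only; every other thread reports ompt_work_single_other with
+  // begin and end back to back, since it skips the region body entirely.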
+  if (ompt_enabled.enabled) {
+    if (rc) {
+      if (ompt_enabled.ompt_callback_work) {
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_executor, ompt_scope_begin,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
+      }
+    } else {
+      if (ompt_enabled.ompt_callback_work) {
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_other, ompt_scope_begin,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_other, ompt_scope_end,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
+      }
+    }
+  }
+#endif
+
+  return rc;
 }
 
 void *xexpand(KMP_API_NAME_GOMP_SINGLE_COPY_START)(void) {
@@ -129,14 +174,33 @@
   if (__kmp_enter_single(gtid, &loc, FALSE))
     return NULL;
 
-  // Wait for the first thread to set the copyprivate data pointer,
-  // and for all other threads to reach this point.
+// Wait for the first thread to set the copyprivate data pointer,
+// and for all other threads to reach this point.
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
 
   // Retrieve the value of the copyprivate data point, and wait for all
   // threads to do likewise, then return.
   retval = __kmp_team_from_gtid(gtid)->t.t_copypriv_data;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
   return retval;
 }
 
@@ -149,14 +213,35 @@
   // continuing, so that the know that the copyprivate data pointer has been
   // propagated to all threads before trying to reuse the t_copypriv_data field.
   __kmp_team_from_gtid(gtid)->t.t_copypriv_data = data;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
 }
 
 void xexpand(KMP_API_NAME_GOMP_ORDERED_START)(void) {
   int gtid = __kmp_entry_gtid();
   MKLOC(loc, "GOMP_ordered_start");
   KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmpc_ordered(&loc, gtid);
 }
 
@@ -164,6 +249,9 @@
   int gtid = __kmp_get_gtid();
   MKLOC(loc, "GOMP_ordered_end");
   KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
   __kmpc_end_ordered(&loc, gtid);
 }
 
@@ -197,26 +285,26 @@
 #if OMPT_SUPPORT
   kmp_info_t *thr;
   ompt_frame_t *ompt_frame;
-  ompt_state_t enclosing_state;
+  omp_state_t enclosing_state;
 
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     // get pointer to thread data structure
     thr = __kmp_threads[*gtid];
 
     // save enclosing task state; set current state for task
     enclosing_state = thr->th.ompt_thread_info.state;
-    thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    thr->th.ompt_thread_info.state = omp_state_work_parallel;
 
     // set task frame
-    ompt_frame = __ompt_get_task_frame_internal(0);
-    ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(0);
   }
 #endif
 
   task(data);
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     // clear task frame
     ompt_frame->exit_runtime_frame = NULL;
 
@@ -236,24 +324,29 @@
                                           enum sched_type schedule, long start,
                                           long end, long incr,
                                           long chunk_size) {
-  // Intialize the loop worksharing construct.
+// Initialize the loop worksharing construct.
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled)
+    OMPT_STORE_RETURN_ADDRESS(*gtid);
+#endif
   KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size,
                     schedule != kmp_sch_static);
 
 #if OMPT_SUPPORT
   kmp_info_t *thr;
   ompt_frame_t *ompt_frame;
-  ompt_state_t enclosing_state;
+  omp_state_t enclosing_state;
 
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     thr = __kmp_threads[*gtid];
     // save enclosing task state; set current state for task
     enclosing_state = thr->th.ompt_thread_info.state;
-    thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    thr->th.ompt_thread_info.state = omp_state_work_parallel;
 
     // set task frame
-    ompt_frame = __ompt_get_task_frame_internal(0);
-    ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(0);
   }
 #endif
 
@@ -261,7 +354,7 @@
   task(data);
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     // clear task frame
     ompt_frame->exit_runtime_frame = NULL;
 
@@ -285,11 +378,8 @@
   va_list ap;
   va_start(ap, argc);
 
-  rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc,
-#if OMPT_SUPPORT
-                       VOLATILE_CAST(void *) unwrapped_task,
-#endif
-                       wrapper, __kmp_invoke_task_func,
+  rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper,
+                       __kmp_invoke_task_func,
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                        &ap
 #else
@@ -304,18 +394,19 @@
   }
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-#if OMPT_TRACE
+  int ompt_team_size;
+  if (ompt_enabled.enabled) {
     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
-    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
 
     // implicit task callback
-    if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
-      ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
-          team_info->parallel_id, task_info->task_id);
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_team_size = __kmp_team_from_gtid(gtid)->t.t_nproc;
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), ompt_team_size, __kmp_tid_from_gtid(gtid));
     }
-#endif
-    thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    thr->th.ompt_thread_info.state = omp_state_work_parallel;
   }
 #endif
 }
@@ -323,47 +414,9 @@
 static void __kmp_GOMP_serialized_parallel(ident_t *loc, kmp_int32 gtid,
                                            void (*task)(void *)) {
 #if OMPT_SUPPORT
-  ompt_parallel_id_t ompt_parallel_id;
-  if (ompt_enabled) {
-    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
-
-    ompt_parallel_id = __ompt_parallel_id_new(gtid);
-
-    // parallel region callback
-    if (ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
-      int team_size = 1;
-      ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
-          task_info->task_id, &task_info->frame, ompt_parallel_id, team_size,
-          (void *)task, OMPT_INVOKER(fork_context_gnu));
-    }
-  }
+  OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
-
   __kmp_serialized_parallel(loc, gtid);
-
-#if OMPT_SUPPORT
-  if (ompt_enabled) {
-    kmp_info_t *thr = __kmp_threads[gtid];
-
-    ompt_task_id_t my_ompt_task_id = __ompt_task_id_new(gtid);
-
-    // set up lightweight task
-    ompt_lw_taskteam_t *lwt =
-        (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
-    __ompt_lw_taskteam_init(lwt, thr, gtid, (void *)task, ompt_parallel_id);
-    lwt->ompt_task_info.task_id = my_ompt_task_id;
-    __ompt_lw_taskteam_link(lwt, thr);
-
-#if OMPT_TRACE
-    // implicit task callback
-    if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
-      ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
-          ompt_parallel_id, my_ompt_task_id);
-    }
-    thr->th.ompt_thread_info.state = ompt_state_work_parallel;
-#endif
-  }
-#endif
 }
 
 void xexpand(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), void *data,
@@ -373,10 +426,11 @@
 #if OMPT_SUPPORT
   ompt_frame_t *parent_frame, *frame;
 
-  if (ompt_enabled) {
-    parent_frame = __ompt_get_task_frame_internal(0);
-    parent_frame->reenter_runtime_frame = __builtin_frame_address(1);
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL);
+    parent_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
   }
+  OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
 
   MKLOC(loc, "GOMP_parallel_start");
@@ -394,9 +448,9 @@
   }
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-    frame = __ompt_get_task_frame_internal(0);
-    frame->exit_runtime_frame = __builtin_frame_address(1);
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &frame, NULL, NULL);
+    frame->exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
   }
 #endif
 }
@@ -404,44 +458,23 @@
 void xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(void) {
   int gtid = __kmp_get_gtid();
   kmp_info_t *thr;
+  int ompt_team_size = __kmp_team_from_gtid(gtid)->t.t_nproc;
 
   thr = __kmp_threads[gtid];
 
   MKLOC(loc, "GOMP_parallel_end");
   KA_TRACE(20, ("GOMP_parallel_end: T#%d\n", gtid));
 
-#if OMPT_SUPPORT
-  ompt_parallel_id_t parallel_id;
-  ompt_task_id_t serialized_task_id;
-  ompt_frame_t *ompt_frame = NULL;
-
-  if (ompt_enabled) {
-    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
-    parallel_id = team_info->parallel_id;
-
-    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
-    serialized_task_id = task_info->task_id;
-
-    // unlink if necessary. no-op if there is not a lightweight task.
-    ompt_lw_taskteam_t *lwt = __ompt_lw_taskteam_unlink(thr);
-    // GOMP allocates/frees lwt since it can't be kept on the stack
-    if (lwt) {
-      __kmp_free(lwt);
-    }
-  }
-#endif
-
   if (!thr->th.th_team->t.t_serialized) {
     __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr,
                                  thr->th.th_team);
 
 #if OMPT_SUPPORT
-    if (ompt_enabled) {
+    if (ompt_enabled.enabled) {
       // Implicit task is finished here, in the barrier we might schedule
       // deferred tasks,
       // these don't see the implicit task on the stack
-      ompt_frame = __ompt_get_task_frame_internal(0);
-      ompt_frame->exit_runtime_frame = NULL;
+      OMPT_CUR_TASK_INFO(thr)->frame.exit_runtime_frame = NULL;
     }
 #endif
 
@@ -452,35 +485,7 @@
 #endif
                     );
   } else {
-#if OMPT_SUPPORT && OMPT_TRACE
-    if (ompt_enabled &&
-        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
-      ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
-          parallel_id, serialized_task_id);
-    }
-#endif
-
     __kmpc_end_serialized_parallel(&loc, gtid);
-
-#if OMPT_SUPPORT
-    if (ompt_enabled) {
-      // Record that we re-entered the runtime system in the frame that
-      // created the parallel region.
-      ompt_task_info_t *parent_task_info = __ompt_get_taskinfo(0);
-
-      if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
-        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
-            parallel_id, parent_task_info->task_id,
-            OMPT_INVOKER(fork_context_gnu));
-      }
-
-      parent_task_info->frame.reenter_runtime_frame = NULL;
-
-      thr->th.ompt_thread_info.state =
-          (((thr->th.th_team)->t.t_serialized) ? ompt_state_work_serial
-                                               : ompt_state_work_parallel);
-    }
-#endif
   }
 }
 
@@ -508,6 +513,12 @@
 // num and calculate the iteration space using the result.  It doesn't do this
 // with ordered static loop, so they can be checked.
 
+#if OMPT_SUPPORT
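+// Expands its argument only when OMPT is compiled in; used below to drop
+// OMPT_STORE_RETURN_ADDRESS calls into the GOMP loop macros.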
+#define IF_OMPT_SUPPORT(code) code
+#else
+#define IF_OMPT_SUPPORT(code)
+#endif
+
 #define LOOP_START(func, schedule)                                             \
   int func(long lb, long ub, long str, long chunk_sz, long *p_lb,              \
            long *p_ub) {                                                       \
@@ -520,9 +531,11 @@
               gtid, lb, ub, str, chunk_sz));                                   \
                                                                                \
     if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
       KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
                         (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,        \
                         (schedule) != kmp_sch_static);                         \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
       status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
                                  (kmp_int *)p_ub, (kmp_int *)&stride);         \
       if (status) {                                                            \
@@ -551,8 +564,10 @@
               gtid, lb, ub, str, chunk_sz));                                   \
                                                                                \
     if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
       KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
                         (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
       status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
                                  (kmp_int *)p_ub, (kmp_int *)&stride);         \
       if (status) {                                                            \
@@ -577,6 +592,7 @@
     MKLOC(loc, #func);                                                         \
     KA_TRACE(20, (#func ": T#%d\n", gtid));                                    \
                                                                                \
+    IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                          \
     fini_code status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,    \
                                          (kmp_int *)p_ub, (kmp_int *)&stride); \
     if (status) {                                                              \
@@ -621,7 +637,20 @@
   int gtid = __kmp_get_gtid();
   KA_TRACE(20, ("GOMP_loop_end: T#%d\n", gtid))
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
 
   KA_TRACE(20, ("GOMP_loop_end exit: T#%d\n", gtid))
 }
@@ -796,17 +825,18 @@
     KA_TRACE(20, (#func " exit: T#%d\n", gtid));                               \
   }
 
-#if OMPT_SUPPORT
+#if OMPT_SUPPORT && OMPT_OPTIONAL
 
 #define OMPT_LOOP_PRE()                                                        \
   ompt_frame_t *parent_frame;                                                  \
-  if (ompt_enabled) {                                                          \
-    parent_frame = __ompt_get_task_frame_internal(0);                          \
-    parent_frame->reenter_runtime_frame = __builtin_frame_address(1);          \
-  }
+  if (ompt_enabled.enabled) {                                                  \
+    __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL);   \
+    parent_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);           \
+  }                                                                            \
+  OMPT_STORE_RETURN_ADDRESS(gtid);
 
 #define OMPT_LOOP_POST()                                                       \
-  if (ompt_enabled) {                                                          \
+  if (ompt_enabled.enabled) {                                                  \
     parent_frame->reenter_runtime_frame = NULL;                                \
   }
 
@@ -878,6 +908,16 @@
     }
   }
 
+#if OMPT_SUPPORT
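+  // Remember the encountering task so its reenter frame can be cleared again
+  // once the task has been dispatched or executed below.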
+  kmp_taskdata_t *current_task;
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    current_task = __kmp_threads[gtid]->th.th_current_task;
+    current_task->ompt_task_info.frame.reenter_runtime_frame =
+        OMPT_GET_FRAME_ADDRESS(1);
+  }
+#endif
+
   if (if_cond) {
 #if OMP_40_ENABLED
     if (gomp_flags & 8) {
@@ -893,23 +933,26 @@
         dep_list[i].flags.out = (i < nout);
       }
       __kmpc_omp_task_with_deps(&loc, gtid, task, ndeps, dep_list, 0, NULL);
-    } else
+    } else {
 #endif
       __kmpc_omp_task(&loc, gtid, task);
+#if OMP_40_ENABLED
+    }
+#endif
   } else {
 #if OMPT_SUPPORT
     ompt_thread_info_t oldInfo;
     kmp_info_t *thread;
     kmp_taskdata_t *taskdata;
-    if (ompt_enabled) {
+    kmp_taskdata_t *current_task;
+    if (ompt_enabled.enabled) {
       // Store the threads states and restore them after the task
       thread = __kmp_threads[gtid];
       taskdata = KMP_TASK_TO_TASKDATA(task);
       oldInfo = thread->th.ompt_thread_info;
       thread->th.ompt_thread_info.wait_id = 0;
-      thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+      thread->th.ompt_thread_info.state = omp_state_work_parallel;
       taskdata->ompt_task_info.frame.exit_runtime_frame =
-          __builtin_frame_address(0);
+          OMPT_GET_FRAME_ADDRESS(0);
+      OMPT_STORE_RETURN_ADDRESS(gtid);
     }
 #endif
 
@@ -918,12 +961,17 @@
     __kmpc_omp_task_complete_if0(&loc, gtid, task);
 
 #if OMPT_SUPPORT
-    if (ompt_enabled) {
+    if (ompt_enabled.enabled) {
       thread->th.ompt_thread_info = oldInfo;
       taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
     }
 #endif
   }
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    current_task->ompt_task_info.frame.reenter_runtime_frame = NULL;
+  }
+#endif
 
   KA_TRACE(20, ("GOMP_task exit: T#%d\n", gtid));
 }
@@ -932,6 +980,11 @@
   MKLOC(loc, "GOMP_taskwait");
   int gtid = __kmp_entry_gtid();
 
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled)
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
   KA_TRACE(20, ("GOMP_taskwait: T#%d\n", gtid));
 
   __kmpc_omp_taskwait(&loc, gtid);
@@ -1001,10 +1054,11 @@
 #if OMPT_SUPPORT
   ompt_frame_t *parent_frame;
 
-  if (ompt_enabled) {
-    parent_frame = __ompt_get_task_frame_internal(0);
-    parent_frame->reenter_runtime_frame = __builtin_frame_address(1);
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL);
+    parent_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
   }
+  OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
 
   MKLOC(loc, "GOMP_parallel_sections_start");
@@ -1023,7 +1077,7 @@
   }
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     parent_frame->reenter_runtime_frame = NULL;
   }
 #endif
@@ -1037,7 +1091,20 @@
   int gtid = __kmp_get_gtid();
   KA_TRACE(20, ("GOMP_sections_end: T#%d\n", gtid))
 
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
 
   KA_TRACE(20, ("GOMP_sections_end exit: T#%d\n", gtid))
 }
@@ -1063,10 +1130,11 @@
 
 #if OMPT_SUPPORT
   ompt_task_info_t *parent_task_info, *task_info;
-  if (ompt_enabled) {
-    parent_task_info = __ompt_get_taskinfo(0);
-    parent_task_info->frame.reenter_runtime_frame = __builtin_frame_address(1);
+  if (ompt_enabled.enabled) {
+    parent_task_info = __ompt_get_task_info_object(0);
+    parent_task_info->frame.reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
   }
+  OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
   if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
     if (num_threads != 0) {
@@ -1082,15 +1150,20 @@
     __kmp_GOMP_serialized_parallel(&loc, gtid, task);
   }
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-    task_info = __ompt_get_taskinfo(0);
-    task_info->frame.exit_runtime_frame = __builtin_frame_address(0);
+  if (ompt_enabled.enabled) {
+    task_info = __ompt_get_task_info_object(0);
+    task_info->frame.exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(0);
   }
 #endif
   task(data);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
   xexpand(KMP_API_NAME_GOMP_PARALLEL_END)();
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     task_info->frame.exit_runtime_frame = NULL;
     parent_task_info->frame.reenter_runtime_frame = NULL;
   }
@@ -1106,6 +1179,10 @@
   MKLOC(loc, "GOMP_parallel_sections");
   KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid));
 
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
   if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
     if (num_threads != 0) {
       __kmp_push_num_threads(&loc, gtid, num_threads);
@@ -1153,6 +1230,7 @@
       __kmp_GOMP_serialized_parallel(&loc, gtid, task);                        \
     }                                                                          \
                                                                                \
+    IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                          \
     KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                              \
                       (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,          \
                       (schedule) != kmp_sch_static);                           \
@@ -1177,6 +1255,11 @@
   MKLOC(loc, "GOMP_taskgroup_start");
   KA_TRACE(20, ("GOMP_taskgroup_start: T#%d\n", gtid));
 
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled)
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
   __kmpc_taskgroup(&loc, gtid);
 
   return;
@@ -1187,6 +1270,11 @@
   MKLOC(loc, "GOMP_taskgroup_end");
   KA_TRACE(20, ("GOMP_taskgroup_end: T#%d\n", gtid));
 
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled)
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
   __kmpc_end_taskgroup(&loc, gtid);
 
   return;
Index: runtime/src/kmp_lock.h
===================================================================
--- runtime/src/kmp_lock.h
+++ runtime/src/kmp_lock.h
@@ -1142,7 +1142,7 @@
 // with/without consistency checking.
 extern void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t);
 extern void (*__kmp_direct_destroy[])(kmp_dyna_lock_t *);
-extern void (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32);
+extern int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32);
 extern int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32);
 extern int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32);
 
@@ -1150,7 +1150,7 @@
 // with/withuot consistency checking.
 extern void (*__kmp_indirect_init[])(kmp_user_lock_p);
 extern void (*__kmp_indirect_destroy[])(kmp_user_lock_p);
-extern void (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32);
+extern int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32);
 extern int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32);
 extern int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32);
 
Index: runtime/src/kmp_lock.cpp
===================================================================
--- runtime/src/kmp_lock.cpp
+++ runtime/src/kmp_lock.cpp
@@ -1135,7 +1135,7 @@
   kmp_int32 need_mf = 1;
 
 #if OMPT_SUPPORT
-  ompt_state_t prev_state = ompt_state_undefined;
+  omp_state_t prev_state = omp_state_undefined;
 #endif
 
   KA_TRACE(1000,
@@ -1243,7 +1243,7 @@
 #endif
 
 #if OMPT_SUPPORT
-        if (ompt_enabled && prev_state != ompt_state_undefined) {
+        if (ompt_enabled.enabled && prev_state != omp_state_undefined) {
           /* change the state before clearing wait_id */
           this_thr->th.ompt_thread_info.state = prev_state;
           this_thr->th.ompt_thread_info.wait_id = 0;
@@ -1258,11 +1258,11 @@
     }
 
 #if OMPT_SUPPORT
-    if (ompt_enabled && prev_state == ompt_state_undefined) {
+    if (ompt_enabled.enabled && prev_state == omp_state_undefined) {
       /* this thread will spin; set wait_id before entering wait state */
       prev_state = this_thr->th.ompt_thread_info.state;
       this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck;
-      this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
+      this_thr->th.ompt_thread_info.state = omp_state_wait_lock;
     }
 #endif
 
@@ -2911,11 +2911,11 @@
 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l,
                                      kmp_dyna_lockseq_t tag);
 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock);
-static void __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
+static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
-static void __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
-                                                kmp_int32);
+static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                               kmp_int32);
 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
                                                  kmp_int32);
 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
@@ -2938,14 +2938,13 @@
 
 // set/acquire functions
 #define expand(l, op)                                                          \
-  0, (void (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
-static void (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = {
+  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
+static int (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = {
     __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)};
 #undef expand
 #define expand(l, op)                                                          \
-  0, (void (*)(kmp_dyna_lock_t *,                                              \
-               kmp_int32))__kmp_##op##_##l##_lock_with_checks,
-static void (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = {
+  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
+static int (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = {
     __kmp_set_indirect_lock_with_checks, 0,
     KMP_FOREACH_D_LOCK(expand, acquire)};
 #undef expand
@@ -2968,7 +2967,7 @@
 #undef expand
 
 // Exposes only one set of jump tables (*lock or *lock_with_checks).
-void (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0;
+int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0;
 int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32) = 0;
 int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32) = 0;
 
@@ -2982,13 +2981,13 @@
 
 // set/acquire functions
 #define expand(l, op)                                                          \
-  (void (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
-static void (*indirect_set[])(kmp_user_lock_p, kmp_int32) = {
-    KMP_FOREACH_I_LOCK(expand, acquire)};
+  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
+static int (*indirect_set[])(kmp_user_lock_p,
+                             kmp_int32) = {KMP_FOREACH_I_LOCK(expand, acquire)};
 #undef expand
 #define expand(l, op)                                                          \
-  (void (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
-static void (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = {
+  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
+static int (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = {
     KMP_FOREACH_I_LOCK(expand, acquire)};
 #undef expand
 
@@ -3009,7 +3008,7 @@
 #undef expand
 
 // Exposes only one jump tables (*lock or *lock_with_checks).
-void (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0;
+int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0;
 int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32) = 0;
 int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32) = 0;
 
@@ -3164,9 +3163,9 @@
   __kmp_release_lock(&__kmp_global_lock, gtid);
 }
 
-static void __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
+static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
-  KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
+  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
 }
 
 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
@@ -3179,11 +3178,11 @@
   return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
 }
 
-static void __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
-                                                kmp_int32 gtid) {
+static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                               kmp_int32 gtid) {
   kmp_indirect_lock_t *l =
       __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
-  KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
+  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
 }
 
 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
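
The lock hunks above change the acquire jump tables and the indirect-lock wrappers from void to int return types, so the value returned by the selected lock routine is handed back to the caller instead of being dropped. A minimal, self-contained sketch of that dispatch-table pattern follows; the toy_* names and the lock type are hypothetical stand-ins for the kmp_dyna_lock_t machinery and only illustrate why the table element type and the wrapper's return type have to change together.

/* Sketch only: hypothetical lock type and dispatch table illustrating the
 * void -> int change in the acquire jump tables above. */
#include <stdio.h>

typedef struct { int owner; } toy_lock_t; /* hypothetical lock type */

static int toy_acquire_a(toy_lock_t *l, int gtid) { l->owner = gtid; return 1; }
static int toy_acquire_b(toy_lock_t *l, int gtid) { l->owner = gtid; return 2; }

/* Table elements return int so the status code of the selected lock
 * implementation reaches the caller, mirroring __kmp_direct_set above. */
static int (*toy_set[])(toy_lock_t *, int) = {toy_acquire_a, toy_acquire_b};

static int toy_set_lock(toy_lock_t *l, int kind, int gtid) {
  return toy_set[kind](l, gtid); /* propagate, like __kmp_set_indirect_lock */
}

int main(void) {
  toy_lock_t lck = {0};
  int status = toy_set_lock(&lck, 1, 7);
  printf("status=%d owner=%d\n", status, lck.owner);
  return 0;
}
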
Index: runtime/src/kmp_runtime.cpp
===================================================================
--- runtime/src/kmp_runtime.cpp
+++ runtime/src/kmp_runtime.cpp
@@ -722,16 +722,6 @@
     /* TODO replace with general release procedure */
     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
 
-#if OMPT_SUPPORT && OMPT_BLAME
-    if (ompt_enabled &&
-        ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
-      /* accept blame for "ordered" waiting */
-      kmp_info_t *this_thread = __kmp_threads[gtid];
-      ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
-          this_thread->th.ompt_thread_info.wait_id);
-    }
-#endif
-
     KMP_MB(); /* Flush all pending memory write invalidates.  */
   }
 #endif /* BUILD_PARALLEL_ORDERED */
@@ -1204,6 +1194,28 @@
   this_thr->th.th_set_proc_bind = proc_bind_default;
 #endif /* OMP_40_ENABLED */
 
+#if OMPT_SUPPORT
+  ompt_data_t ompt_parallel_data;
+  ompt_parallel_data.ptr = NULL;
+  ompt_data_t *implicit_task_data;
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+  if (ompt_enabled.enabled &&
+      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
+
+    ompt_task_info_t *parent_task_info;
+    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
+
+    parent_task_info->frame.reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+    if (ompt_enabled.ompt_callback_parallel_begin) {
+      int team_size = 1;
+
+      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
+          &(parent_task_info->task_data), &(parent_task_info->frame),
+          &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
+    }
+  }
+#endif // OMPT_SUPPORT
+
   if (this_thr->th.th_team != serial_team) {
     // Nested level will be an index in the nested nthreads array
     int level = this_thr->th.th_team->t.t_level;
@@ -1215,13 +1227,9 @@
 
       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
 
-#if OMPT_SUPPORT
-      ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
-#endif
-
       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
 #if OMPT_SUPPORT
-                                     ompt_parallel_id,
+                                     ompt_parallel_data,
 #endif
 #if OMP_40_ENABLED
                                      proc_bind,
@@ -1316,11 +1324,6 @@
     }
     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
 
-#if OMPT_SUPPORT
-    ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
-    __ompt_team_assign_id(serial_team, ompt_parallel_id);
-#endif
-
     KMP_MB();
 
   } else {
@@ -1364,17 +1367,41 @@
 
   if (__kmp_env_consistency_check)
     __kmp_push_parallel(global_tid, NULL);
+#if OMPT_SUPPORT
+  serial_team->t.ompt_team_info.master_return_address = codeptr;
+  if (ompt_enabled.enabled &&
+      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
+    OMPT_CUR_TASK_INFO(this_thr)
+        ->frame.exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+
+    ompt_lw_taskteam_t lw_taskteam;
+    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
+                            &ompt_parallel_data, codeptr);
+
+    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
+    // don't use lw_taskteam after linking. content was swapped
+
+    /* OMPT implicit task begin */
+    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
+          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
+    }
+
+    /* OMPT state */
+    this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
+    OMPT_CUR_TASK_INFO(this_thr)
+        ->frame.exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
+  }
+#endif
 }
 
 /* most of the work for a fork */
 /* return true if we really went parallel, false if serialized */
 int __kmp_fork_call(ident_t *loc, int gtid,
                     enum fork_context_e call_context, // Intel, GNU, ...
-                    kmp_int32 argc,
-#if OMPT_SUPPORT
-                    void *unwrapped_task,
-#endif
-                    microtask_t microtask, launch_t invoker,
+                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                     va_list *ap
@@ -1432,16 +1459,17 @@
     master_set_numthreads = master_th->th.th_set_nproc;
 
 #if OMPT_SUPPORT
-    ompt_parallel_id_t ompt_parallel_id;
-    ompt_task_id_t ompt_task_id;
+    ompt_data_t ompt_parallel_data;
+    ompt_parallel_data.ptr = NULL;
+    ompt_data_t *parent_task_data;
     ompt_frame_t *ompt_frame;
-    ompt_task_id_t my_task_id;
-    ompt_parallel_id_t my_parallel_id;
+    ompt_data_t *implicit_task_data;
+    void *return_address = NULL;
 
-    if (ompt_enabled) {
-      ompt_parallel_id = __ompt_parallel_id_new(gtid);
-      ompt_task_id = __ompt_get_task_id_internal(0);
-      ompt_frame = __ompt_get_task_frame_internal(0);
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
+                                    NULL, NULL);
+      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
     }
 #endif
 
@@ -1465,13 +1493,16 @@
 #endif
 
 #if OMPT_SUPPORT
-    if (ompt_enabled &&
-        ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
-      int team_size = master_set_numthreads;
-
-      ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
-          ompt_task_id, ompt_frame, ompt_parallel_id, team_size, unwrapped_task,
-          OMPT_INVOKER(call_context));
+    if (ompt_enabled.enabled) {
+      if (ompt_enabled.ompt_callback_parallel_begin) {
+        int team_size = master_set_numthreads
+                            ? master_set_numthreads
+                            : get__nproc_2(parent_team, master_tid);
+        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
+            parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
+            OMPT_INVOKER(call_context), return_address);
+      }
+      master_th->th.ompt_thread_info.state = omp_state_overhead;
     }
 #endif
 
@@ -1508,27 +1539,25 @@
 
         ompt_lw_taskteam_t lw_taskteam;
 
-        if (ompt_enabled) {
-          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, unwrapped_task,
-                                  ompt_parallel_id);
-          lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+        if (ompt_enabled.enabled) {
+          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                                  &ompt_parallel_data, return_address);
           exit_runtime_p =
               &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
 
-          __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+          // don't use lw_taskteam after linking. content was swapped
 
-#if OMPT_TRACE
           /* OMPT implicit task begin */
-          my_task_id = lw_taskteam.ompt_task_info.task_id;
-          my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
-          if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
-            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
-                my_parallel_id, my_task_id);
+          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
+          if (ompt_enabled.ompt_callback_implicit_task) {
+            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+                implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
           }
-#endif
 
           /* OMPT state */
-          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+          master_th->th.ompt_thread_info.state = omp_state_work_parallel;
         } else {
           exit_runtime_p = &dummy;
         }
@@ -1547,34 +1576,27 @@
 
 #if OMPT_SUPPORT
         *exit_runtime_p = NULL;
-        if (ompt_enabled) {
-#if OMPT_TRACE
-          lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
-
-          if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
-            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
-                ompt_parallel_id, ompt_task_id);
+        if (ompt_enabled.enabled) {
+          OMPT_CUR_TASK_INFO(master_th)->frame.exit_runtime_frame = NULL;
+          if (ompt_enabled.ompt_callback_implicit_task) {
+            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                ompt_scope_end, NULL, implicit_task_data, 1,
+                __kmp_tid_from_gtid(gtid));
           }
-
           __ompt_lw_taskteam_unlink(master_th);
-          // reset clear the task id only after unlinking the task
-          lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
-#endif
 
-          if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
-            ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
-                ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
+          if (ompt_enabled.ompt_callback_parallel_end) {
+            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+                OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
+                OMPT_INVOKER(call_context), return_address);
           }
-          master_th->th.ompt_thread_info.state = ompt_state_overhead;
+          master_th->th.ompt_thread_info.state = omp_state_overhead;
         }
 #endif
         return TRUE;
       }
 
       parent_team->t.t_pkfn = microtask;
-#if OMPT_SUPPORT
-      parent_team->t.ompt_team_info.microtask = unwrapped_task;
-#endif
       parent_team->t.t_invoke = invoker;
       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
       parent_team->t.t_active_level++;
@@ -1726,28 +1748,27 @@
 #if OMPT_SUPPORT
           void *dummy;
           void **exit_runtime_p;
+          ompt_task_info_t *task_info;
 
           ompt_lw_taskteam_t lw_taskteam;
 
-          if (ompt_enabled) {
+          if (ompt_enabled.enabled) {
             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
-                                    unwrapped_task, ompt_parallel_id);
-            lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
-            exit_runtime_p =
-                &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
-
-            __ompt_lw_taskteam_link(&lw_taskteam, master_th);
-
-#if OMPT_TRACE
-            my_task_id = lw_taskteam.ompt_task_info.task_id;
-            if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
-              ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
-                  ompt_parallel_id, my_task_id);
+                                    &ompt_parallel_data, return_address);
+
+            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+            // don't use lw_taskteam after linking. content was swapped
+
+            task_info = OMPT_CUR_TASK_INFO(master_th);
+            exit_runtime_p = &(task_info->frame.exit_runtime_frame);
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+                  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
             }
-#endif
 
             /* OMPT state */
-            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+            master_th->th.ompt_thread_info.state = omp_state_work_parallel;
           } else {
             exit_runtime_p = &dummy;
           }
@@ -1766,26 +1787,21 @@
           }
 
 #if OMPT_SUPPORT
-          *exit_runtime_p = NULL;
-          if (ompt_enabled) {
-            lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
-
-#if OMPT_TRACE
-            if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
-              ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
-                  ompt_parallel_id, ompt_task_id);
+          if (ompt_enabled.enabled) {
+            exit_runtime_p = NULL;
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_end, NULL, &(task_info->task_data), 1,
+                  __kmp_tid_from_gtid(gtid));
             }
-#endif
 
             __ompt_lw_taskteam_unlink(master_th);
-            // reset clear the task id only after unlinking the task
-            lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
-
-            if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
-              ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
-                  ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
+            if (ompt_enabled.ompt_callback_parallel_end) {
+              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+                  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
+                  OMPT_INVOKER(call_context), return_address);
             }
-            master_th->th.ompt_thread_info.state = ompt_state_overhead;
+            master_th->th.ompt_thread_info.state = omp_state_overhead;
           }
 #endif
         } else if (microtask == (microtask_t)__kmp_teams_master) {
@@ -1834,30 +1850,28 @@
 #if OMPT_SUPPORT
           void *dummy;
           void **exit_runtime_p;
+          ompt_task_info_t *task_info;
 
           ompt_lw_taskteam_t lw_taskteam;
 
-          if (ompt_enabled) {
+          if (ompt_enabled.enabled) {
             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
-                                    unwrapped_task, ompt_parallel_id);
-            lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
-            exit_runtime_p =
-                &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
-
-            __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+                                    &ompt_parallel_data, return_address);
+            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+            // don't use lw_taskteam after linking. content was swapped
+            task_info = OMPT_CUR_TASK_INFO(master_th);
+            exit_runtime_p = &(task_info->frame.exit_runtime_frame);
 
-#if OMPT_TRACE
             /* OMPT implicit task begin */
-            my_task_id = lw_taskteam.ompt_task_info.task_id;
-            my_parallel_id = ompt_parallel_id;
-            if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
-              ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
-                  my_parallel_id, my_task_id);
+            implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
             }
-#endif
 
             /* OMPT state */
-            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+            master_th->th.ompt_thread_info.state = omp_state_work_parallel;
           } else {
             exit_runtime_p = &dummy;
           }
@@ -1875,26 +1889,22 @@
           }
 
 #if OMPT_SUPPORT
-          *exit_runtime_p = NULL;
-          if (ompt_enabled) {
-#if OMPT_TRACE
-            lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
-
-            if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
-              ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
-                  my_parallel_id, my_task_id);
+          if (ompt_enabled.enabled) {
+            *exit_runtime_p = NULL;
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_end, NULL, &(task_info->task_data), 1,
+                  __kmp_tid_from_gtid(gtid));
             }
-#endif
 
+            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
             __ompt_lw_taskteam_unlink(master_th);
-            // reset clear the task id only after unlinking the task
-            lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
-
-            if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
-              ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
-                  ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
+            if (ompt_enabled.ompt_callback_parallel_end) {
+              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+                  &ompt_parallel_data, parent_task_data,
+                  OMPT_INVOKER(call_context), return_address);
             }
-            master_th->th.ompt_thread_info.state = ompt_state_overhead;
+            master_th->th.ompt_thread_info.state = omp_state_overhead;
           }
 #endif
 #if OMP_40_ENABLED
@@ -1902,14 +1912,13 @@
 #endif /* OMP_40_ENABLED */
       } else if (call_context == fork_context_gnu) {
 #if OMPT_SUPPORT
-        ompt_lw_taskteam_t *lwt =
-            (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
-        __ompt_lw_taskteam_init(lwt, master_th, gtid, unwrapped_task,
-                                ompt_parallel_id);
+        ompt_lw_taskteam_t lwt;
+        __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
+                                return_address);
 
-        lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
-        lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
-        __ompt_lw_taskteam_link(lwt, master_th);
+        lwt.ompt_task_info.frame.exit_runtime_frame = NULL;
+        __ompt_lw_taskteam_link(&lwt, master_th, 1);
+// don't use lw_taskteam after linking. content was swapped
 #endif
 
         // we were called from GNU native code
@@ -2004,7 +2013,7 @@
       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
       team = __kmp_allocate_team(root, nthreads, nthreads,
 #if OMPT_SUPPORT
-                                 ompt_parallel_id,
+                                 ompt_parallel_data,
 #endif
 #if OMP_40_ENABLED
                                  proc_bind,
@@ -2015,7 +2024,7 @@
       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
       team = __kmp_allocate_team(root, nthreads, nthreads,
 #if OMPT_SUPPORT
-                                 ompt_parallel_id,
+                                 ompt_parallel_data,
 #endif
 #if OMP_40_ENABLED
                                  proc_bind,
@@ -2033,7 +2042,8 @@
     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
 #if OMPT_SUPPORT
-    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
+    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
+                          return_address);
 #endif
     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
 // TODO: parent_team->t.t_level == INT_MAX ???
@@ -2167,7 +2177,7 @@
                          &master_th->th.th_current_task->td_icvs, loc);
 
 #if OMPT_SUPPORT
-    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+    master_th->th.ompt_thread_info.state = omp_state_work_parallel;
 #endif
 
     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
@@ -2251,8 +2261,8 @@
   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+  if (ompt_enabled.enabled) {
+    master_th->th.ompt_thread_info.state = omp_state_overhead;
   }
 #endif
 
@@ -2264,17 +2274,18 @@
                                             kmp_team_t *team) {
   // restore state outside the region
   thread->th.ompt_thread_info.state =
-      ((team->t.t_serialized) ? ompt_state_work_serial
-                              : ompt_state_work_parallel);
+      ((team->t.t_serialized) ? omp_state_work_serial
+                              : omp_state_work_parallel);
 }
 
-static inline void __kmp_join_ompt(kmp_info_t *thread, kmp_team_t *team,
-                                   ompt_parallel_id_t parallel_id,
-                                   fork_context_e fork_context) {
-  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
-  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
-    ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
-        parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
+static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
+                                   kmp_team_t *team, ompt_data_t *parallel_data,
+                                   fork_context_e fork_context, void *codeptr) {
+  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+  if (ompt_enabled.ompt_callback_parallel_end) {
+    ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+        parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
+        codeptr);
   }
 
   task_info->frame.reenter_runtime_frame = NULL;
@@ -2311,8 +2322,8 @@
   master_th->th.th_ident = loc;
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+  if (ompt_enabled.enabled) {
+    master_th->th.ompt_thread_info.state = omp_state_overhead;
   }
 #endif
 
@@ -2349,7 +2360,7 @@
     __kmpc_end_serialized_parallel(loc, gtid);
 
 #if OMPT_SUPPORT
-    if (ompt_enabled) {
+    if (ompt_enabled.enabled) {
       __kmp_join_restore_state(master_th, parent_team);
     }
 #endif
@@ -2377,7 +2388,8 @@
   KMP_MB();
 
 #if OMPT_SUPPORT
-  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
+  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
+  void *codeptr = team->t.ompt_team_info.master_return_address;
 #endif
 
 #if USE_ITT_BUILD
@@ -2449,8 +2461,9 @@
     }
 
 #if OMPT_SUPPORT
-    if (ompt_enabled) {
-      __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
+    if (ompt_enabled.enabled) {
+      __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+                      codeptr);
     }
 #endif
 
@@ -2479,15 +2492,18 @@
   }
   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled) {
-    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
-    if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
-      ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
-          parallel_id, task_info->task_id);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      int ompt_team_size = team->t.t_nproc;
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
+          __kmp_tid_from_gtid(gtid));
     }
+
     task_info->frame.exit_runtime_frame = NULL;
-    task_info->task_id = 0;
+    task_info->task_data = ompt_data_none;
   }
 #endif
 
@@ -2558,8 +2574,9 @@
   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-    __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
+  if (ompt_enabled.enabled) {
+    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+                    codeptr);
   }
 #endif
 
@@ -3154,7 +3171,7 @@
                           1, // new_nproc
                           1, // max_nproc
 #if OMPT_SUPPORT
-                          0, // root parallel id
+                          ompt_data_none, // root parallel id
 #endif
 #if OMP_40_ENABLED
                           __kmp_nested_proc_bind.bind_types[0],
@@ -3195,7 +3212,7 @@
                           1, // new_nproc
                           __kmp_dflt_team_nth_ub * 2, // max_nproc
 #if OMPT_SUPPORT
-                          0, // root parallel id
+                          ompt_data_none, // root parallel id
 #endif
 #if OMP_40_ENABLED
                           __kmp_nested_proc_bind.bind_types[0],
@@ -3734,6 +3751,9 @@
       __kmp_print_thread_storage_map(root_thread, gtid);
     }
     root_thread->th.th_info.ds.ds_gtid = gtid;
+#if OMPT_SUPPORT
+    root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
+#endif
     root_thread->th.th_root = root;
     if (__kmp_env_consistency_check) {
       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
@@ -3756,7 +3776,7 @@
     root_thread->th.th_serial_team =
         __kmp_allocate_team(root, 1, 1,
 #if OMPT_SUPPORT
-                            0, // root parallel id
+                            ompt_data_none, // root parallel id
 #endif
 #if OMP_40_ENABLED
                             proc_bind_default,
@@ -3826,6 +3846,29 @@
 
   __kmp_root_counter++;
 
+#if OMPT_SUPPORT
+  if (!initial_thread && ompt_enabled.enabled) {
+
+    ompt_thread_t *root_thread = ompt_get_thread();
+
+    ompt_set_thread_state(root_thread, omp_state_overhead);
+
+    if (ompt_enabled.ompt_callback_thread_begin) {
+      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
+          ompt_thread_initial, __ompt_get_thread_data_internal());
+    }
+    ompt_data_t *task_data;
+    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          NULL, NULL, task_data, ompt_task_initial, 0, NULL);
+      // initial task has nothing to return to
+    }
+
+    ompt_set_thread_state(root_thread, omp_state_work_serial);
+  }
+#endif
+
   KMP_MB();
   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
 
@@ -3909,9 +3952,9 @@
 #endif /* KMP_OS_WINDOWS */
 
 #if OMPT_SUPPORT
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
-    int gtid = __kmp_get_gtid();
-    __ompt_thread_end(ompt_thread_initial, gtid);
+  if (ompt_enabled.ompt_callback_thread_end) {
+    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
+        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
   }
 #endif
 
@@ -3961,7 +4004,7 @@
   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
 #if OMPT_SUPPORT
     // the runtime is shutting down so we won't report any events
-    thread->th.ompt_thread_info.state = ompt_state_undefined;
+    thread->th.ompt_thread_info.state = omp_state_undefined;
 #endif
     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
   }
@@ -4282,7 +4325,7 @@
     new_thr->th.th_serial_team = serial_team =
         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
 #if OMPT_SUPPORT
-                                          0, // root parallel id
+                                          ompt_data_none, // root parallel id
 #endif
 #if OMP_40_ENABLED
                                           proc_bind_default,
@@ -4813,7 +4856,7 @@
 kmp_team_t *
 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 #if OMPT_SUPPORT
-                    ompt_parallel_id_t ompt_parallel_id,
+                    ompt_data_t ompt_parallel_data,
 #endif
 #if OMP_40_ENABLED
                     kmp_proc_bind_t new_proc_bind,
@@ -5180,7 +5223,7 @@
 #endif
 
 #if OMPT_SUPPORT
-    __ompt_team_assign_id(team, ompt_parallel_id);
+    __ompt_team_assign_id(team, ompt_parallel_data);
 #endif
 
     KMP_MB();
@@ -5232,7 +5275,7 @@
                     team->t.t_id));
 
 #if OMPT_SUPPORT
-      __ompt_team_assign_id(team, ompt_parallel_id);
+      __ompt_team_assign_id(team, ompt_parallel_data);
 #endif
 
       KMP_MB();
@@ -5296,7 +5339,7 @@
 #endif
 
 #if OMPT_SUPPORT
-  __ompt_team_assign_id(team, ompt_parallel_id);
+  __ompt_team_assign_id(team, ompt_parallel_data);
   team->t.ompt_serialized_team_info = NULL;
 #endif
 
@@ -5563,16 +5606,26 @@
   }
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
-    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+  ompt_data_t *thread_data;
+  if (ompt_enabled.enabled) {
+    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
+    thread_data->ptr = NULL;
+
+    this_thr->th.ompt_thread_info.state = omp_state_overhead;
     this_thr->th.ompt_thread_info.wait_id = 0;
-    this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
-    if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
-      __ompt_thread_begin(ompt_thread_worker, gtid);
+    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
+    if (ompt_enabled.ompt_callback_thread_begin) {
+      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
+          ompt_thread_worker, thread_data);
     }
   }
 #endif
 
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    this_thr->th.ompt_thread_info.state = omp_state_idle;
+  }
+#endif
   /* This is the place where threads wait for work */
   while (!TCR_4(__kmp_global.g.g_done)) {
     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
@@ -5581,18 +5634,12 @@
     /* wait for work to do */
     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
 
-#if OMPT_SUPPORT
-    if (ompt_enabled) {
-      this_thr->th.ompt_thread_info.state = ompt_state_idle;
-    }
-#endif
-
     /* No tid yet since not part of a team */
     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
 
 #if OMPT_SUPPORT
-    if (ompt_enabled) {
-      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    if (ompt_enabled.enabled) {
+      this_thr->th.ompt_thread_info.state = omp_state_overhead;
     }
 #endif
 
@@ -5600,14 +5647,6 @@
 
     /* have we been allocated? */
     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
-#if OMPT_SUPPORT
-      ompt_task_info_t *task_info;
-      ompt_parallel_id_t my_parallel_id;
-      if (ompt_enabled) {
-        task_info = __ompt_get_taskinfo(0);
-        my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
-      }
-#endif
       /* we were just woken up, so run our new task */
       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
         int rc;
@@ -5619,11 +5658,8 @@
         updateHWFPControl(*pteam);
 
 #if OMPT_SUPPORT
-        if (ompt_enabled) {
-          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
-          // Initialize OMPT task id for implicit task.
-          int tid = __kmp_tid_from_gtid(gtid);
-          task_info->task_id = __ompt_task_id_new(tid);
+        if (ompt_enabled.enabled) {
+          this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
         }
 #endif
 
@@ -5634,40 +5670,29 @@
         }
         KMP_ASSERT(rc);
 
-#if OMPT_SUPPORT
-        if (ompt_enabled) {
-          /* no frame set while outside task */
-          task_info->frame.exit_runtime_frame = NULL;
-
-          this_thr->th.ompt_thread_info.state = ompt_state_overhead;
-        }
-#endif
         KMP_MB();
         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                       (*pteam)->t.t_pkfn));
       }
-      /* join barrier after parallel region */
-      __kmp_join_barrier(gtid);
-#if OMPT_SUPPORT && OMPT_TRACE
-      if (ompt_enabled) {
-        if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
-          // don't access *pteam here: it may have already been freed
-          // by the master thread behind the barrier (possible race)
-          ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
-              my_parallel_id, task_info->task_id);
-        }
-        task_info->frame.exit_runtime_frame = NULL;
-        task_info->task_id = 0;
+#if OMPT_SUPPORT
+      if (ompt_enabled.enabled) {
+        /* no frame set while outside task */
+        __ompt_get_task_info_object(0)->frame.exit_runtime_frame = NULL;
+
+        this_thr->th.ompt_thread_info.state = omp_state_overhead;
+        this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
       }
 #endif
+      /* join barrier after parallel region */
+      __kmp_join_barrier(gtid);
     }
   }
   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
 
 #if OMPT_SUPPORT
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
-    __ompt_thread_end(ompt_thread_worker, gtid);
+  if (ompt_enabled.ompt_callback_thread_end) {
+    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
   }
 #endif
 
@@ -6925,26 +6950,27 @@
 #if OMPT_SUPPORT
   void *dummy;
   void **exit_runtime_p;
-  ompt_task_id_t my_task_id;
-  ompt_parallel_id_t my_parallel_id;
+  ompt_data_t *my_task_data;
+  ompt_data_t *my_parallel_data;
+  int ompt_team_size;
 
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid]
                            .ompt_task_info.frame.exit_runtime_frame);
   } else {
     exit_runtime_p = &dummy;
   }
 
-#if OMPT_TRACE
-  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
-  my_parallel_id = team->t.ompt_team_info.parallel_id;
-  if (ompt_enabled &&
-      ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
-    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(my_parallel_id,
-                                                                 my_task_id);
+  my_task_data =
+      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
+  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
+  if (ompt_enabled.ompt_callback_implicit_task) {
+    ompt_team_size = team->t.t_nproc;
+    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
+        __kmp_tid_from_gtid(gtid));
   }
 #endif
-#endif
 
   {
     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
@@ -6991,9 +7017,6 @@
   SSC_MARK_FORKING();
 #endif
   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
-#if OMPT_SUPPORT
-                  (void *)thr->th.th_teams_microtask, // "unwrapped" task
-#endif
                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
 #if INCLUDE_SSC_MARKS
@@ -7170,6 +7193,36 @@
 #endif /* KMP_DEBUG */
 
   __kmp_join_barrier(gtid); /* wait for everyone */
+#if OMPT_SUPPORT
+  int ds_tid = this_thr->th.th_info.ds.ds_tid;
+  if (this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
+    ompt_data_t *tId = OMPT_CUR_TASK_DATA(this_thr);
+    ompt_data_t *pId = OMPT_CUR_TEAM_DATA(this_thr);
+    this_thr->th.ompt_thread_info.state = omp_state_overhead;
+#if OMPT_OPTIONAL
+    void *codeptr = NULL;
+    if (KMP_MASTER_TID(ds_tid) &&
+        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
+         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
+      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
+
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
+    }
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
+    }
+#endif
+    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_end, NULL, tId, 0, ds_tid);
+    }
+    // restore the overhead state before resuming runtime work
+    this_thr->th.ompt_thread_info.state = omp_state_overhead;
+  }
+#endif
 
   KMP_MB(); /* Flush all pending memory write invalidates.  */
   KMP_ASSERT(this_thr->th.th_team == team);
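
Across kmp_runtime.cpp the fork/join path now carries ompt_data_t handles plus a captured return address and raises ompt_callback_parallel_begin, ompt_callback_parallel_end and ompt_callback_implicit_task, while the thread setup and teardown paths raise ompt_callback_thread_begin / ompt_callback_thread_end on a per-thread ompt_data_t. The sketch below shows tool-side callback bodies shaped after those call sites; tool_data_t is a local stand-in that mirrors how ompt_data_t is used here (.value / .ptr), and the enum parameters are written as plain int, so treat the prototypes as illustrative rather than as the exact ompt.h typedefs.

/* Tool-side sketch matching the argument lists at the call sites above.
 * Stand-in types only; real prototypes come from this runtime's ompt.h. */
#include <stdint.h>
#include <stdio.h>

typedef union { uint64_t value; void *ptr; } tool_data_t;

void on_parallel_begin(tool_data_t *parent_task, void *parent_frame,
                       tool_data_t *parallel, int team_size, int invoker,
                       void *codeptr) {
  parallel->value = 42; /* tool-owned payload, visible again at parallel end */
  printf("parallel begin: team_size=%d codeptr=%p\n", team_size, codeptr);
}

void on_parallel_end(tool_data_t *parallel, tool_data_t *task, int invoker,
                     void *codeptr) {
  printf("parallel end: payload=%llu\n", (unsigned long long)parallel->value);
}

void on_implicit_task(int endpoint, tool_data_t *parallel, tool_data_t *task,
                      int team_size, int thread_num) {
  /* endpoint is ompt_scope_begin or ompt_scope_end at the call sites above */
  printf("implicit task endpoint=%d thread=%d of %d\n", endpoint, thread_num,
         team_size);
}
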
Index: runtime/src/kmp_sched.cpp
===================================================================
--- runtime/src/kmp_sched.cpp
+++ runtime/src/kmp_sched.cpp
@@ -44,7 +44,12 @@
                                   T *plower, T *pupper,
                                   typename traits_t<T>::signed_t *pstride,
                                   typename traits_t<T>::signed_t incr,
-                                  typename traits_t<T>::signed_t chunk) {
+                                  typename traits_t<T>::signed_t chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                  ,
+                                  void *codeptr
+#endif
+                                  ) {
   KMP_COUNT_BLOCK(OMP_FOR_static);
   KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling);
 
@@ -58,14 +63,29 @@
   kmp_team_t *team;
   kmp_info_t *th = __kmp_threads[gtid];
 
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT && OMPT_OPTIONAL
   ompt_team_info_t *team_info = NULL;
   ompt_task_info_t *task_info = NULL;
+  ompt_work_type_t ompt_work_type;
 
-  if (ompt_enabled) {
+  if (ompt_enabled.enabled) {
     // Only fully initialize variables needed by OMPT if OMPT is enabled.
     team_info = __ompt_get_teaminfo(0, NULL);
-    task_info = __ompt_get_taskinfo(0);
+    task_info = __ompt_get_task_info_object(0);
+    // Determine workshare type
+    if (loc != NULL) {
+      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
+        ompt_work_type = ompt_work_loop;
+      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
+        ompt_work_type = ompt_work_sections;
+      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
+        ompt_work_type = ompt_work_distribute;
+      } else {
+        KMP_ASSERT2(0,
+                    "__kmpc_for_static_init: can't determine workshare type");
+      }
+      KMP_DEBUG_ASSERT(ompt_work_type);
+    }
   }
 #endif
 
@@ -119,10 +139,11 @@
 #endif
     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
 
-#if OMPT_SUPPORT && OMPT_TRACE
-    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
-      ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
-          team_info->parallel_id, task_info->task_id, team_info->microtask);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_work) {
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), 0, codeptr);
     }
 #endif
     KMP_COUNT_VALUE(FOR_static_iterations, 0);
@@ -170,10 +191,11 @@
 #endif
     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
 
-#if OMPT_SUPPORT && OMPT_TRACE
-    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
-      ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
-          team_info->parallel_id, task_info->task_id, team_info->microtask);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_work) {
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), *pstride, codeptr);
     }
 #endif
     return;
@@ -198,10 +220,11 @@
 #endif
     KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
 
-#if OMPT_SUPPORT && OMPT_TRACE
-    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
-      ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
-          team_info->parallel_id, task_info->task_id, team_info->microtask);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_work) {
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), *pstride, codeptr);
     }
 #endif
     return;
@@ -354,10 +377,11 @@
 #endif
   KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
 
-#if OMPT_SUPPORT && OMPT_TRACE
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
-    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
-        team_info->parallel_id, task_info->task_id, team_info->microtask);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+        &(task_info->task_data), trip_count, codeptr);
   }
 #endif
 
@@ -745,7 +769,12 @@
                               kmp_int32 *pupper, kmp_int32 *pstride,
                               kmp_int32 incr, kmp_int32 chunk) {
   __kmp_for_static_init<kmp_int32>(loc, gtid, schedtype, plastiter, plower,
-                                   pupper, pstride, incr, chunk);
+                                   pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                   ,
+                                   OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                       );
 }
 
 /*!
@@ -757,7 +786,12 @@
                                kmp_int32 *pstride, kmp_int32 incr,
                                kmp_int32 chunk) {
   __kmp_for_static_init<kmp_uint32>(loc, gtid, schedtype, plastiter, plower,
-                                    pupper, pstride, incr, chunk);
+                                    pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                    ,
+                                    OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                        );
 }
 
 /*!
@@ -768,7 +802,12 @@
                               kmp_int64 *pupper, kmp_int64 *pstride,
                               kmp_int64 incr, kmp_int64 chunk) {
   __kmp_for_static_init<kmp_int64>(loc, gtid, schedtype, plastiter, plower,
-                                   pupper, pstride, incr, chunk);
+                                   pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                   ,
+                                   OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                       );
 }
 
 /*!
@@ -780,7 +819,12 @@
                                kmp_int64 *pstride, kmp_int64 incr,
                                kmp_int64 chunk) {
   __kmp_for_static_init<kmp_uint64>(loc, gtid, schedtype, plastiter, plower,
-                                    pupper, pstride, incr, chunk);
+                                    pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                    ,
+                                    OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                        );
 }
 /*!
 @}
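
__kmp_for_static_init is now templated over an extra codeptr argument (the return address captured in the __kmpc_for_static_init_4/4u/8/8u wrappers) and classifies the construct from loc->flags (loop, sections or distribute) before invoking ompt_callback_work. The sketch below is a tool-side callback shaped after that six-argument call site; the count argument is whatever the runtime passes on the taken path above (trip count, stride, or 0 on the early returns), and the types are local stand-ins, not the real ompt.h enums.

/* Tool-side "work" callback sketch: work type, scope endpoint, parallel data,
 * task data, a count, and the codeptr from the wrappers above. */
#include <stdint.h>
#include <stdio.h>

typedef union { uint64_t value; void *ptr; } tool_data_t;

void on_work(int work_type, int endpoint, tool_data_t *parallel,
             tool_data_t *task, uint64_t count, void *codeptr) {
  /* a profiler might attribute `count` iterations to the source location
     behind `codeptr` when endpoint marks the begin of the construct */
  printf("work type=%d endpoint=%d count=%llu at %p\n", work_type, endpoint,
         (unsigned long long)count, codeptr);
}
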
Index: runtime/src/kmp_settings.cpp
===================================================================
--- runtime/src/kmp_settings.cpp
+++ runtime/src/kmp_settings.cpp
@@ -334,13 +334,11 @@
   }
 } // __kmp_stg_parse_size
 
-#if KMP_AFFINITY_SUPPORTED
 static void __kmp_stg_parse_str(char const *name, char const *value,
                                 char const **out) {
   __kmp_str_free(out);
   *out = __kmp_str_format("%s", value);
 } // __kmp_stg_parse_str
-#endif
 
 static void __kmp_stg_parse_int(
     char const
@@ -4354,7 +4352,29 @@
 
 #endif
 
-// -----------------------------------------------------------------------------
+#if OMP_50_ENABLED && OMPT_SUPPORT
+
+static void __kmp_stg_parse_omp_tool_libraries(char const *name,
+                                               char const *value, void *data) {
+  __kmp_stg_parse_str(name, value, &__kmp_tool_libraries);
+} // __kmp_stg_parse_omp_tool_libraries
+
+static void __kmp_stg_print_omp_tool_libraries(kmp_str_buf_t *buffer,
+                                               char const *name, void *data) {
+  if (__kmp_tool_libraries)
+    __kmp_stg_print_str(buffer, name, __kmp_tool_libraries);
+  else {
+    if (__kmp_env_format) {
+      KMP_STR_BUF_PRINT_NAME;
+    } else {
+      __kmp_str_buf_print(buffer, "   %s", name);
+    }
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+  }
+} // __kmp_stg_print_omp_tool_libraries
+
+#endif
+
 // Table.
 
 static kmp_setting_t __kmp_stg_table[] = {
@@ -4598,6 +4618,12 @@
     {"OMP_CANCELLATION", __kmp_stg_parse_omp_cancellation,
      __kmp_stg_print_omp_cancellation, NULL, 0, 0},
 #endif
+
+#if OMP_50_ENABLED && OMPT_SUPPORT
+    {"OMP_TOOL_LIBRARIES", __kmp_stg_parse_omp_tool_libraries,
+     __kmp_stg_print_omp_tool_libraries, NULL, 0, 0},
+#endif
+
     {"", NULL, NULL, NULL, 0, 0}}; // settings
 
 static int const __kmp_stg_count =
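
OMP_TOOL_LIBRARIES, registered in the settings table above, names shared libraries in which the runtime can look for the ompt_start_tool entry point referenced in the tasking comments further down. Below is a skeleton of such a tool library; it assumes TR4-style declarations (an ompt_fns_t holding initialize/finalize pointers, a lookup string "ompt_set_callback", and a header named ompt.h), so every type and symbol name here should be checked against this tree's headers rather than taken as definitive.

/* Tool-library skeleton (assumptions: TR4-era ompt.h with ompt_fns_t,
 * ompt_function_lookup_t and ompt_set_callback_t; verify the names).
 * Build as a shared object and name it in OMP_TOOL_LIBRARIES, e.g.
 *   OMP_TOOL_LIBRARIES=./libmytool.so ./an_openmp_app
 */
#include <stdio.h>
#include <ompt.h>

static int my_initialize(ompt_function_lookup_t lookup, ompt_fns_t *fns) {
  (void)fns;
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  if (!set_callback)
    return 0; /* refuse to initialize if the runtime entry point is missing */
  /* register callbacks here, e.g.
     set_callback(ompt_callback_parallel_begin, (ompt_callback_t)&...); */
  printf("tool initialized\n");
  return 1; /* nonzero: keep the tool active (assumed convention) */
}

static void my_finalize(ompt_fns_t *fns) {
  (void)fns;
  printf("tool finalized\n");
}

/* The runtime resolves this symbol in the application and in each library
 * named in OMP_TOOL_LIBRARIES. */
ompt_fns_t *ompt_start_tool(unsigned int omp_version,
                            const char *runtime_version) {
  (void)omp_version;
  (void)runtime_version;
  static ompt_fns_t fns = {&my_initialize, &my_finalize};
  return &fns;
}
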
Index: runtime/src/kmp_taskdeps.cpp
===================================================================
--- runtime/src/kmp_taskdeps.cpp
+++ runtime/src/kmp_taskdeps.cpp
@@ -16,6 +16,9 @@
 #include "kmp.h"
 #include "kmp_io.h"
 #include "kmp_wait_release.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
 
 #if OMP_40_ENABLED
 
@@ -217,18 +220,19 @@
                task_source->td_ident->psource, sink->dn.id,
                task_sink->td_ident->psource);
 #endif
-#if OMPT_SUPPORT && OMPT_TRACE
-  // OMPT tracks dependences between task (a=source, b=sink) in which
-  // task a blocks the execution of b through the ompt_new_dependence_callback
-  if (ompt_enabled &&
-      ompt_callbacks.ompt_callback(ompt_event_task_dependence_pair)) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  /* OMPT tracks dependences between tasks (a=source, b=sink) in which
+     task a blocks the execution of b through ompt_callback_task_dependence */
+  if (ompt_enabled.ompt_callback_task_dependence) {
     kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
     kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
 
-    ompt_callbacks.ompt_callback(ompt_event_task_dependence_pair)(
-        task_source->ompt_task_info.task_id, task_sink->ompt_task_info.task_id);
+    ompt_callbacks.ompt_callback(ompt_callback_task_dependence)(
+        &(task_source->ompt_task_info.task_data),
+        &(task_sink->ompt_task_info.task_data));
   }
-#endif /* OMPT_SUPPORT && OMPT_TRACE */
+#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
 }
 
 template <bool filter>
@@ -470,10 +474,29 @@
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *current_task = thread->th.th_current_task;
 
-#if OMPT_SUPPORT && OMPT_TRACE
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+
+  if (ompt_enabled.enabled) {
+    if (ompt_enabled.ompt_callback_task_create) {
+      kmp_taskdata_t *parent = new_taskdata->td_parent;
+      ompt_data_t task_data = ompt_data_none;
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          parent ? &(parent->ompt_task_info.task_data) : &task_data,
+          parent ? &(parent->ompt_task_info.frame) : NULL,
+          &(new_taskdata->ompt_task_info.task_data),
+          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
+          OMPT_LOAD_RETURN_ADDRESS(gtid));
+    }
+
+    new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
+        OMPT_GET_FRAME_ADDRESS(0);
+  }
+
+#if OMPT_OPTIONAL
   /* OMPT grab all dependences if requested by the tool */
-  if (ompt_enabled && ndeps + ndeps_noalias > 0 &&
-      ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
+  if (ndeps + ndeps_noalias > 0 &&
+      ompt_enabled.ompt_callback_task_dependences) {
     kmp_int32 i;
 
     new_taskdata->ompt_task_info.ndeps = ndeps + ndeps_noalias;
@@ -509,8 +532,17 @@
         new_taskdata->ompt_task_info.deps[ndeps + i].dependence_flags =
             ompt_task_dependence_type_in;
     }
+    ompt_callbacks.ompt_callback(ompt_callback_task_dependences)(
+        &(new_taskdata->ompt_task_info.task_data),
+        new_taskdata->ompt_task_info.deps, new_taskdata->ompt_task_info.ndeps);
+    /* We can now free the allocated memory for the dependencies */
+    /* For OMPD we might want to delay the free until task_end */
+    KMP_OMPT_DEPS_FREE(thread, new_taskdata->ompt_task_info.deps);
+    new_taskdata->ompt_task_info.deps = NULL;
+    new_taskdata->ompt_task_info.ndeps = 0;
   }
-#endif /* OMPT_SUPPORT && OMPT_TRACE */
+#endif /* OMPT_OPTIONAL */
+#endif /* OMPT_SUPPORT */
 
   bool serial = current_task->td_flags.team_serial ||
                 current_task->td_flags.tasking_ser ||
@@ -557,7 +589,7 @@
                 "loc=%p task=%p, transferring to __kmpc_omp_task\n",
                 gtid, loc_ref, new_taskdata));
 
-  return __kmpc_omp_task(loc_ref, gtid, new_task);
+  return __kmp_omp_task(gtid, new_task, true);
 }
 
 /*!
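
__kmpc_omp_task_with_deps now announces the new task through ompt_callback_task_create (parent task data and frame, the new task's data, a task-type value carrying the dependence details, a flag that is 1 here and 0 on the dependence-free paths, and the stored return address), then hands the collected dependence records to ompt_callback_task_dependences before freeing them, and reports each pairwise ordering through ompt_callback_task_dependence. A tool-side sketch of the two dependence callbacks follows, with local stand-in types in place of the real ompt.h declarations.

/* Tool-side dependence callbacks shaped after the call sites above.
 * tool_dep_t is a stand-in record; only dependence_flags is a field name
 * visible in this diff. */
#include <stdint.h>
#include <stdio.h>

typedef union { uint64_t value; void *ptr; } tool_data_t;
typedef struct { void *variable_addr; int dependence_flags; } tool_dep_t;

void on_task_dependences(tool_data_t *task, const tool_dep_t *deps, int ndeps) {
  for (int i = 0; i < ndeps; ++i)
    printf("task %llu depends on %p (flags %d)\n",
           (unsigned long long)task->value, deps[i].variable_addr,
           deps[i].dependence_flags);
}

void on_task_dependence(tool_data_t *src_task, tool_data_t *sink_task) {
  printf("task %llu blocks task %llu\n", (unsigned long long)src_task->value,
         (unsigned long long)sink_task->value);
}
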
Index: runtime/src/kmp_tasking.cpp
===================================================================
--- runtime/src/kmp_tasking.cpp
+++ runtime/src/kmp_tasking.cpp
@@ -446,40 +446,78 @@
 
   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
 
+  return;
+}
+
 #if OMPT_SUPPORT
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
-    kmp_taskdata_t *parent = taskdata->td_parent;
-    ompt_callbacks.ompt_callback(ompt_event_task_begin)(
-        parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
-        parent ? &(parent->ompt_task_info.frame) : NULL,
-        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
-  }
-#endif
-#if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
-  /* OMPT emit all dependences if requested by the tool */
-  if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
-      ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
-    ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
-        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
-        taskdata->ompt_task_info.ndeps);
-    /* We can now free the allocated memory for the dependencies */
-    KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
-    taskdata->ompt_task_info.deps = NULL;
-    taskdata->ompt_task_info.ndeps = 0;
-  }
-#endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
+//------------------------------------------------------------------------------
+// __ompt_task_init:
+//   Initialize OMPT fields maintained by a task. This will only be called after
+//   ompt_start_tool, so we already know whether ompt is enabled or not.
+
+static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
+  // Callers of __ompt_task_init already check the ompt_enabled condition.
+  task->ompt_task_info.task_data.value = 0;
+  task->ompt_task_info.frame.exit_runtime_frame = NULL;
+  task->ompt_task_info.frame.reenter_runtime_frame = NULL;
+#if OMP_40_ENABLED
+  task->ompt_task_info.ndeps = 0;
+  task->ompt_task_info.deps = NULL;
+#endif /* OMP_40_ENABLED */
+}
 
-  return;
+// __ompt_task_start:
+//   Build and trigger task-begin event
+static inline void __ompt_task_start(kmp_task_t *task,
+                                     kmp_taskdata_t *current_task,
+                                     kmp_int32 gtid) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  ompt_task_status_t status = ompt_task_others;
+  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
+    status = ompt_task_yield;
+    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
+  }
+  /* let OMPT know that we're about to run this task */
+  if (ompt_enabled.ompt_callback_task_schedule) {
+    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
+        &(current_task->ompt_task_info.task_data), status,
+        &(taskdata->ompt_task_info.task_data));
+  }
+  taskdata->ompt_task_info.scheduling_parent = current_task;
 }
 
-// __kmpc_omp_task_begin_if0: report that a given serialized task has started
-// execution
-//
-// loc_ref: source location information; points to beginning of task block.
-// gtid: global thread number.
-// task: task thunk for the started task.
-void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
-                               kmp_task_t *task) {
+// __ompt_task_finish:
+//   Build and trigger final task-schedule event
+static inline void __ompt_task_finish(kmp_task_t *task,
+                                      kmp_taskdata_t *resumed_task) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  ompt_task_status_t status = ompt_task_complete;
+  if (taskdata->td_flags.tiedness == TASK_UNTIED &&
+      KMP_TEST_THEN_ADD32(&(taskdata->td_untied_count), 0) > 1)
+    status = ompt_task_others;
+  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
+      taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
+    status = ompt_task_cancel;
+  }
+
+  /* let OMPT know that we're switching back to the task that resumes */
+  if (ompt_enabled.ompt_callback_task_schedule) {
+    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
+        &(taskdata->ompt_task_info.task_data), status,
+        &((resumed_task ? resumed_task
+                        : (taskdata->ompt_task_info.scheduling_parent
+                               ? taskdata->ompt_task_info.scheduling_parent
+                               : taskdata->td_parent))
+              ->ompt_task_info.task_data));
+  }
+}
+#endif
+
+template <bool ompt>
+static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
+                                               kmp_task_t *task,
+                                               void *frame_address,
+                                               void *return_address) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
 
@@ -500,10 +538,57 @@
       1; // Execute this task immediately, not deferred.
   __kmp_task_start(gtid, task, current_task);
 
+#if OMPT_SUPPORT
+  if (ompt) {
+    if (current_task->ompt_task_info.frame.reenter_runtime_frame == NULL) {
+      current_task->ompt_task_info.frame.reenter_runtime_frame =
+          taskdata->ompt_task_info.frame.exit_runtime_frame = frame_address;
+    }
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          &(parent_info->task_data), &(parent_info->frame),
+          &(taskdata->ompt_task_info.task_data),
+          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
+          return_address);
+    }
+    __ompt_task_start(task, current_task, gtid);
+  }
+#endif // OMPT_SUPPORT
+
   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                 loc_ref, taskdata));
+}
 
-  return;
+#if OMPT_SUPPORT
+OMPT_NOINLINE
+static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
+                                           kmp_task_t *task,
+                                           void *frame_address,
+                                           void *return_address) {
+  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
+                                           return_address);
+}
+#endif // OMPT_SUPPORT
+
+// __kmpc_omp_task_begin_if0: report that a given serialized task has started
+// execution
+//
+// loc_ref: source location information; points to beginning of task block.
+// gtid: global thread number.
+// task: task thunk for the started task.
+void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
+                               kmp_task_t *task) {
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
+                                   OMPT_GET_FRAME_ADDRESS(1),
+                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
+    return;
+  }
+#endif
+  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
 }
 
 #ifdef TASK_UNUSED
@@ -623,14 +708,6 @@
       thread->th.th_task_team; // might be NULL for serial teams...
   kmp_int32 children = 0;
 
-#if OMPT_SUPPORT
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
-    kmp_taskdata_t *parent = taskdata->td_parent;
-    ompt_callbacks.ompt_callback(ompt_event_task_end)(
-        taskdata->ompt_task_info.task_id);
-  }
-#endif
-
   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                 "task %p\n",
                 gtid, taskdata, resumed_task));
@@ -760,23 +837,55 @@
   return;
 }
 
-// __kmpc_omp_task_complete_if0: report that a task has completed execution
-//
-// loc_ref: source location information; points to end of task block.
-// gtid: global thread number.
-// task: task thunk for the completed task.
-void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
-                                  kmp_task_t *task) {
+template <bool ompt>
+static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
+                                                  kmp_int32 gtid,
+                                                  kmp_task_t *task) {
   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
+
   // this routine will provide task to resume
   __kmp_task_finish(gtid, task, NULL);
 
   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
+
+#if OMPT_SUPPORT
+  if (ompt) {
+    __ompt_task_finish(task, NULL);
+    ompt_frame_t *ompt_frame;
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->reenter_runtime_frame = NULL;
+  }
+#endif
+
   return;
 }
 
+#if OMPT_SUPPORT
+OMPT_NOINLINE
+void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
+                                       kmp_task_t *task) {
+  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
+}
+#endif // OMPT_SUPPORT
+
+// __kmpc_omp_task_complete_if0: report that a task has completed execution
+//
+// loc_ref: source location information; points to end of task block.
+// gtid: global thread number.
+// task: task thunk for the completed task.
+void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_task_t *task) {
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
+    return;
+  }
+#endif
+  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
+}
+
 #ifdef TASK_UNUSED
 // __kmpc_omp_task_complete: report that a task has completed execution
 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
@@ -793,25 +902,6 @@
 }
 #endif // TASK_UNUSED
 
-#if OMPT_SUPPORT
-// __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will
-//  only be called after ompt_tool, so we already know whether ompt is enabled
-// or not.
-static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid,
-                                        void *function) {
-  if (ompt_enabled) {
-    task->ompt_task_info.task_id = __ompt_task_id_new(tid);
-    task->ompt_task_info.function = function;
-    task->ompt_task_info.frame.exit_runtime_frame = NULL;
-    task->ompt_task_info.frame.reenter_runtime_frame = NULL;
-#if OMP_40_ENABLED
-    task->ompt_task_info.ndeps = 0;
-    task->ompt_task_info.deps = NULL;
-#endif /* OMP_40_ENABLED */
-  }
-}
-#endif
-
 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
 // task for a given thread
 //
@@ -876,7 +966,8 @@
   }
 
 #if OMPT_SUPPORT
-  __kmp_task_init_ompt(task, tid, NULL);
+  if (__builtin_expect(ompt_enabled.enabled, 0))
+    __ompt_task_init(task, tid);
 #endif
 
   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
@@ -1121,7 +1212,8 @@
   ANNOTATE_HAPPENS_BEFORE(task);
 
 #if OMPT_SUPPORT
-  __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
+  if (__builtin_expect(ompt_enabled.enabled, 0))
+    __ompt_task_init(taskdata, gtid);
 #endif
 
   return task;
@@ -1207,7 +1299,7 @@
   if (taskdata->td_flags.proxy != TASK_PROXY) {
 #endif
     ANNOTATE_HAPPENS_AFTER(task);
-    __kmp_task_start(gtid, task, current_task);
+    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
 #if OMP_45_ENABLED
   }
 #endif
@@ -1215,14 +1307,16 @@
 #if OMPT_SUPPORT
   ompt_thread_info_t oldInfo;
   kmp_info_t *thread;
-  if (ompt_enabled) {
+  if (__builtin_expect(ompt_enabled.enabled, 0)) {
     // Store the threads states and restore them after the task
     thread = __kmp_threads[gtid];
     oldInfo = thread->th.ompt_thread_info;
     thread->th.ompt_thread_info.wait_id = 0;
-    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
+                                            ? omp_state_work_serial
+                                            : omp_state_work_parallel;
     taskdata->ompt_task_info.frame.exit_runtime_frame =
-        __builtin_frame_address(0);
+        OMPT_GET_FRAME_ADDRESS(0);
   }
 #endif
 
@@ -1236,6 +1330,18 @@
     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
     if ((taskgroup && taskgroup->cancel_request) ||
         (this_team->t.t_cancel_request == cancel_parallel)) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+      ompt_data_t *task_data;
+      if (__builtin_expect(ompt_enabled.ompt_callback_cancel, 0)) {
+        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
+        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+            task_data,
+            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
+                                                      : ompt_cancel_parallel) |
+                ompt_cancel_discarded_task,
+            NULL);
+      }
+#endif
       KMP_COUNT_BLOCK(TASK_cancelled);
       // this task belongs to a task group and we need to cancel it
       discard = 1 /* true */;
@@ -1270,13 +1376,10 @@
 #endif // KMP_STATS_ENABLED
 #endif // OMP_40_ENABLED
 
-#if OMPT_SUPPORT && OMPT_TRACE
-    /* let OMPT know that we're about to run this task */
-    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
-      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
-          current_task->ompt_task_info.task_id,
-          taskdata->ompt_task_info.task_id);
-    }
+// OMPT task begin
+#if OMPT_SUPPORT
+    if (__builtin_expect(ompt_enabled.enabled, 0))
+      __ompt_task_start(task, current_task, gtid);
 #endif
 
 #ifdef KMP_GOMP_COMPAT
@@ -1289,21 +1392,16 @@
     }
     KMP_POP_PARTITIONED_TIMER();
 
-#if OMPT_SUPPORT && OMPT_TRACE
-    /* let OMPT know that we're returning to the callee task */
-    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
-      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
-          taskdata->ompt_task_info.task_id,
-          current_task->ompt_task_info.task_id);
-    }
+#if OMPT_SUPPORT
+    if (__builtin_expect(ompt_enabled.enabled, 0))
+      __ompt_task_finish(task, current_task);
 #endif
-
 #if OMP_40_ENABLED
   }
 #endif // OMP_40_ENABLED
 
 #if OMPT_SUPPORT
-  if (ompt_enabled) {
+  if (__builtin_expect(ompt_enabled.enabled, 0)) {
     thread->th.ompt_thread_info = oldInfo;
     taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
   }
@@ -1314,7 +1412,7 @@
   if (taskdata->td_flags.proxy != TASK_PROXY) {
 #endif
     ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
-    __kmp_task_finish(gtid, task, current_task);
+    __kmp_task_finish(gtid, task, current_task); // OMPT only if not discarded
 #if OMP_45_ENABLED
   }
 #endif
@@ -1352,6 +1450,21 @@
   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
                 loc_ref, new_taskdata));
 
+#if OMPT_SUPPORT
+  kmp_taskdata_t *parent;
+  if (__builtin_expect(ompt_enabled.enabled, 0)) {
+    parent = new_taskdata->td_parent;
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_data_t task_data = ompt_data_none;
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          parent ? &(parent->ompt_task_info.task_data) : &task_data,
+          parent ? &(parent->ompt_task_info.frame) : NULL,
+          &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
+          OMPT_GET_RETURN_ADDRESS(0));
+    }
+  }
+#endif
+
   /* Should we execute the new task or queue it? For now, let's just always try
      to queue it.  If the queue fills up, then we'll execute it.  */
 
@@ -1369,6 +1482,11 @@
        gtid, loc_ref, new_taskdata));
 
   ANNOTATE_HAPPENS_BEFORE(new_task);
+#if OMPT_SUPPORT
+  if (__builtin_expect(ompt_enabled.enabled, 0)) {
+    parent->ompt_task_info.frame.reenter_runtime_frame = NULL;
+  }
+#endif
   return TASK_CURRENT_NOT_QUEUED;
 }
 
@@ -1387,13 +1505,6 @@
                          bool serialize_immediate) {
   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
 
-#if OMPT_SUPPORT
-  if (ompt_enabled) {
-    new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
-        __builtin_frame_address(1);
-  }
-#endif
-
 /* Should we execute the new task or queue it? For now, let's just always try to
    queue it.  If the queue fills up, then we'll execute it.  */
 #if OMP_45_ENABLED
@@ -1409,12 +1520,6 @@
     __kmp_invoke_task(gtid, new_task, current_task);
   }
 
-#if OMPT_SUPPORT
-  if (ompt_enabled) {
-    new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
-  }
-#endif
-
   ANNOTATE_HAPPENS_BEFORE(new_task);
   return TASK_CURRENT_NOT_QUEUED;
 }
@@ -1436,23 +1541,50 @@
   kmp_int32 res;
   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
 
-#if KMP_DEBUG
+#if KMP_DEBUG || OMPT_SUPPORT
   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
 #endif
   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                 new_taskdata));
 
+#if OMPT_SUPPORT
+  kmp_taskdata_t *parent = NULL;
+  if (__builtin_expect(ompt_enabled.enabled && !new_taskdata->td_flags.started,
+                       0)) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    parent = new_taskdata->td_parent;
+    if (!parent->ompt_task_info.frame.reenter_runtime_frame)
+      parent->ompt_task_info.frame.reenter_runtime_frame =
+          OMPT_GET_FRAME_ADDRESS(1);
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_data_t task_data = ompt_data_none;
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          parent ? &(parent->ompt_task_info.task_data) : &task_data,
+          parent ? &(parent->ompt_task_info.frame) : NULL,
+          &(new_taskdata->ompt_task_info.task_data),
+          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
+          OMPT_LOAD_RETURN_ADDRESS(gtid));
+    }
+  }
+#endif
+
   res = __kmp_omp_task(gtid, new_task, true);
 
   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                 gtid, loc_ref, new_taskdata));
+#if OMPT_SUPPORT
+  if (__builtin_expect(ompt_enabled.enabled && parent != NULL, 0)) {
+    parent->ompt_task_info.frame.reenter_runtime_frame = NULL;
+  }
+#endif
   return res;
 }
 
-// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
-// complete
-kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
+template <bool ompt>
+static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
+                                              void *frame_address,
+                                              void *return_address) {
   kmp_taskdata_t *taskdata;
   kmp_info_t *thread;
   int thread_finished = FALSE;
@@ -1463,23 +1595,30 @@
   if (__kmp_tasking_mode != tskm_immediate_exec) {
     thread = __kmp_threads[gtid];
     taskdata = thread->th.th_current_task;
-#if OMPT_SUPPORT && OMPT_TRACE
-    ompt_task_id_t my_task_id;
-    ompt_parallel_id_t my_parallel_id;
-
-    if (ompt_enabled) {
-      kmp_team_t *team = thread->th.th_team;
-      my_task_id = taskdata->ompt_task_info.task_id;
-      my_parallel_id = team->t.ompt_team_info.parallel_id;
-
-      taskdata->ompt_task_info.frame.reenter_runtime_frame =
-          __builtin_frame_address(1);
-      if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
-        ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id,
-                                                                my_task_id);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    ompt_data_t *my_task_data;
+    ompt_data_t *my_parallel_data;
+
+    if (ompt) {
+      my_task_data = &(taskdata->ompt_task_info.task_data);
+      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
+
+      taskdata->ompt_task_info.frame.reenter_runtime_frame = frame_address;
+
+      if (ompt_enabled.ompt_callback_sync_region) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
+            my_task_data, return_address);
+      }
+
+      if (ompt_enabled.ompt_callback_sync_region_wait) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
+            my_task_data, return_address);
       }
     }
-#endif
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
 
 // Debugger: The taskwait is active. Store location and thread encountered the
 // taskwait.
@@ -1522,15 +1661,22 @@
     // negated.
     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
 
-#if OMPT_SUPPORT && OMPT_TRACE
-    if (ompt_enabled) {
-      if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
-        ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id,
-                                                              my_task_id);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt) {
+      if (ompt_enabled.ompt_callback_sync_region_wait) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
+            my_task_data, return_address);
+      }
+      if (ompt_enabled.ompt_callback_sync_region) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
+            my_task_data, return_address);
       }
       taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
     }
-#endif
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+
     ANNOTATE_HAPPENS_AFTER(taskdata);
   }
 
@@ -1541,6 +1687,29 @@
   return TASK_CURRENT_NOT_QUEUED;
 }
 
+#if OMPT_SUPPORT
+OMPT_NOINLINE
+static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
+                                          void *frame_address,
+                                          void *return_address) {
+  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
+                                            return_address);
+}
+#endif // OMPT_SUPPORT
+
+// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
+// complete
+kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(1),
+                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
+  }
+#endif
+  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
+}
+
 // __kmpc_omp_taskyield: switch to a different task
 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
   kmp_taskdata_t *taskdata;
@@ -1575,10 +1744,18 @@
       kmp_task_team_t *task_team = thread->th.th_task_team;
       if (task_team != NULL) {
         if (KMP_TASKING_ENABLED(task_team)) {
+#if OMPT_SUPPORT
+          if (__builtin_expect(ompt_enabled.enabled, 0))
+            thread->th.ompt_thread_info.ompt_task_yielded = 1;
+#endif
           __kmp_execute_tasks_32(
               thread, gtid, NULL, FALSE,
               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
               __kmp_task_stealing_constraint);
+#if OMPT_SUPPORT
+          if (__builtin_expect(ompt_enabled.enabled, 0))
+            thread->th.ompt_thread_info.ompt_task_yielded = 0;
+#endif
         }
       }
     }
@@ -1809,6 +1986,22 @@
   tg_new->reduce_num_data = 0;
 #endif
   taskdata->td_taskgroup = tg_new;
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (__builtin_expect(ompt_enabled.ompt_callback_sync_region, 0)) {
+    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!codeptr)
+      codeptr = OMPT_GET_RETURN_ADDRESS(0);
+    kmp_team_t *team = thread->th.th_team;
+    ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
+    // FIXME: I think this is wrong for lwt!
+    ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
+
+    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+        ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
+        &(my_task_data), codeptr);
+  }
+#endif
 }
 
 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
@@ -1819,6 +2012,22 @@
   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
   int thread_finished = FALSE;
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_team_t *team;
+  ompt_data_t my_task_data;
+  ompt_data_t my_parallel_data;
+  void *codeptr;
+  if (__builtin_expect(ompt_enabled.enabled, 0)) {
+    team = thread->th.th_team;
+    my_task_data = taskdata->ompt_task_info.task_data;
+    // FIXME: I think this is wrong for lwt!
+    my_parallel_data = team->t.ompt_team_info.parallel_data;
+    codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!codeptr)
+      codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  }
+#endif
+
   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
   KMP_DEBUG_ASSERT(taskgroup != NULL);
   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
@@ -1832,6 +2041,14 @@
       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
 #endif /* USE_ITT_BUILD */
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (__builtin_expect(ompt_enabled.ompt_callback_sync_region_wait, 0)) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
+          &(my_task_data), codeptr);
+    }
+#endif
+
 #if OMP_45_ENABLED
     if (!taskdata->td_flags.team_serial ||
         (thread->th.th_task_team != NULL &&
@@ -1848,6 +2065,14 @@
       }
     }
 
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (__builtin_expect(ompt_enabled.ompt_callback_sync_region_wait, 0)) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
+          &(my_task_data), codeptr);
+    }
+#endif
+
 #if USE_ITT_BUILD
     if (itt_sync_obj != NULL)
       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
@@ -1867,6 +2092,14 @@
   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                 gtid, taskdata));
   ANNOTATE_HAPPENS_AFTER(taskdata);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (__builtin_expect(ompt_enabled.ompt_callback_sync_region, 0)) {
+    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+        ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
+        &(my_task_data), codeptr);
+  }
+#endif
 }
 #endif
 
@@ -3255,8 +3488,8 @@
            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
             thread, taskdata, taskdata->td_parent));
 #if OMPT_SUPPORT
-  __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid,
-                       (void *)task->routine);
+  if (__builtin_expect(ompt_enabled.enabled, 0))
+    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
 #endif
   return task;
 }
@@ -3539,8 +3772,22 @@
                 "grain %llu(%d), dup %p\n",
                 gtid, taskdata, *lb, *ub, st, grainsize, sched, task_dup));
 
-  if (nogroup == 0)
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
+        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+
+  if (nogroup == 0) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
     __kmpc_taskgroup(loc, gtid);
+  }
 
   // =========================================================================
   // calculate loop parameters
@@ -3614,6 +3861,9 @@
   if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
     // always start serial tasks linearly
     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                           grainsize, extras, tc, task_dup);
@@ -3621,18 +3871,35 @@
     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                   "(%lld), grain %llu, extras %llu\n",
                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc, num_tasks_min, task_dup);
   } else {
     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                   "(%lld), grain %llu, extras %llu\n",
                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                           grainsize, extras, tc, task_dup);
   }
 
-  if (nogroup == 0)
+  if (nogroup == 0) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
     __kmpc_end_taskgroup(loc, gtid);
+  }
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
+        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
 }
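
For reference: the tasking changes above dispatch two new callbacks,
ompt_callback_task_schedule with (prior_task_data, status, next_task_data) and
ompt_callback_sync_region with (kind, endpoint, parallel_data, task_data,
codeptr). A minimal tool-side sketch matching those call sites is given below;
the enum/typedef spellings follow the TR4 interface and should be treated as
assumptions rather than as part of this patch.

/* Sketch only: tool-side callbacks matching the runtime call sites above. */
#include <ompt.h> /* assumed header providing the TR4 OMPT declarations */

static void tool_task_schedule(ompt_data_t *prior_task_data,
                               ompt_task_status_t prior_task_status,
                               ompt_data_t *next_task_data) {
  /* status is ompt_task_complete, ompt_task_yield, ompt_task_cancel or
     ompt_task_others, as computed in __ompt_task_start/__ompt_task_finish. */
  (void)prior_task_data;
  (void)prior_task_status;
  (void)next_task_data;
}

static void tool_sync_region(ompt_sync_region_kind_t kind,
                             ompt_scope_endpoint_t endpoint,
                             ompt_data_t *parallel_data,
                             ompt_data_t *task_data, const void *codeptr_ra) {
  /* kind: ompt_sync_region_taskwait / _taskgroup / _barrier;
     endpoint: ompt_scope_begin or ompt_scope_end, mirroring the calls above. */
  (void)kind;
  (void)endpoint;
  (void)parallel_data;
  (void)task_data;
  (void)codeptr_ra;
}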
 
Index: runtime/src/kmp_wait_release.h
===================================================================
--- runtime/src/kmp_wait_release.h
+++ runtime/src/kmp_wait_release.h
@@ -17,6 +17,9 @@
 #include "kmp.h"
 #include "kmp_itt.h"
 #include "kmp_stats.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
 
 /*!
 @defgroup WAIT_RELEASE Wait/Release operations
@@ -85,6 +88,44 @@
   */
 };
 
+#if OMPT_SUPPORT
+static inline void __ompt_implicit_task_end(kmp_info_t *this_thr,
+                                            omp_state_t omp_state,
+                                            ompt_data_t *tId,
+                                            ompt_data_t *pId) {
+  int ds_tid = this_thr->th.th_info.ds.ds_tid;
+  if (omp_state == omp_state_wait_barrier_implicit) {
+    this_thr->th.ompt_thread_info.state = omp_state_overhead;
+#if OMPT_OPTIONAL
+    void *codeptr = NULL;
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier, ompt_scope_end, NULL, tId, codeptr);
+    }
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier, ompt_scope_end, NULL, tId, codeptr);
+    }
+#endif
+    if (!KMP_MASTER_TID(ds_tid)) {
+      if (ompt_enabled.ompt_callback_implicit_task) {
+        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+            ompt_scope_end, NULL, tId, 0, ds_tid);
+      }
+#if OMPT_OPTIONAL
+      if (ompt_enabled.ompt_callback_idle) {
+        ompt_callbacks.ompt_callback(ompt_callback_idle)(ompt_scope_begin);
+      }
+#endif
+      // return to idle state
+      this_thr->th.ompt_thread_info.state = omp_state_idle;
+    } else {
+      this_thr->th.ompt_thread_info.state = omp_state_overhead;
+    }
+  }
+}
+#endif
+
 /* Spin wait loop that first does pause, then yield, then sleep. A thread that
    calls __kmp_wait_*  must make certain that another thread calls __kmp_release
    to wake it back up to prevent deadlocks!  */
@@ -116,30 +157,88 @@
   stats_state_e thread_state = KMP_GET_THREAD_STATE();
 #endif
 
-#if OMPT_SUPPORT && OMPT_BLAME
-  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
-  if (ompt_enabled && ompt_state != ompt_state_undefined) {
-    if (ompt_state == ompt_state_idle) {
-      if (ompt_callbacks.ompt_callback(ompt_event_idle_begin)) {
-        ompt_callbacks.ompt_callback(ompt_event_idle_begin)(th_gtid + 1);
-      }
-    } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)) {
-      KMP_DEBUG_ASSERT(ompt_state == ompt_state_wait_barrier ||
-                       ompt_state == ompt_state_wait_barrier_implicit ||
-                       ompt_state == ompt_state_wait_barrier_explicit);
-
+/* OMPT Behavior:
+   This function is called from:
+
+   __kmp_barrier  (2 times: implicit or explicit barrier in parallel regions)
+       These have join / fork behavior. In these cases, we don't change the
+       state or trigger events in THIS function. Events are triggered in the
+       calling code (__kmp_barrier):
+
+                state := omp_state_overhead
+            barrier-begin
+            barrier-wait-begin
+                state := omp_state_wait_barrier
+          call join-barrier-implementation (finally arrive here)
+          {}
+          call fork-barrier-implementation (finally arrive here)
+          {}
+                state := omp_state_overhead
+            barrier-wait-end
+            barrier-end
+                state := omp_state_work_parallel
+
+
+   __kmp_fork_barrier  (after thread creation, before executing implicit task)
+          call fork-barrier-implementation (finally arrive here)
+          {} // workers arrive here with state = omp_state_idle
+
+
+   __kmp_join_barrier  (implicit barrier at end of parallel region)
+                state := omp_state_barrier_implicit
+            barrier-begin
+            barrier-wait-begin
+          call join-barrier-implementation
+            (finally arrive here with final_spin=FALSE)
+          {
+          }
+   __kmp_fork_barrier  (implicit barrier at end of parallel region)
+          call fork-barrier-implementation
+            (finally arrive here with final_spin=TRUE)
+
+       Worker after the task-team is finished:
+            barrier-wait-end
+            barrier-end
+            implicit-task-end
+            idle-begin
+                state := omp_state_idle
+
+       Before leaving, if state = omp_state_idle:
+            idle-end
+                state := omp_state_overhead
+*/
+#if OMPT_SUPPORT
+  omp_state_t ompt_entry_state;
+  ompt_data_t *pId = NULL;
+  ompt_data_t *tId;
+  if (ompt_enabled.enabled) {
+    ompt_entry_state = this_thr->th.ompt_thread_info.state;
+    if (!final_spin || ompt_entry_state != omp_state_wait_barrier_implicit ||
+        KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
       ompt_lw_taskteam_t *team =
           this_thr->th.th_team->t.ompt_serialized_team_info;
-      ompt_parallel_id_t pId;
-      ompt_task_id_t tId;
       if (team) {
-        pId = team->ompt_team_info.parallel_id;
-        tId = team->ompt_task_info.task_id;
+        pId = &(team->ompt_team_info.parallel_data);
+        tId = &(team->ompt_task_info.task_data);
       } else {
-        pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
-        tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+        pId = OMPT_CUR_TEAM_DATA(this_thr);
+        tId = OMPT_CUR_TASK_DATA(this_thr);
+      }
+    } else {
+      pId = NULL;
+      tId = &(this_thr->th.ompt_thread_info.task_data);
+    }
+#if OMPT_OPTIONAL
+    if (ompt_entry_state == omp_state_idle) {
+      if (ompt_enabled.ompt_callback_idle) {
+        ompt_callbacks.ompt_callback(ompt_callback_idle)(ompt_scope_begin);
       }
-      ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)(pId, tId);
+    } else
+#endif
+        if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec ||
+                           this_thr->th.th_task_team == NULL)) {
+      // implicit task is done. Either no taskqueue, or task-team finished
+      __ompt_implicit_task_end(this_thr, ompt_entry_state, tId, pId);
     }
   }
 #endif
@@ -206,6 +305,11 @@
             this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
         } else {
           KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
+#if OMPT_SUPPORT
+          // task-team is done now; other cases should be caught above
+          if (final_spin && ompt_enabled.enabled)
+            __ompt_implicit_task_end(this_thr, ompt_entry_state, tId, pId);
+#endif
           this_thr->th.th_task_team = NULL;
           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
         }
@@ -293,29 +397,22 @@
     // TODO: If thread is done with work and times out, disband/free
   }
 
-#if OMPT_SUPPORT && OMPT_BLAME
-  if (ompt_enabled && ompt_state != ompt_state_undefined) {
-    if (ompt_state == ompt_state_idle) {
-      if (ompt_callbacks.ompt_callback(ompt_event_idle_end)) {
-        ompt_callbacks.ompt_callback(ompt_event_idle_end)(th_gtid + 1);
-      }
-    } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)) {
-      KMP_DEBUG_ASSERT(ompt_state == ompt_state_wait_barrier ||
-                       ompt_state == ompt_state_wait_barrier_implicit ||
-                       ompt_state == ompt_state_wait_barrier_explicit);
-
-      ompt_lw_taskteam_t *team =
-          this_thr->th.th_team->t.ompt_serialized_team_info;
-      ompt_parallel_id_t pId;
-      ompt_task_id_t tId;
-      if (team) {
-        pId = team->ompt_team_info.parallel_id;
-        tId = team->ompt_task_info.task_id;
-      } else {
-        pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
-        tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+#if OMPT_SUPPORT
+  omp_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state;
+  if (ompt_enabled.enabled && ompt_exit_state != omp_state_undefined) {
+#if OMPT_OPTIONAL
+    if (final_spin) {
+      __ompt_implicit_task_end(this_thr, ompt_exit_state, tId, pId);
+      ompt_exit_state = this_thr->th.ompt_thread_info.state;
+    }
+#endif
+    if (ompt_exit_state == omp_state_idle) {
+#if OMPT_OPTIONAL
+      if (ompt_enabled.ompt_callback_idle) {
+        ompt_callbacks.ompt_callback(ompt_callback_idle)(ompt_scope_end);
       }
-      ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)(pId, tId);
+#endif
+      this_thr->th.ompt_thread_info.state = omp_state_overhead;
     }
   }
 #endif
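
For reference: the wait loop above now brackets the idle state with
ompt_callback_idle(ompt_scope_begin) and ompt_callback_idle(ompt_scope_end), so
a tool can derive per-thread idle time from this pair alone. A rough tool-side
sketch follows; the single-argument prototype matches the call sites above,
while the thread-local storage choice is an assumption.

/* Sketch only: accumulate per-thread idle time from idle begin/end events. */
#include <omp.h>  /* omp_get_wtime */
#include <ompt.h> /* assumed header providing the TR4 OMPT declarations */

static __thread double tool_idle_start; /* GNU/Clang thread-local extension */
static __thread double tool_idle_total;

static void tool_idle(ompt_scope_endpoint_t endpoint) {
  if (endpoint == ompt_scope_begin)
    tool_idle_start = omp_get_wtime();
  else /* ompt_scope_end */
    tool_idle_total += omp_get_wtime() - tool_idle_start;
}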
Index: runtime/src/ompt-event-specific.h
===================================================================
--- runtime/src/ompt-event-specific.h
+++ runtime/src/ompt-event-specific.h
@@ -22,132 +22,84 @@
  | the OMPT TR. They are exposed to tools through ompt_set_callback.
  +--------------------------------------------------------------------------*/
 
-#define ompt_event_NEVER ompt_set_result_event_never_occurs
-#define ompt_event_UNIMPLEMENTED ompt_set_result_event_may_occur_no_callback
-#define ompt_event_MAY_CONVENIENT ompt_set_result_event_may_occur_callback_some
-#define ompt_event_MAY_ALWAYS ompt_set_result_event_may_occur_callback_always
+#define ompt_event_UNIMPLEMENTED ompt_set_never
+#define ompt_event_MAY_CONVENIENT ompt_set_sometimes
+#define ompt_event_MAY_ALWAYS ompt_set_always
 
-#if OMPT_TRACE
-#define ompt_event_MAY_ALWAYS_TRACE ompt_event_MAY_ALWAYS
+#if OMPT_OPTIONAL
+#define ompt_event_MAY_ALWAYS_OPTIONAL ompt_event_MAY_ALWAYS
 #else
-#define ompt_event_MAY_ALWAYS_TRACE ompt_event_UNIMPLEMENTED
-#endif
-
-#if OMPT_BLAME
-#define ompt_event_MAY_ALWAYS_BLAME ompt_event_MAY_ALWAYS
-#else
-#define ompt_event_MAY_ALWAYS_BLAME ompt_event_UNIMPLEMENTED
+#define ompt_event_MAY_ALWAYS_OPTIONAL ompt_event_UNIMPLEMENTED
 #endif
 
 /*----------------------------------------------------------------------------
  | Mandatory Events
  +--------------------------------------------------------------------------*/
 
-#define ompt_event_parallel_begin_implemented ompt_event_MAY_ALWAYS
-#define ompt_event_parallel_end_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_parallel_begin_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_parallel_end_implemented ompt_event_MAY_ALWAYS
 
-#define ompt_event_task_begin_implemented ompt_event_MAY_ALWAYS
-#define ompt_event_task_end_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_task_create_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_task_schedule_implemented ompt_event_MAY_ALWAYS
 
-#define ompt_event_thread_begin_implemented ompt_event_MAY_ALWAYS
-#define ompt_event_thread_end_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_thread_begin_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_thread_end_implemented ompt_event_MAY_ALWAYS
 
-#define ompt_event_control_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS
 
-#define ompt_event_runtime_shutdown_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS
 
 /*----------------------------------------------------------------------------
- | Optional Events (blame shifting)
+ | Target Related Events (not yet implemented)
  +--------------------------------------------------------------------------*/
 
-#define ompt_event_idle_begin_implemented ompt_event_MAY_ALWAYS_BLAME
-#define ompt_event_idle_end_implemented ompt_event_MAY_ALWAYS_BLAME
-
-#define ompt_event_wait_barrier_begin_implemented ompt_event_MAY_ALWAYS_BLAME
-#define ompt_event_wait_barrier_end_implemented ompt_event_MAY_ALWAYS_BLAME
+#define ompt_callback_target_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_target_data_op_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_target_submit_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_device_initialize_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_device_finalize_implemented ompt_event_UNIMPLEMENTED
 
-#define ompt_event_wait_taskwait_begin_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_wait_taskwait_end_implemented ompt_event_UNIMPLEMENTED
-
-#define ompt_event_wait_taskgroup_begin_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_wait_taskgroup_end_implemented ompt_event_UNIMPLEMENTED
-
-#define ompt_event_release_lock_implemented ompt_event_MAY_ALWAYS_BLAME
-#define ompt_event_release_nest_lock_last_implemented                          \
-  ompt_event_MAY_ALWAYS_BLAME
-#define ompt_event_release_critical_implemented ompt_event_MAY_ALWAYS_BLAME
-#define ompt_event_release_atomic_implemented ompt_event_MAY_ALWAYS_BLAME
-#define ompt_event_release_ordered_implemented ompt_event_MAY_ALWAYS_BLAME
+#define ompt_callback_target_map_implemented ompt_event_UNIMPLEMENTED
 
 /*----------------------------------------------------------------------------
- | Optional Events (synchronous events)
+ | Optional Events (blame shifting)
  +--------------------------------------------------------------------------*/
 
-#define ompt_event_implicit_task_begin_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_implicit_task_end_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_idle_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_initial_task_begin_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_initial_task_end_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_sync_region_wait_implemented                             \
+  ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_task_switch_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_mutex_released_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_loop_begin_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_loop_end_implemented ompt_event_MAY_ALWAYS_TRACE
-
-#define ompt_event_sections_begin_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_sections_end_implemented ompt_event_UNIMPLEMENTED
-
-#define ompt_event_single_in_block_begin_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_single_in_block_end_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_single_others_begin_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_single_others_end_implemented ompt_event_MAY_ALWAYS_TRACE
-
-#define ompt_event_workshare_begin_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_workshare_end_implemented ompt_event_UNIMPLEMENTED
-
-#define ompt_event_master_begin_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_master_end_implemented ompt_event_MAY_ALWAYS_TRACE
-
-#define ompt_event_barrier_begin_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_barrier_end_implemented ompt_event_MAY_ALWAYS_TRACE
+/*----------------------------------------------------------------------------
+ | Optional Events (synchronous events)
+ +--------------------------------------------------------------------------*/
 
-#define ompt_event_taskwait_begin_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_taskwait_end_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_work_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_taskgroup_begin_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_taskgroup_end_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_master_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_release_nest_lock_prev_implemented                          \
-  ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_wait_lock_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_wait_nest_lock_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_wait_critical_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_wait_atomic_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_wait_ordered_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_sync_region_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_acquired_lock_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_acquired_nest_lock_first_implemented                        \
-  ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_acquired_nest_lock_next_implemented                         \
-  ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_acquired_critical_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_acquired_atomic_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_acquired_ordered_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_mutex_acquire_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_mutex_acquired_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_nest_lock_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_init_lock_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_init_nest_lock_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_lock_init_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_lock_destroy_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_destroy_lock_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_destroy_nest_lock_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_flush_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_event_flush_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
 #if OMP_40_ENABLED
-#define ompt_event_task_dependences_implemented ompt_event_MAY_ALWAYS_TRACE
-#define ompt_event_task_dependence_pair_implemented ompt_event_MAY_ALWAYS_TRACE
+#define ompt_callback_task_dependences_implemented                             \
+  ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_task_dependence_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 #else
-#define ompt_event_task_dependences_implemented ompt_event_UNIMPLEMENTED
-#define ompt_event_task_dependence_pair_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_task_dependences_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_task_dependence_implemented ompt_event_UNIMPLEMENTED
 #endif /* OMP_40_ENABLED */
 
 #endif
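
For reference: the ompt-general.cpp changes below replace ompt_tool() with
ompt_start_tool() and drive tool startup through the returned ompt_fns_t, whose
initialize/finalize members are called further down in this diff. A compatible
tool entry point might look like the sketch below; the member names come from
those call sites, while the ompt_set_callback_t typedef and the int return type
of initialize are inferred and should be treated as assumptions.

/* Sketch only: a tool entry point for the new activation scheme. */
#include <ompt.h> /* assumed header providing the TR4 OMPT declarations */

static int tool_initialize(ompt_function_lookup_t lookup, ompt_fns_t *fns) {
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  /* Register callbacks here; set_callback returns ompt_set_always,
     ompt_set_sometimes or ompt_set_never per the *_implemented mappings in
     ompt-event-specific.h above. */
  (void)set_callback; /* registration elided in this sketch */
  (void)fns;
  return 1; /* non-zero result enables OMPT (see the !! conversion below) */
}

static void tool_finalize(ompt_fns_t *fns) { (void)fns; }

ompt_fns_t *ompt_start_tool(unsigned int omp_version,
                            const char *runtime_version) {
  static ompt_fns_t fns = {.initialize = tool_initialize,
                           .finalize = tool_finalize};
  (void)omp_version;
  (void)runtime_version;
  return &fns; /* returning NULL leaves OMPT disabled */
}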
Index: runtime/src/ompt-general.cpp
===================================================================
--- runtime/src/ompt-general.cpp
+++ runtime/src/ompt-general.cpp
@@ -8,6 +8,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#if KMP_OS_UNIX
+#include <dlfcn.h>
+#endif
 
 /*****************************************************************************
  * ompt include files
@@ -36,8 +39,13 @@
 
 typedef struct {
   const char *state_name;
-  ompt_state_t state_id;
-} ompt_state_info_t;
+  omp_state_t state_id;
+} omp_state_info_t;
+
+typedef struct {
+  const char *name;
+  ompt_mutex_impl_t id;
+} ompt_mutex_impl_info_t;
 
 enum tool_setting_e {
   omp_tool_error,
@@ -46,25 +54,27 @@
   omp_tool_enabled
 };
 
-typedef void (*ompt_initialize_t)(ompt_function_lookup_t ompt_fn_lookup,
-                                  const char *version,
-                                  unsigned int ompt_version);
-
 /*****************************************************************************
  * global variables
  ****************************************************************************/
 
-int ompt_enabled = 0;
+ompt_callbacks_active_t ompt_enabled;
+
+omp_state_info_t omp_state_info[] = {
+#define omp_state_macro(state, code) {#state, state},
+    FOREACH_OMP_STATE(omp_state_macro)
+#undef omp_state_macro
+};
 
-ompt_state_info_t ompt_state_info[] = {
-#define ompt_state_macro(state, code) {#state, state},
-    FOREACH_OMPT_STATE(ompt_state_macro)
-#undef ompt_state_macro
+ompt_mutex_impl_info_t ompt_mutex_impl_info[] = {
+#define ompt_mutex_impl_macro(name, id) {#name, name},
+    FOREACH_OMPT_MUTEX_IMPL(ompt_mutex_impl_macro)
+#undef ompt_mutex_impl_macro
 };
 
-ompt_callbacks_t ompt_callbacks;
+ompt_callbacks_internal_t ompt_callbacks;
 
-static ompt_initialize_t ompt_initialize_fn = NULL;
+static ompt_fns_t *ompt_fns = NULL;
 
 /*****************************************************************************
  * forward declarations
@@ -72,48 +82,71 @@
 
 static ompt_interface_fn_t ompt_fn_lookup(const char *s);
 
-OMPT_API_ROUTINE ompt_thread_id_t ompt_get_thread_id(void);
+OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void);
 
 /*****************************************************************************
  * initialization and finalization (private operations)
  ****************************************************************************/
 
 /* On Unix-like systems that support weak symbols the following implementation
- * of ompt_tool() will be used in case no tool-supplied implementation of
+ * of ompt_start_tool() will be used in case no tool-supplied implementation of
  * this function is present in the address space of a process.
  *
  * On Windows, the ompt_tool_windows function is used to find the
  * ompt_tool symbol across all modules loaded by a process. If ompt_tool is
  * found, ompt_tool's return value is used to initialize the tool. Otherwise,
  * NULL is returned and OMPT won't be enabled */
+
+typedef ompt_fns_t *(*ompt_start_tool_t)(unsigned int, const char *);
+
+#if KMP_OS_UNIX
+
 #if OMPT_HAVE_WEAK_ATTRIBUTE
+_OMP_EXTERN __attribute__((weak))
+#elif defined KMP_DYNAMIC_LIB
 _OMP_EXTERN
-__attribute__((weak)) ompt_initialize_t ompt_tool() {
+#warning Activation of OMPT may fail for tools statically linked into the application.
+#else
+#error Activation of OMPT is not supported on this platform.
+#endif
+ompt_fns_t *
+ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
+#ifdef KMP_DYNAMIC_LIB
+  ompt_fns_t *ret = NULL;
+  // Try next symbol in the address space
+  ompt_start_tool_t next_tool = NULL;
+  next_tool = (ompt_start_tool_t)dlsym(RTLD_NEXT, "ompt_start_tool");
+  if (next_tool)
+    ret = (next_tool)(omp_version, runtime_version);
+  return ret;
+#else
 #if OMPT_DEBUG
-  printf("ompt_tool() is called from the RTL\n");
+  printf("ompt_start_tool() is called from the RTL\n");
 #endif
   return NULL;
+#endif
 }
 
 #elif OMPT_HAVE_PSAPI
 
 #include <psapi.h>
 #pragma comment(lib, "psapi.lib")
-#define ompt_tool ompt_tool_windows
+#define ompt_start_tool ompt_tool_windows
 
 // The number of loaded modules to start enumeration with EnumProcessModules()
 #define NUM_MODULES 128
 
-static ompt_initialize_t ompt_tool_windows() {
+static ompt_fns_t *ompt_tool_windows(unsigned int omp_version,
+                                     const char *runtime_version) {
   int i;
   DWORD needed, new_size;
   HMODULE *modules;
   HANDLE process = GetCurrentProcess();
   modules = (HMODULE *)malloc(NUM_MODULES * sizeof(HMODULE));
-  ompt_initialize_t (*ompt_tool_p)() = NULL;
+  ompt_start_tool_t ompt_tool_p = NULL;
 
 #if OMPT_DEBUG
-  printf("ompt_tool_windows(): looking for ompt_tool\n");
+  printf("ompt_tool_windows(): looking for ompt_start_tool\n");
 #endif
   if (!EnumProcessModules(process, modules, NUM_MODULES * sizeof(HMODULE),
                           &needed)) {
@@ -135,21 +168,22 @@
     }
   }
   for (i = 0; i < new_size; ++i) {
-    (FARPROC &)ompt_tool_p = GetProcAddress(modules[i], "ompt_tool");
+    (FARPROC &)ompt_tool_p = GetProcAddress(modules[i], "ompt_start_tool");
     if (ompt_tool_p) {
 #if OMPT_DEBUG
       TCHAR modName[MAX_PATH];
       if (GetModuleFileName(modules[i], modName, MAX_PATH))
-        printf("ompt_tool_windows(): ompt_tool found in module %s\n", modName);
+        printf("ompt_tool_windows(): ompt_start_tool found in module %s\n",
+               modName);
 #endif
       free(modules);
-      return ompt_tool_p();
+      return (*ompt_tool_p)(omp_version, runtime_version);
     }
 #if OMPT_DEBUG
     else {
       TCHAR modName[MAX_PATH];
       if (GetModuleFileName(modules[i], modName, MAX_PATH))
-        printf("ompt_tool_windows(): ompt_tool not found in module %s\n",
+        printf("ompt_tool_windows(): ompt_start_tool not found in module %s\n",
                modName);
     }
 #endif
@@ -161,6 +195,49 @@
 #error Either __attribute__((weak)) or psapi.dll are required for OMPT support
 #endif // OMPT_HAVE_WEAK_ATTRIBUTE
 
+static ompt_fns_t *ompt_try_start_tool(unsigned int omp_version,
+                                       const char *runtime_version) {
+  ompt_fns_t *ret = NULL;
+  ompt_start_tool_t start_tool = NULL;
+#if KMP_OS_WINDOWS
+  // Cannot use colon to describe a list of absolute paths on Windows
+  const char *sep = ";";
+#else
+  const char *sep = ":";
+#endif
+
+  // Try in the current address space
+  if ((ret = ompt_start_tool(omp_version, runtime_version)))
+    return ret;
+
+  // Try tool-libraries-var ICV
+  const char *tool_libs = getenv("OMP_TOOL_LIBRARIES");
+  if (tool_libs) {
+    const char *libs = __kmp_str_format("%s", tool_libs);
+    char *buf;
+    char *fname = __kmp_str_token(CCAST(char *, libs), sep, &buf);
+    while (fname) {
+#if KMP_OS_UNIX
+      void *h = dlopen(fname, RTLD_LAZY);
+      if (h) {
+        start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool");
+#elif KMP_OS_WINDOWS
+      HMODULE h = LoadLibrary(fname);
+      if (h) {
+        start_tool = (ompt_start_tool_t)GetProcAddress(h, "ompt_start_tool");
+#else
+#error Activation of OMPT is not supported on this platform.
+#endif
+        if (start_tool && (ret = (*start_tool)(omp_version, runtime_version)))
+          break;
+      }
+      fname = __kmp_str_token(NULL, sep, &buf);
+    }
+    __kmp_str_free(&libs);
+  }
+  return ret;
+}
+
 void ompt_pre_init() {
   //--------------------------------------------------
   // Execute the pre-initialization logic only once.
@@ -194,10 +271,14 @@
 
   case omp_tool_unset:
   case omp_tool_enabled:
-    ompt_initialize_fn = ompt_tool();
-    if (ompt_initialize_fn) {
-      ompt_enabled = 1;
-    }
+
+    //--------------------------------------------------
+    // Load tool unless explicitly disabled via the OMP_TOOL env variable
+    //--------------------------------------------------
+    ompt_fns =
+        ompt_try_start_tool(__kmp_openmp_version, ompt_get_runtime_version());
+
+    memset(&ompt_enabled, 0, sizeof(ompt_enabled));
     break;
 
   case omp_tool_error:
@@ -226,31 +307,34 @@
   //--------------------------------------------------
   // Initialize the tool if so indicated.
   //--------------------------------------------------
-  if (ompt_enabled) {
-    ompt_initialize_fn(ompt_fn_lookup, ompt_get_runtime_version(),
-                       OMPT_VERSION);
+  if (ompt_fns) {
+    ompt_enabled.enabled = !!ompt_fns->initialize(ompt_fn_lookup, ompt_fns);
 
     ompt_thread_t *root_thread = ompt_get_thread();
 
-    ompt_set_thread_state(root_thread, ompt_state_overhead);
+    ompt_set_thread_state(root_thread, omp_state_overhead);
 
-    if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
-      ompt_callbacks.ompt_callback(ompt_event_thread_begin)(
-          ompt_thread_initial, ompt_get_thread_id());
+    if (ompt_enabled.ompt_callback_thread_begin) {
+      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
+          ompt_thread_initial, __ompt_get_thread_data_internal());
+    }
+    ompt_data_t *task_data;
+    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          NULL, NULL, task_data, ompt_task_initial, 0, NULL);
     }
 
-    ompt_set_thread_state(root_thread, ompt_state_work_serial);
+    ompt_set_thread_state(root_thread, omp_state_work_serial);
   }
 }
 
 void ompt_fini() {
-  if (ompt_enabled) {
-    if (ompt_callbacks.ompt_callback(ompt_event_runtime_shutdown)) {
-      ompt_callbacks.ompt_callback(ompt_event_runtime_shutdown)();
-    }
+  if (ompt_enabled.enabled) {
+    ompt_fns->finalize(ompt_fns);
   }
 
-  ompt_enabled = 0;
+  memset(&ompt_enabled, 0, sizeof(ompt_enabled));
 }
 
 /*****************************************************************************
@@ -261,15 +345,15 @@
  * state
  ****************************************************************************/
 
-OMPT_API_ROUTINE int ompt_enumerate_state(int current_state, int *next_state,
-                                          const char **next_state_name) {
-  const static int len = sizeof(ompt_state_info) / sizeof(ompt_state_info_t);
+OMPT_API_ROUTINE int ompt_enumerate_states(int current_state, int *next_state,
+                                           const char **next_state_name) {
+  const static int len = sizeof(omp_state_info) / sizeof(omp_state_info_t);
   int i = 0;
 
   for (i = 0; i < len - 1; i++) {
-    if (ompt_state_info[i].state_id == current_state) {
-      *next_state = ompt_state_info[i + 1].state_id;
-      *next_state_name = ompt_state_info[i + 1].state_name;
+    if (omp_state_info[i].state_id == current_state) {
+      *next_state = omp_state_info[i + 1].state_id;
+      *next_state_name = omp_state_info[i + 1].state_name;
       return 1;
     }
   }
@@ -277,17 +361,35 @@
   return 0;
 }
 
+OMPT_API_ROUTINE int ompt_enumerate_mutex_impls(int current_impl,
+                                                int *next_impl,
+                                                const char **next_impl_name) {
+  const static int len =
+      sizeof(ompt_mutex_impl_info) / sizeof(ompt_mutex_impl_info_t);
+  int i = 0;
+  for (i = 0; i < len - 1; i++) {
+    if (ompt_mutex_impl_info[i].id != current_impl)
+      continue;
+    *next_impl = ompt_mutex_impl_info[i + 1].id;
+    *next_impl_name = ompt_mutex_impl_info[i + 1].name;
+    return 1;
+  }
+  return 0;
+}
+
 /*****************************************************************************
  * callbacks
  ****************************************************************************/
 
-OMPT_API_ROUTINE int ompt_set_callback(ompt_event_t evid, ompt_callback_t cb) {
-  switch (evid) {
+OMPT_API_ROUTINE int ompt_set_callback(ompt_callbacks_t which,
+                                       ompt_callback_t callback) {
+  switch (which) {
 
 #define ompt_event_macro(event_name, callback_type, event_id)                  \
   case event_name:                                                             \
     if (ompt_event_implementation_status(event_name)) {                        \
-      ompt_callbacks.ompt_callback(event_name) = (callback_type)cb;            \
+      ompt_callbacks.ompt_callback(event_name) = (callback_type)callback;      \
+      ompt_enabled.event_name = 1;                                             \
     }                                                                          \
     return ompt_event_implementation_status(event_name);
 
@@ -296,12 +398,13 @@
 #undef ompt_event_macro
 
   default:
-    return ompt_set_result_registration_error;
+    return ompt_set_error;
   }
 }
 
-OMPT_API_ROUTINE int ompt_get_callback(ompt_event_t evid, ompt_callback_t *cb) {
-  switch (evid) {
+OMPT_API_ROUTINE int ompt_get_callback(ompt_callbacks_t which,
+                                       ompt_callback_t *callback) {
+  switch (which) {
 
 #define ompt_event_macro(event_name, callback_type, event_id)                  \
   case event_name:                                                             \
@@ -309,7 +412,7 @@
       ompt_callback_t mycb =                                                   \
           (ompt_callback_t)ompt_callbacks.ompt_callback(event_name);           \
       if (mycb) {                                                              \
-        *cb = mycb;                                                            \
+        *callback = mycb;                                                      \
         return ompt_get_callback_success;                                      \
       }                                                                        \
     }                                                                          \
@@ -328,54 +431,149 @@
  * parallel regions
  ****************************************************************************/
 
-OMPT_API_ROUTINE ompt_parallel_id_t ompt_get_parallel_id(int ancestor_level) {
-  return __ompt_get_parallel_id_internal(ancestor_level);
-}
-
-OMPT_API_ROUTINE int ompt_get_parallel_team_size(int ancestor_level) {
-  return __ompt_get_parallel_team_size_internal(ancestor_level);
-}
-
-OMPT_API_ROUTINE void *ompt_get_parallel_function(int ancestor_level) {
-  return __ompt_get_parallel_function_internal(ancestor_level);
+OMPT_API_ROUTINE int ompt_get_parallel_info(int ancestor_level,
+                                            ompt_data_t **parallel_data,
+                                            int *team_size) {
+  return __ompt_get_parallel_info_internal(ancestor_level, parallel_data,
+                                           team_size);
 }
 
-OMPT_API_ROUTINE ompt_state_t ompt_get_state(ompt_wait_id_t *ompt_wait_id) {
-  ompt_state_t thread_state = __ompt_get_state_internal(ompt_wait_id);
+OMPT_API_ROUTINE omp_state_t ompt_get_state(ompt_wait_id_t *wait_id) {
+  omp_state_t thread_state = __ompt_get_state_internal(wait_id);
 
-  if (thread_state == ompt_state_undefined) {
-    thread_state = ompt_state_work_serial;
+  if (thread_state == omp_state_undefined) {
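+    // a thread whose state was never recorded is reported as doing serial
+    // work rather than omp_state_undefined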
+    thread_state = omp_state_work_serial;
   }
 
   return thread_state;
 }
 
 /*****************************************************************************
- * threads
+ * tasks
  ****************************************************************************/
 
-OMPT_API_ROUTINE void *ompt_get_idle_frame() {
-  return __ompt_get_idle_frame_internal();
+OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void) {
+  return __ompt_get_thread_data_internal();
+}
+
+OMPT_API_ROUTINE int ompt_get_task_info(int ancestor_level, int *type,
+                                        ompt_data_t **task_data,
+                                        ompt_frame_t **task_frame,
+                                        ompt_data_t **parallel_data,
+                                        int *thread_num) {
+  return __ompt_get_task_info_internal(ancestor_level, type, task_data,
+                                       task_frame, parallel_data, thread_num);
 }
 
 /*****************************************************************************
- * tasks
+ * places
  ****************************************************************************/
 
-OMPT_API_ROUTINE ompt_thread_id_t ompt_get_thread_id(void) {
-  return __ompt_get_thread_id_internal();
+OMPT_API_ROUTINE int ompt_get_num_places(void) {
+// copied from kmp_ftn_entry.h (but modified)
+#if !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  return __kmp_affinity_num_masks;
+#endif
+}
+
+OMPT_API_ROUTINE int ompt_get_place_proc_ids(int place_num, int ids_size,
+                                             int *ids) {
+// copied from kmp_ftn_entry.h (but modified)
+#if !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  int i, count;
+  int tmp_ids[ids_size];
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
+    return 0;
+  /* TODO: Is this safe for asynchronous call from signal handler during runtime
+   * shutdown? */
+  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
+  count = 0;
+  KMP_CPU_SET_ITERATE(i, mask) {
+    if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
+        (!KMP_CPU_ISSET(i, mask))) {
+      continue;
+    }
+    if (count < ids_size)
+      tmp_ids[count] = i;
+    count++;
+  }
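+  // ids is only filled in when the buffer is large enough; the return value
+  // is always the total number of procs in the requested place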
+  if (ids_size >= count) {
+    for (i = 0; i < count; i++) {
+      ids[i] = tmp_ids[i];
+    }
+  }
+  return count;
+#endif
 }
 
-OMPT_API_ROUTINE ompt_task_id_t ompt_get_task_id(int depth) {
-  return __ompt_get_task_id_internal(depth);
+OMPT_API_ROUTINE int ompt_get_place_num(void) {
+// copied from kmp_ftn_entry.h (but modified)
+#if !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  int gtid;
+  kmp_info_t *thread;
+  if (!KMP_AFFINITY_CAPABLE())
+    return -1;
+  gtid = __kmp_entry_gtid();
+  thread = __kmp_thread_from_gtid(gtid);
+  if (thread == NULL || thread->th.th_current_place < 0)
+    return -1;
+  return thread->th.th_current_place;
+#endif
 }
 
-OMPT_API_ROUTINE ompt_frame_t *ompt_get_task_frame(int depth) {
-  return __ompt_get_task_frame_internal(depth);
+OMPT_API_ROUTINE int ompt_get_partition_place_nums(int place_nums_size,
+                                                   int *place_nums) {
+// copied from kmp_ftn_entry.h (but modified)
+#if !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  int i, gtid, place_num, first_place, last_place, start, end;
+  kmp_info_t *thread;
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  gtid = __kmp_entry_gtid();
+  thread = __kmp_thread_from_gtid(gtid);
+  if (thread == NULL)
+    return 0;
+  first_place = thread->th.th_first_place;
+  last_place = thread->th.th_last_place;
+  if (first_place < 0 || last_place < 0)
+    return 0;
+  if (first_place <= last_place) {
+    start = first_place;
+    end = last_place;
+  } else {
+    start = last_place;
+    end = first_place;
+  }
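+  // the partition covers the places start..end (inclusive)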
+  if (end - start + 1 <= place_nums_size)
+    for (i = 0, place_num = start; place_num <= end; ++place_num, ++i) {
+      place_nums[i] = place_num;
+    }
+  return end - start + 1;
+#endif
 }
 
-OMPT_API_ROUTINE void *ompt_get_task_function(int depth) {
-  return __ompt_get_task_function_internal(depth);
+/*****************************************************************************
+ * proc id
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_proc_id(void) {
+#if KMP_OS_LINUX
+  return sched_getcpu();
+#else
+  return -1;
+#endif
 }
 
 /*****************************************************************************
@@ -435,28 +633,59 @@
 OMPT_API_ROUTINE int ompt_get_ompt_version() { return OMPT_VERSION; }
 
 /*****************************************************************************
- * application-facing API
+* application-facing API
  ****************************************************************************/
 
 /*----------------------------------------------------------------------------
  | control
  ---------------------------------------------------------------------------*/
 
-_OMP_EXTERN void ompt_control(uint64_t command, uint64_t modifier) {
-  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_control)) {
-    ompt_callbacks.ompt_callback(ompt_event_control)(command, modifier);
+int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg) {
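+  // forward omp_control_tool() to the registered tool; the return values
+  // follow omp_control_tool_result_t: -1 if no control_tool callback is
+  // registered, -2 if no tool is active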
+
+  if (ompt_enabled.enabled) {
+    if (ompt_enabled.ompt_callback_control_tool) {
+      return ompt_callbacks.ompt_callback(ompt_callback_control_tool)(
+          command, modifier, arg, OMPT_LOAD_RETURN_ADDRESS(__kmp_entry_gtid()));
+    } else {
+      return -1;
+    }
+  } else {
+    return -2;
   }
 }
 
 /*****************************************************************************
+ * misc
+ ****************************************************************************/
+
+OMPT_API_ROUTINE uint64_t ompt_get_unique_id(void) {
+  return __ompt_get_unique_id_internal();
+}
+
+/*****************************************************************************
+ * Target
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_target_info(uint64_t *device_num,
+                                          ompt_id_t *target_id,
+                                          ompt_id_t *host_op_id) {
+  return 0; // thread is not in a target region
+}
+
+OMPT_API_ROUTINE int ompt_get_num_devices(void) {
+  return 1; // only one device (the current device) is available
+}
+
+/*****************************************************************************
  * API inquiry for tool
  ****************************************************************************/
 
 static ompt_interface_fn_t ompt_fn_lookup(const char *s) {
 
 #define ompt_interface_fn(fn)                                                  \
+  fn##_t fn##_f = fn;                                                          \
   if (strcmp(s, #fn) == 0)                                                     \
-    return (ompt_interface_fn_t)fn;
+    return (ompt_interface_fn_t)fn##_f;
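+// the fn##_f temporary above lets the compiler check each inquiry function
+// against its published fn##_t signature before the cast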
 
   FOREACH_OMPT_INQUIRY_FN(ompt_interface_fn)
 
Index: runtime/src/ompt-internal.h
===================================================================
--- runtime/src/ompt-internal.h
+++ runtime/src/ompt-internal.h
@@ -13,19 +13,38 @@
 
 #define ompt_callback(e) e##_callback
 
-typedef struct ompt_callbacks_s {
+typedef struct ompt_callbacks_internal_s {
 #define ompt_event_macro(event, callback, eventid)                             \
   callback ompt_callback(event);
 
   FOREACH_OMPT_EVENT(ompt_event_macro)
 
 #undef ompt_event_macro
-} ompt_callbacks_t;
+} ompt_callbacks_internal_t;
+
+typedef struct ompt_callbacks_active_s {
+  unsigned int enabled : 1;
+#define ompt_event_macro(event, callback, eventid) unsigned int event : 1;
+
+  FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+} ompt_callbacks_active_t;
+
+typedef struct kmp_taskdata kmp_taskdata_t;
+
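+// map kmp_taskdata_t flags onto the ompt_task_* flag bits reported through
+// ompt_get_task_info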
+#define TASK_TYPE_DETAILS_FORMAT(info)                                         \
+  ((info->td_flags.task_serial || info->td_flags.tasking_ser)                  \
+       ? ompt_task_undeferred                                                  \
+       : 0x0) |                                                                \
+      ((!(info->td_flags.tiedness)) ? ompt_task_untied : 0x0) |                \
+      (info->td_flags.final ? ompt_task_final : 0x0) |                         \
+      (info->td_flags.merged_if0 ? ompt_task_mergeable : 0x0)
 
 typedef struct {
   ompt_frame_t frame;
-  void *function;
-  ompt_task_id_t task_id;
+  ompt_data_t task_data;
+  kmp_taskdata_t *scheduling_parent;
 #if OMP_40_ENABLED
   int ndeps;
   ompt_task_dependence_t *deps;
@@ -33,32 +52,31 @@
 } ompt_task_info_t;
 
 typedef struct {
-  ompt_parallel_id_t parallel_id;
-  void *microtask;
+  ompt_data_t parallel_data;
+  void *master_return_address;
 } ompt_team_info_t;
 
 typedef struct ompt_lw_taskteam_s {
   ompt_team_info_t ompt_team_info;
   ompt_task_info_t ompt_task_info;
+  int heap;
   struct ompt_lw_taskteam_s *parent;
 } ompt_lw_taskteam_t;
 
-typedef struct ompt_parallel_info_s {
-  ompt_task_id_t parent_task_id; /* id of parent task            */
-  ompt_parallel_id_t parallel_id; /* id of parallel region        */
-  ompt_frame_t *parent_task_frame; /* frame data of parent task    */
-  void *parallel_function; /* pointer to outlined function */
-} ompt_parallel_info_t;
-
 typedef struct {
-  ompt_state_t state;
+  ompt_data_t thread_data;
+  ompt_data_t task_data; /* stored here from implicit barrier-begin until
+                            implicit-task-end */
+  void *return_address; /* stored here on entry of runtime */
+  omp_state_t state;
   ompt_wait_id_t wait_id;
+  int ompt_task_yielded;
   void *idle_frame;
 } ompt_thread_info_t;
 
-extern ompt_callbacks_t ompt_callbacks;
+extern ompt_callbacks_internal_t ompt_callbacks;
 
-#if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
+#if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_OPTIONAL
 #if USE_FAST_MEMORY
 #define KMP_OMPT_DEPS_ALLOC __kmp_fast_allocate
 #define KMP_OMPT_DEPS_FREE __kmp_fast_free
@@ -66,7 +84,7 @@
 #define KMP_OMPT_DEPS_ALLOC __kmp_thread_malloc
 #define KMP_OMPT_DEPS_FREE __kmp_thread_free
 #endif
-#endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
+#endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_OPTIONAL */
 
 #ifdef __cplusplus
 extern "C" {
@@ -76,7 +94,20 @@
 void ompt_post_init(void);
 void ompt_fini(void);
 
-extern int ompt_enabled;
+#define OMPT_GET_RETURN_ADDRESS(level) __builtin_return_address(level)
+#define OMPT_GET_FRAME_ADDRESS(level) __builtin_frame_address(level)
+
+int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg);
+
+extern ompt_callbacks_active_t ompt_enabled;
+
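+// portability: __builtin_expect and __attribute__((noinline)) are not
+// available with MSVC, so UNLIKELY degrades to a plain expression on Windows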
+#if KMP_OS_WINDOWS
+#define UNLIKELY(x) (x)
+#define OMPT_NOINLINE __declspec(noinline)
+#else
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#define OMPT_NOINLINE __attribute__((noinline))
+#endif
 
 #ifdef __cplusplus
 };
Index: runtime/src/ompt-specific.h
===================================================================
--- runtime/src/ompt-specific.h
+++ runtime/src/ompt-specific.h
@@ -13,42 +13,63 @@
  * forward declarations
  ****************************************************************************/
 
-void __ompt_team_assign_id(kmp_team_t *team, ompt_parallel_id_t ompt_pid);
+void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid);
 void __ompt_thread_assign_wait_id(void *variable);
 
 void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, ompt_thread_t *thr,
-                             int gtid, void *microtask,
-                             ompt_parallel_id_t ompt_pid);
+                             int gtid, ompt_data_t *ompt_pid, void *codeptr);
 
-void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, ompt_thread_t *thr);
+void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, ompt_thread_t *thr,
+                             int on_heap);
 
-ompt_lw_taskteam_t *__ompt_lw_taskteam_unlink(ompt_thread_t *thr);
-
-ompt_parallel_id_t __ompt_parallel_id_new(int gtid);
-ompt_task_id_t __ompt_task_id_new(int gtid);
+void __ompt_lw_taskteam_unlink(ompt_thread_t *thr);
 
 ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size);
 
-ompt_task_info_t *__ompt_get_taskinfo(int depth);
-
-void __ompt_thread_begin(ompt_thread_type_t thread_type, int gtid);
+ompt_task_info_t *__ompt_get_task_info_object(int depth);
 
-void __ompt_thread_end(ompt_thread_type_t thread_type, int gtid);
+int __ompt_get_parallel_info_internal(int ancestor_level,
+                                      ompt_data_t **parallel_data,
+                                      int *team_size);
 
-int __ompt_get_parallel_team_size_internal(int ancestor_level);
+int __ompt_get_task_info_internal(int ancestor_level, int *type,
+                                  ompt_data_t **task_data,
+                                  ompt_frame_t **task_frame,
+                                  ompt_data_t **parallel_data, int *thread_num);
 
-ompt_task_id_t __ompt_get_task_id_internal(int depth);
+ompt_data_t *__ompt_get_thread_data_internal();
 
-ompt_frame_t *__ompt_get_task_frame_internal(int depth);
+static uint64_t __ompt_get_unique_id_internal();
 
 /*****************************************************************************
  * macros
  ****************************************************************************/
 
+#define OMPT_CUR_TASK_INFO(thr) (&(thr->th.th_current_task->ompt_task_info))
+#define OMPT_CUR_TASK_DATA(thr)                                                \
+  (&(thr->th.th_current_task->ompt_task_info.task_data))
+#define OMPT_CUR_TEAM_INFO(thr) (&(thr->th.th_team->t.ompt_team_info))
+#define OMPT_CUR_TEAM_DATA(thr)                                                \
+  (&(thr->th.th_team->t.ompt_team_info.parallel_data))
+
 #define OMPT_HAVE_WEAK_ATTRIBUTE KMP_HAVE_WEAK_ATTRIBUTE
 #define OMPT_HAVE_PSAPI KMP_HAVE_PSAPI
 #define OMPT_STR_MATCH(haystack, needle) __kmp_str_match(haystack, 0, needle)
 
+inline void *__ompt_load_return_address(int gtid) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+  void *return_address = thr->th.ompt_thread_info.return_address;
+  thr->th.ompt_thread_info.return_address = NULL;
+  return return_address;
+}
+
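+// record the return address only on the outermost runtime entry; nested
+// entries keep the codeptr that points back into user code, and the value is
+// consumed (and cleared) by OMPT_LOAD_RETURN_ADDRESS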
+#define OMPT_STORE_RETURN_ADDRESS(gtid)                                        \
+  if (ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] &&              \
+      !__kmp_threads[gtid]->th.ompt_thread_info.return_address)                \
+  __kmp_threads[gtid]->th.ompt_thread_info.return_address =                    \
+      __builtin_return_address(0)
+#define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid)
+
 //******************************************************************************
 // inline functions
 //******************************************************************************
@@ -62,7 +83,7 @@
   return ompt_get_thread_gtid(gtid);
 }
 
-inline void ompt_set_thread_state(ompt_thread_t *thread, ompt_state_t state) {
+inline void ompt_set_thread_state(ompt_thread_t *thread, omp_state_t state) {
   thread->th.ompt_thread_info.state = state;
 }
 
Index: runtime/src/ompt-specific.cpp
===================================================================
--- runtime/src/ompt-specific.cpp
+++ runtime/src/ompt-specific.cpp
@@ -6,39 +6,31 @@
 #include "ompt-internal.h"
 #include "ompt-specific.h"
 
+#if KMP_OS_UNIX
+#include <dlfcn.h>
+#endif
+
+#if KMP_OS_WINDOWS
+#define THREAD_LOCAL __declspec(thread)
+#else
+#define THREAD_LOCAL __thread
+#endif
+
 //******************************************************************************
 // macros
 //******************************************************************************
 
-#define GTID_TO_OMPT_THREAD_ID(id) ((ompt_thread_id_t)(id >= 0) ? id + 1 : 0)
-
-#define LWT_FROM_TEAM(team) (team)->t.ompt_serialized_team_info;
+#define LWT_FROM_TEAM(team) (team)->t.ompt_serialized_team_info
 
 #define OMPT_THREAD_ID_BITS 16
 
-// 2013 08 24 - John Mellor-Crummey
-//   ideally, a thread should assign its own ids based on thread private data.
-//   however, the way the intel runtime reinitializes thread data structures
-//   when it creates teams makes it difficult to maintain persistent thread
-//   data. using a shared variable instead is simple. I leave it to intel to
-//   sort out how to implement a higher performance version in their runtime.
-
-// when using fetch_and_add to generate the IDs, there isn't any reason to waste
-// bits for thread id.
-#if 0
-#define NEXT_ID(id_ptr, tid)                                                   \
-  ((KMP_TEST_THEN_INC64(id_ptr) << OMPT_THREAD_ID_BITS) | (tid))
-#else
-#define NEXT_ID(id_ptr, tid) (KMP_TEST_THEN_INC64((volatile kmp_int64 *)id_ptr))
-#endif
-
 //******************************************************************************
 // private operations
 //******************************************************************************
 
 //----------------------------------------------------------
 // traverse the team and task hierarchy
-// note: __ompt_get_teaminfo and __ompt_get_taskinfo
+// note: __ompt_get_teaminfo and __ompt_get_task_info_object
 //       traverse the hierarchy similarly and need to be
 //       kept consistent
 //----------------------------------------------------------
@@ -51,7 +43,7 @@
     if (team == NULL)
       return NULL;
 
-    ompt_lw_taskteam_t *lwt = LWT_FROM_TEAM(team);
+    ompt_lw_taskteam_t *next_lwt = LWT_FROM_TEAM(team), *lwt = NULL;
 
     while (depth > 0) {
       // next lightweight team (if any)
@@ -61,9 +53,14 @@
       // next heavyweight team (if any) after
       // lightweight teams are exhausted
       if (!lwt && team) {
-        team = team->t.t_parent;
-        if (team) {
-          lwt = LWT_FROM_TEAM(team);
+        if (next_lwt) {
+          lwt = next_lwt;
+          next_lwt = NULL;
+        } else {
+          team = team->t.t_parent;
+          if (team) {
+            next_lwt = LWT_FROM_TEAM(team);
+          }
         }
       }
 
@@ -90,13 +87,14 @@
   return NULL;
 }
 
-ompt_task_info_t *__ompt_get_taskinfo(int depth) {
+ompt_task_info_t *__ompt_get_task_info_object(int depth) {
   ompt_task_info_t *info = NULL;
   kmp_info_t *thr = ompt_get_thread();
 
   if (thr) {
     kmp_taskdata_t *taskdata = thr->th.th_current_task;
-    ompt_lw_taskteam_t *lwt = LWT_FROM_TEAM(taskdata->td_team);
+    ompt_lw_taskteam_t *lwt = NULL,
+                       *next_lwt = LWT_FROM_TEAM(taskdata->td_team);
 
     while (depth > 0) {
       // next lightweight team (if any)
@@ -106,9 +104,59 @@
       // next heavyweight team (if any) after
       // lightweight teams are exhausted
       if (!lwt && taskdata) {
-        taskdata = taskdata->td_parent;
-        if (taskdata) {
-          lwt = LWT_FROM_TEAM(taskdata->td_team);
+        if (next_lwt) {
+          lwt = next_lwt;
+          next_lwt = NULL;
+        } else {
+          taskdata = taskdata->td_parent;
+          if (taskdata) {
+            next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+          }
+        }
+      }
+      depth--;
+    }
+
+    if (lwt) {
+      info = &lwt->ompt_task_info;
+    } else if (taskdata) {
+      info = &taskdata->ompt_task_info;
+    }
+  }
+
+  return info;
+}
+
+ompt_task_info_t *__ompt_get_scheduling_taskinfo(int depth) {
+  ompt_task_info_t *info = NULL;
+  kmp_info_t *thr = ompt_get_thread();
+
+  if (thr) {
+    kmp_taskdata_t *taskdata = thr->th.th_current_task;
+
+    ompt_lw_taskteam_t *lwt = NULL,
+                       *next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+
+    while (depth > 0) {
+      // next lightweight team (if any)
+      if (lwt)
+        lwt = lwt->parent;
+
+      // next heavyweight team (if any) after
+      // lightweight teams are exhausted
+      if (!lwt && taskdata) {
+        // first try scheduling parent (for explicit task scheduling)
+        if (taskdata->ompt_task_info.scheduling_parent) {
+          taskdata = taskdata->ompt_task_info.scheduling_parent;
+        } else if (next_lwt) {
+          lwt = next_lwt;
+          next_lwt = NULL;
+        } else {
+          // then go for implicit tasks
+          taskdata = taskdata->td_parent;
+          if (taskdata) {
+            next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+          }
         }
       }
       depth--;
@@ -132,29 +180,14 @@
 // thread support
 //----------------------------------------------------------
 
-ompt_parallel_id_t __ompt_thread_id_new() {
-  static uint64_t ompt_thread_id = 1;
-  return NEXT_ID(&ompt_thread_id, 0);
-}
-
-void __ompt_thread_begin(ompt_thread_type_t thread_type, int gtid) {
-  ompt_callbacks.ompt_callback(ompt_event_thread_begin)(
-      thread_type, GTID_TO_OMPT_THREAD_ID(gtid));
-}
-
-void __ompt_thread_end(ompt_thread_type_t thread_type, int gtid) {
-  ompt_callbacks.ompt_callback(ompt_event_thread_end)(
-      thread_type, GTID_TO_OMPT_THREAD_ID(gtid));
-}
-
-ompt_thread_id_t __ompt_get_thread_id_internal() {
-  // FIXME: until we have a better way of assigning ids, use __kmp_get_gtid
-  // since the return value might be negative, we need to test that before
-  // assigning it to an ompt_thread_id_t, which is unsigned.
-  int id = __kmp_get_gtid();
-  assert(id >= 0);
-
-  return GTID_TO_OMPT_THREAD_ID(id);
+ompt_data_t *__ompt_get_thread_data_internal() {
+  if (__kmp_get_gtid() >= 0) {
+    kmp_info_t *thread = ompt_get_thread();
+    if (thread == NULL)
+      return NULL;
+    return &(thread->th.ompt_thread_info.thread_data);
+  }
+  return NULL;
 }
 
 //----------------------------------------------------------
@@ -162,13 +195,12 @@
 //----------------------------------------------------------
 
 void __ompt_thread_assign_wait_id(void *variable) {
-  int gtid = __kmp_gtid_get_specific();
-  kmp_info_t *ti = ompt_get_thread_gtid(gtid);
+  kmp_info_t *ti = ompt_get_thread();
 
   ti->th.ompt_thread_info.wait_id = (ompt_wait_id_t)variable;
 }
 
-ompt_state_t __ompt_get_state_internal(ompt_wait_id_t *ompt_wait_id) {
+omp_state_t __ompt_get_state_internal(ompt_wait_id_t *ompt_wait_id) {
   kmp_info_t *ti = ompt_get_thread();
 
   if (ti) {
@@ -176,46 +208,26 @@
       *ompt_wait_id = ti->th.ompt_thread_info.wait_id;
     return ti->th.ompt_thread_info.state;
   }
-  return ompt_state_undefined;
-}
-
-//----------------------------------------------------------
-// idle frame support
-//----------------------------------------------------------
-
-void *__ompt_get_idle_frame_internal(void) {
-  kmp_info_t *ti = ompt_get_thread();
-  return ti ? ti->th.ompt_thread_info.idle_frame : NULL;
+  return omp_state_undefined;
 }
 
 //----------------------------------------------------------
 // parallel region support
 //----------------------------------------------------------
 
-ompt_parallel_id_t __ompt_parallel_id_new(int gtid) {
-  static uint64_t ompt_parallel_id = 1;
-  return gtid >= 0 ? NEXT_ID(&ompt_parallel_id, gtid) : 0;
-}
-
-void *__ompt_get_parallel_function_internal(int depth) {
-  ompt_team_info_t *info = __ompt_get_teaminfo(depth, NULL);
-  void *function = info ? info->microtask : NULL;
-  return function;
-}
-
-ompt_parallel_id_t __ompt_get_parallel_id_internal(int depth) {
-  ompt_team_info_t *info = __ompt_get_teaminfo(depth, NULL);
-  ompt_parallel_id_t id = info ? info->parallel_id : 0;
-  return id;
-}
-
-int __ompt_get_parallel_team_size_internal(int depth) {
-  // initialize the return value with the error value.
-  // if there is a team at the specified depth, the default
-  // value will be overwritten the size of that team.
-  int size = -1;
-  (void)__ompt_get_teaminfo(depth, &size);
-  return size;
+int __ompt_get_parallel_info_internal(int ancestor_level,
+                                      ompt_data_t **parallel_data,
+                                      int *team_size) {
+  ompt_team_info_t *info;
+  if (team_size) {
+    info = __ompt_get_teaminfo(ancestor_level, team_size);
+  } else {
+    info = __ompt_get_teaminfo(ancestor_level, NULL);
+  }
+  if (parallel_data) {
+    *parallel_data = info ? &(info->parallel_data) : NULL;
+  }
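+  // 2 indicates that a parallel region exists at this ancestor level and its
+  // information is available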
+  return info ? 2 : 0;
 }
 
 //----------------------------------------------------------
@@ -223,60 +235,182 @@
 //----------------------------------------------------------
 
 void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid,
-                             void *microtask, ompt_parallel_id_t ompt_pid) {
-  lwt->ompt_team_info.parallel_id = ompt_pid;
-  lwt->ompt_team_info.microtask = microtask;
-  lwt->ompt_task_info.task_id = 0;
+                             ompt_data_t *ompt_pid, void *codeptr) {
+  // initialize the team info from the caller's parallel_data and record the
+  // master's return address (codeptr)
+  lwt->ompt_team_info.parallel_data = *ompt_pid;
+  lwt->ompt_team_info.master_return_address = codeptr;
+  lwt->ompt_task_info.task_data.value = 0;
   lwt->ompt_task_info.frame.reenter_runtime_frame = NULL;
   lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
-  lwt->ompt_task_info.function = NULL;
+  lwt->ompt_task_info.scheduling_parent = NULL;
+  lwt->ompt_task_info.deps = NULL;
+  lwt->ompt_task_info.ndeps = 0;
+  lwt->heap = 0;
   lwt->parent = 0;
 }
 
-void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr) {
-  ompt_lw_taskteam_t *my_parent = thr->th.th_team->t.ompt_serialized_team_info;
-  lwt->parent = my_parent;
-  thr->th.th_team->t.ompt_serialized_team_info = lwt;
+void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
+                             int on_heap) {
+  ompt_lw_taskteam_t *link_lwt = lwt;
+  if (thr->th.th_team->t.t_serialized >
+      1) { // we already have a team, so link the new team and swap values
+    if (on_heap) { // the lw_taskteam cannot stay on stack, allocate it on heap
+      link_lwt =
+          (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
+    }
+    link_lwt->heap = on_heap;
+
+    // effectively a swap when link_lwt == lwt (the on-stack case)
+    ompt_team_info_t tmp_team = lwt->ompt_team_info;
+    link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
+    *OMPT_CUR_TEAM_INFO(thr) = tmp_team;
+
+    ompt_task_info_t tmp_task = lwt->ompt_task_info;
+    link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
+    *OMPT_CUR_TASK_INFO(thr) = tmp_task;
+
+    // link the taskteam into the list of taskteams:
+    ompt_lw_taskteam_t *my_parent =
+        thr->th.th_team->t.ompt_serialized_team_info;
+    link_lwt->parent = my_parent;
+    thr->th.th_team->t.ompt_serialized_team_info = link_lwt;
+  } else {
+    // this is the first serialized team, so we just store the values in the
+    // team and drop the taskteam-object
+    *OMPT_CUR_TEAM_INFO(thr) = lwt->ompt_team_info;
+    *OMPT_CUR_TASK_INFO(thr) = lwt->ompt_task_info;
+  }
 }
 
-ompt_lw_taskteam_t *__ompt_lw_taskteam_unlink(kmp_info_t *thr) {
+void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
   ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info;
-  if (lwtask)
+  if (lwtask) {
     thr->th.th_team->t.ompt_serialized_team_info = lwtask->parent;
-  return lwtask;
+
+    ompt_team_info_t tmp_team = lwtask->ompt_team_info;
+    lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
+    *OMPT_CUR_TEAM_INFO(thr) = tmp_team;
+
+    ompt_task_info_t tmp_task = lwtask->ompt_task_info;
+    lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
+    *OMPT_CUR_TASK_INFO(thr) = tmp_task;
+
+    if (lwtask->heap) {
+      __kmp_free(lwtask);
+      lwtask = NULL;
+    }
+  }
+  //    return lwtask;
 }
 
 //----------------------------------------------------------
 // task support
 //----------------------------------------------------------
 
-ompt_task_id_t __ompt_task_id_new(int gtid) {
-  static uint64_t ompt_task_id = 1;
-  return NEXT_ID(&ompt_task_id, gtid);
-}
+int __ompt_get_task_info_internal(int ancestor_level, int *type,
+                                  ompt_data_t **task_data,
+                                  ompt_frame_t **task_frame,
+                                  ompt_data_t **parallel_data,
+                                  int *thread_num) {
+  if (ancestor_level < 0)
+    return 0;
 
-ompt_task_id_t __ompt_get_task_id_internal(int depth) {
-  ompt_task_info_t *info = __ompt_get_taskinfo(depth);
-  ompt_task_id_t task_id = info ? info->task_id : 0;
-  return task_id;
-}
+  // copied from __ompt_get_scheduling_taskinfo
+  ompt_task_info_t *info = NULL;
+  ompt_team_info_t *team_info = NULL;
+  kmp_info_t *thr = ompt_get_thread();
 
-void *__ompt_get_task_function_internal(int depth) {
-  ompt_task_info_t *info = __ompt_get_taskinfo(depth);
-  void *function = info ? info->function : NULL;
-  return function;
-}
+  if (thr) {
+    kmp_taskdata_t *taskdata = thr->th.th_current_task;
+    if (taskdata == NULL)
+      return 0;
+    kmp_team *team = thr->th.th_team;
+    if (team == NULL)
+      return 0;
+    ompt_lw_taskteam_t *lwt = NULL,
+                       *next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+
+    while (ancestor_level > 0) {
+      // next lightweight team (if any)
+      if (lwt)
+        lwt = lwt->parent;
+
+      // next heavyweight team (if any) after
+      // lightweight teams are exhausted
+      if (!lwt && taskdata) {
+        // first try scheduling parent (for explicit task scheduling)
+        if (taskdata->ompt_task_info.scheduling_parent) {
+          taskdata = taskdata->ompt_task_info.scheduling_parent;
+        } else if (next_lwt) {
+          lwt = next_lwt;
+          next_lwt = NULL;
+        } else {
+          // then go for implicit tasks
+          taskdata = taskdata->td_parent;
+          if (team == NULL)
+            return 0;
+          team = team->t.t_parent;
+          if (taskdata) {
+            next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+          }
+        }
+      }
+      ancestor_level--;
+    }
 
-ompt_frame_t *__ompt_get_task_frame_internal(int depth) {
-  ompt_task_info_t *info = __ompt_get_taskinfo(depth);
-  ompt_frame_t *frame = info ? frame = &info->frame : NULL;
-  return frame;
+    if (lwt) {
+      info = &lwt->ompt_task_info;
+      team_info = &lwt->ompt_team_info;
+      if (type) {
+        *type = ompt_task_implicit;
+      }
+    } else if (taskdata) {
+      info = &taskdata->ompt_task_info;
+      team_info = &team->t.ompt_team_info;
+      if (type) {
+        if (taskdata->td_parent) {
+          *type = (taskdata->td_flags.tasktype ? ompt_task_explicit
+                                               : ompt_task_implicit) |
+                  TASK_TYPE_DETAILS_FORMAT(taskdata);
+        } else {
+          *type = ompt_task_initial;
+        }
+      }
+    }
+    if (task_data) {
+      *task_data = info ? &info->task_data : NULL;
+    }
+    if (task_frame) {
+      // OpenMP spec asks for the scheduling task to be returned.
+      *task_frame = info ? &info->frame : NULL;
+    }
+    if (parallel_data) {
+      *parallel_data = team_info ? &(team_info->parallel_data) : NULL;
+    }
+    return info ? 2 : 0;
+  }
+  return 0;
 }
 
 //----------------------------------------------------------
 // team support
 //----------------------------------------------------------
 
-void __ompt_team_assign_id(kmp_team_t *team, ompt_parallel_id_t ompt_pid) {
-  team->t.ompt_team_info.parallel_id = ompt_pid;
+void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid) {
+  team->t.ompt_team_info.parallel_data = ompt_pid;
+}
+
+//----------------------------------------------------------
+// misc
+//----------------------------------------------------------
+
+static uint64_t __ompt_get_unique_id_internal() {
+  static uint64_t thread = 1;
+  static THREAD_LOCAL uint64_t ID = 0;
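+  // each thread lazily claims an ID block: the global counter fills the top
+  // OMPT_THREAD_ID_BITS bits, so the low bits can be incremented locally
+  // without further synchronization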
+  if (ID == 0) {
+    uint64_t new_thread = KMP_TEST_THEN_INC64((kmp_int64 *)&thread);
+    ID = new_thread << (sizeof(uint64_t) * 8 - OMPT_THREAD_ID_BITS);
+  }
+  return ++ID;
 }
Index: runtime/src/z_Linux_util.cpp
===================================================================
--- runtime/src/z_Linux_util.cpp
+++ runtime/src/z_Linux_util.cpp
@@ -2280,7 +2280,7 @@
 #endif
                            ) {
 #if OMPT_SUPPORT
-  *exit_frame_ptr = __builtin_frame_address(0);
+  *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
 #endif
 
   switch (argc) {
Index: runtime/test/CMakeLists.txt
===================================================================
--- runtime/test/CMakeLists.txt
+++ runtime/test/CMakeLists.txt
@@ -34,8 +34,7 @@
 
 pythonize_bool(LIBOMP_USE_HWLOC)
 pythonize_bool(LIBOMP_OMPT_SUPPORT)
-pythonize_bool(LIBOMP_OMPT_BLAME)
-pythonize_bool(LIBOMP_OMPT_TRACE)
+pythonize_bool(LIBOMP_OMPT_OPTIONAL)
 pythonize_bool(LIBOMP_HAVE_LIBM)
 pythonize_bool(LIBOMP_HAVE_LIBATOMIC)
 
Index: runtime/test/lit.cfg
===================================================================
--- runtime/test/lit.cfg
+++ runtime/test/lit.cfg
@@ -92,15 +92,15 @@
     # for callback.h
     config.test_cflags += " -I " + config.test_source_root + "/ompt"
 
+if 'Linux' in config.operating_system:
+    config.available_features.add("linux")
+
 # to run with icc INTEL_LICENSE_FILE must be set
 if 'INTEL_LICENSE_FILE' in os.environ:
     config.environment['INTEL_LICENSE_FILE'] = os.environ['INTEL_LICENSE_FILE']
 
-# substitutions
-if config.has_ompt:
-    config.substitutions.append(("FileCheck", config.test_filecheck))
-    config.substitutions.append(("%sort-threads", "sort --numeric-sort --stable"))
 
+# substitutions
 config.substitutions.append(("%libomp-compile-and-run", \
     "%libomp-compile && %libomp-run"))
 config.substitutions.append(("%libomp-cxx-compile-and-run", \
@@ -109,9 +109,14 @@
     "%clangXX %cflags -std=c++11 %s -o %t" + libs))
 config.substitutions.append(("%libomp-compile", \
     "%clang %cflags %s -o %t" + libs))
+config.substitutions.append(("%libomp-tool", \
+    "%clang %cflags -shared -fPIC -o %T/tool.so" + libs))
 config.substitutions.append(("%libomp-run", "%t"))
 config.substitutions.append(("%clangXX", config.test_cxx_compiler))
 config.substitutions.append(("%clang", config.test_compiler))
 config.substitutions.append(("%openmp_flag", config.test_openmp_flag))
 config.substitutions.append(("%cflags", config.test_cflags))
 
+if config.has_ompt:
+    config.substitutions.append(("FileCheck", config.test_filecheck))
+    config.substitutions.append(("%sort-threads", "sort --numeric-sort --stable"))
Index: runtime/test/lit.site.cfg.in
===================================================================
--- runtime/test/lit.site.cfg.in
+++ runtime/test/lit.site.cfg.in
@@ -11,7 +11,7 @@
 config.operating_system = "@CMAKE_SYSTEM_NAME@"
 config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@"
 config.using_hwloc = @LIBOMP_USE_HWLOC@
-config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_BLAME@ and @LIBOMP_OMPT_TRACE@
+config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@
 config.has_libm = @LIBOMP_HAVE_LIBM@
 config.has_libatomic = @LIBOMP_HAVE_LIBATOMIC@
 
Index: runtime/test/ompt/callback.h
===================================================================
--- runtime/test/ompt/callback.h
+++ runtime/test/ompt/callback.h
@@ -1,119 +1,660 @@
+#define _BSD_SOURCE
 #include <stdio.h>
 #include <inttypes.h>
+#include <omp.h>
 #include <ompt.h>
+#include "ompt-signal.h"
 
-static ompt_get_task_id_t ompt_get_task_id;
-static ompt_get_task_frame_t ompt_get_task_frame;
-static ompt_get_thread_id_t ompt_get_thread_id;
-static ompt_get_parallel_id_t ompt_get_parallel_id;
+static const char* ompt_thread_type_t_values[] = {
+  NULL,
+  "ompt_thread_initial",
+  "ompt_thread_worker",
+  "ompt_thread_other"
+};
+
+static const char* ompt_task_status_t_values[] = {
+  NULL,
+  "ompt_task_complete",
+  "ompt_task_yield",
+  "ompt_task_cancel",
+  "ompt_task_others"
+};
+static const char* ompt_cancel_flag_t_values[] = {
+  "ompt_cancel_parallel",
+  "ompt_cancel_sections",
+  "ompt_cancel_do",
+  "ompt_cancel_taskgroup",
+  "ompt_cancel_activated",
+  "ompt_cancel_detected",
+  "ompt_cancel_discarded_task"
+};
+
+static ompt_set_callback_t ompt_set_callback;
+static ompt_get_task_info_t ompt_get_task_info;
+static ompt_get_thread_data_t ompt_get_thread_data;
+static ompt_get_parallel_info_t ompt_get_parallel_info;
+static ompt_get_unique_id_t ompt_get_unique_id;
+static ompt_get_num_places_t ompt_get_num_places;
+static ompt_get_place_proc_ids_t ompt_get_place_proc_ids;
+static ompt_get_place_num_t ompt_get_place_num;
+static ompt_get_partition_place_nums_t ompt_get_partition_place_nums;
+static ompt_get_proc_id_t ompt_get_proc_id;
+static ompt_enumerate_states_t ompt_enumerate_states;
+static ompt_enumerate_mutex_impls_t ompt_enumerate_mutex_impls;
 
 static void print_ids(int level)
 {
-  ompt_frame_t* frame = ompt_get_task_frame(level);
-  printf("%" PRIu64 ": level %d: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", exit_frame=%p, reenter_frame=%p\n", ompt_get_thread_id(), level, ompt_get_parallel_id(level), ompt_get_task_id(level), frame->exit_runtime_frame, frame->reenter_runtime_frame);
+  ompt_frame_t* frame;
+  ompt_data_t* parallel_data;
+  ompt_data_t* task_data;
+  int exists_task = ompt_get_task_info(level, NULL, &task_data, &frame, &parallel_data, NULL);
+  if (frame)
+  {
+    printf("%" PRIu64 ": task level %d: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", exit_frame=%p, reenter_frame=%p\n", ompt_get_thread_data()->value, level, exists_task ? parallel_data->value : 0, exists_task ? task_data->value : 0, frame->exit_runtime_frame, frame->reenter_runtime_frame);
+  }
+  else
+    printf("%" PRIu64 ": task level %d: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", frame=%p\n", ompt_get_thread_data()->value, level, exists_task ? parallel_data->value : 0, exists_task ? task_data->value : 0, frame);
 }
 
 #define print_frame(level)\
 do {\
-  printf("%" PRIu64 ": __builtin_frame_address(%d)=%p\n", ompt_get_thread_id(), level, __builtin_frame_address(level));\
+  printf("%" PRIu64 ": __builtin_frame_address(%d)=%p\n", ompt_get_thread_data()->value, level, __builtin_frame_address(level));\
 } while(0)
 
+#define print_current_address(id)\
+{}              /* Empty block between "#pragma omp ..." and __asm__ statement as a workaround for icc bug */ \
+__asm__("nop"); /* provide an instruction as jump target (compiler would insert an instruction if label is target of a jmp ) */ \
+ompt_label_##id:\
+    printf("%" PRIu64 ": current_address=%p or %p\n", ompt_get_thread_data()->value, (char*)(&& ompt_label_##id)-1, (char*)(&& ompt_label_##id)-4) 
+    /* "&& label" returns the address of the label (GNU extension); works with gcc, clang, icc */
+    /* for a void-type runtime function, the label is right after the nop (-1); for functions with a return value, there is a mov instruction before the label (-4) */
+
+#define print_fuzzy_address(id)\
+{}              /* Empty block between "#pragma omp ..." and __asm__ statement as a workaround for icc bug */ \
+__asm__("nop"); /* provide an instruction as jump target (compiler would insert an instruction if label is target of a jmp ) */ \
+ompt_label_##id:\
+    printf("%" PRIu64 ": fuzzy_address=0x%lx or 0x%lx\n", ompt_get_thread_data()->value, ((uint64_t)(char*)(&& ompt_label_##id))/256-1, ((uint64_t)(char*)(&& ompt_label_##id))/256) 
+    /* "&& label" returns the address of the label (GNU extension); works with gcc, clang, icc */
+    /* for a void-type runtime function, the label is right after the nop (-1); for functions with a return value, there is a mov instruction before the label (-4) */
+
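+// print the base task type (initial/implicit/explicit/target) first and
+// append the detail flags, each preceded by '|'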
+static void format_task_type(int type, char* buffer)
+{
+  char* progress = buffer;
+  if(type & ompt_task_initial) progress += sprintf(progress, "ompt_task_initial");
+  if(type & ompt_task_implicit) progress += sprintf(progress, "ompt_task_implicit");
+  if(type & ompt_task_explicit) progress += sprintf(progress, "ompt_task_explicit");
+  if(type & ompt_task_target) progress += sprintf(progress, "ompt_task_target");
+  if(type & ompt_task_undeferred) progress += sprintf(progress, "|ompt_task_undeferred");
+  if(type & ompt_task_untied) progress += sprintf(progress, "|ompt_task_untied");
+  if(type & ompt_task_final) progress += sprintf(progress, "|ompt_task_final");
+  if(type & ompt_task_mergeable) progress += sprintf(progress, "|ompt_task_mergeable");
+  if(type & ompt_task_merged) progress += sprintf(progress, "|ompt_task_merged");
+}
+
+static void
+on_ompt_callback_mutex_acquire(
+  ompt_mutex_kind_t kind,
+  unsigned int hint,
+  unsigned int impl,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_wait_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_wait_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_critical:
+      printf("%" PRIu64 ": ompt_event_wait_critical: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_atomic:
+      printf("%" PRIu64 ": ompt_event_wait_atomic: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_ordered:
+      printf("%" PRIu64 ": ompt_event_wait_ordered: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    default:
+      break;
+  }
+}
+
+static void
+on_ompt_callback_mutex_acquired(
+  ompt_mutex_kind_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_acquired_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_acquired_nest_lock_first: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_critical:
+      printf("%" PRIu64 ": ompt_event_acquired_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_atomic:
+      printf("%" PRIu64 ": ompt_event_acquired_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_ordered:
+      printf("%" PRIu64 ": ompt_event_acquired_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    default:
+      break;
+  }
+}
+
+static void
+on_ompt_callback_mutex_released(
+  ompt_mutex_kind_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_release_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_release_nest_lock_last: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_critical:
+      printf("%" PRIu64 ": ompt_event_release_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_atomic:
+      printf("%" PRIu64 ": ompt_event_release_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_ordered:
+      printf("%" PRIu64 ": ompt_event_release_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    default:
+      break;
+  }
+}
+
+static void
+on_ompt_callback_nest_lock(
+    ompt_scope_endpoint_t endpoint,
+    ompt_wait_id_t wait_id,
+    const void *codeptr_ra)
+{
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      printf("%" PRIu64 ": ompt_event_acquired_nest_lock_next: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_scope_end:
+      printf("%" PRIu64 ": ompt_event_release_nest_lock_prev: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+  }
+}
+
+static void
+on_ompt_callback_sync_region(
+  ompt_sync_region_kind_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra)
+{
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+          printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          print_ids(0);
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+      }
+      break;
+    case ompt_scope_end:
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+          printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_taskwait_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+      }
+      break;
+  }
+}
+
+static void
+on_ompt_callback_sync_region_wait(
+  ompt_sync_region_kind_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra)
+{
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+          printf("%" PRIu64 ": ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_wait_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+      }
+      break;
+    case ompt_scope_end:
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+          printf("%" PRIu64 ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_wait_taskwait_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+      }
+      break;
+  }
+}
+
+static void
+on_ompt_callback_flush(
+    ompt_data_t *thread_data,
+    const void *codeptr_ra)
+{
+  printf("%" PRIu64 ": ompt_event_flush: codeptr_ra=%p\n", thread_data->value, codeptr_ra);
+}
+
+static void
+on_ompt_callback_cancel(
+    ompt_data_t *task_data,
+    int flags,
+    const void *codeptr_ra)
+{
+  const char* first_flag_value;
+  const char* second_flag_value;
+  if(flags & ompt_cancel_parallel)
+    first_flag_value = ompt_cancel_flag_t_values[0];
+  else if(flags & ompt_cancel_sections)
+    first_flag_value = ompt_cancel_flag_t_values[1];
+  else if(flags & ompt_cancel_do)
+    first_flag_value = ompt_cancel_flag_t_values[2];
+  else if(flags & ompt_cancel_taskgroup)
+    first_flag_value = ompt_cancel_flag_t_values[3];
+
+  if(flags & ompt_cancel_activated)
+    second_flag_value = ompt_cancel_flag_t_values[4];
+  else if(flags & ompt_cancel_detected)
+    second_flag_value = ompt_cancel_flag_t_values[5];
+  else if(flags & ompt_cancel_discarded_task)
+    second_flag_value = ompt_cancel_flag_t_values[6];
+    
+  printf("%" PRIu64 ": ompt_event_cancel: task_data=%" PRIu64 ", flags=%s|%s=%" PRIu32 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, task_data->value, first_flag_value, second_flag_value, flags,  codeptr_ra);
+}
 
 static void
-on_ompt_event_barrier_begin(
-  ompt_parallel_id_t parallel_id,
-  ompt_task_id_t task_id)
+on_ompt_callback_idle(
+  ompt_scope_endpoint_t endpoint)
 {
-  printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 "\n", ompt_get_thread_id(), parallel_id, task_id);
-  print_ids(0);
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      printf("%" PRIu64 ": ompt_event_idle_begin:\n", ompt_get_thread_data()->value);
+      break;
+    case ompt_scope_end:
+      printf("%" PRIu64 ": ompt_event_idle_end:\n", ompt_get_thread_data()->value);
+      break;
+  }
 }
 
 static void
-on_ompt_event_barrier_end(
-  ompt_parallel_id_t parallel_id,
-  ompt_task_id_t task_id)
+on_ompt_callback_implicit_task(
+    ompt_scope_endpoint_t endpoint,
+    ompt_data_t *parallel_data,
+    ompt_data_t *task_data,
+    unsigned int team_size,
+    unsigned int thread_num)
 {
-  printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 "\n", ompt_get_thread_id(), parallel_id, task_id);
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      if(task_data->ptr)
+        printf("%s\n", "0: task_data initially not null");
+      task_data->value = ompt_get_unique_id();
+      printf("%" PRIu64 ": ompt_event_implicit_task_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, team_size, thread_num);
+      break;
+    case ompt_scope_end:
+      printf("%" PRIu64 ": ompt_event_implicit_task_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, team_size, thread_num);
+      break;
+  }
 }
 
 static void
-on_ompt_event_implicit_task_begin(
-  ompt_parallel_id_t parallel_id,
-  ompt_task_id_t task_id)
+on_ompt_callback_lock_init(
+  ompt_mutex_kind_t kind,
+  unsigned int hint,
+  unsigned int impl,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
 {
-  printf("%" PRIu64 ": ompt_event_implicit_task_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 "\n", ompt_get_thread_id(), parallel_id, task_id);
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_init_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_init_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    default:
+      break;
+  }
 }
 
 static void
-on_ompt_event_implicit_task_end(
-  ompt_parallel_id_t parallel_id,
-  ompt_task_id_t task_id)
+on_ompt_callback_lock_destroy(
+  ompt_mutex_kind_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
 {
-  printf("%" PRIu64 ": ompt_event_implicit_task_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 "\n", ompt_get_thread_id(), parallel_id, task_id);
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_destroy_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_destroy_nest_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    default:
+      break;
+  }
 }
 
 static void
-on_ompt_event_loop_begin(
-  ompt_parallel_id_t parallel_id,
-  ompt_task_id_t parent_task_id,
-  void *workshare_function)
+on_ompt_callback_work(
+  ompt_work_type_t wstype,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  uint64_t count,
+  const void *codeptr_ra)
 {
-  printf("%" PRIu64 ": ompt_event_loop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", workshare_function=%p\n", ompt_get_thread_id(), parallel_id, parent_task_id, workshare_function);
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      switch(wstype)
+      {
+        case ompt_work_loop:
+          printf("%" PRIu64 ": ompt_event_loop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_sections:
+          printf("%" PRIu64 ": ompt_event_sections_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_executor:
+          printf("%" PRIu64 ": ompt_event_single_in_block_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_other:
+          printf("%" PRIu64 ": ompt_event_single_others_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_workshare:
+          //impl
+          break;
+        case ompt_work_distribute:
+          printf("%" PRIu64 ": ompt_event_distribute_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_taskloop:
+          //impl
+          printf("%" PRIu64 ": ompt_event_taskloop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+      }
+      break;
+    case ompt_scope_end:
+      switch(wstype)
+      {
+        case ompt_work_loop:
+          printf("%" PRIu64 ": ompt_event_loop_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_sections:
+          printf("%" PRIu64 ": ompt_event_sections_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_executor:
+          printf("%" PRIu64 ": ompt_event_single_in_block_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_other:
+          printf("%" PRIu64 ": ompt_event_single_others_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_workshare:
+          // ompt_work_workshare: not implemented
+          break;
+        case ompt_work_distribute:
+          printf("%" PRIu64 ": ompt_event_distribute_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_taskloop:
+          //impl
+          printf("%" PRIu64 ": ompt_event_taskloop_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+      }
+      break;
+  }
 }
 
 static void
-on_ompt_event_loop_end(
-  ompt_parallel_id_t parallel_id,
-  ompt_task_id_t task_id)
+on_ompt_callback_master(
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra)
 {
-  printf("%" PRIu64 ": ompt_event_loop_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 "\n", ompt_get_thread_id(), parallel_id, task_id);
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      printf("%" PRIu64 ": ompt_event_master_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+      break;
+    case ompt_scope_end:
+      printf("%" PRIu64 ": ompt_event_master_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+      break;
+  }
 }
 
 static void
-on_ompt_event_parallel_begin(
-  ompt_task_id_t parent_task_id,
-  ompt_frame_t *parent_task_frame,
-  ompt_parallel_id_t parallel_id,
+on_ompt_callback_parallel_begin(
+  ompt_data_t *parent_task_data,
+  const ompt_frame_t *parent_task_frame,
+  ompt_data_t* parallel_data,
   uint32_t requested_team_size,
-  void *parallel_function,
-  ompt_invoker_t invoker)
+  ompt_invoker_t invoker,
+  const void *codeptr_ra)
 {
-  printf("%" PRIu64 ": ompt_event_parallel_begin: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, parallel_id=%" PRIu64 ", requested_team_size=%" PRIu32 ", parallel_function=%p, invoker=%d\n", ompt_get_thread_id(), parent_task_id, parent_task_frame->exit_runtime_frame, parent_task_frame->reenter_runtime_frame, parallel_id, requested_team_size, parallel_function, invoker);
+  if(parallel_data->ptr)
+    printf("%s\n", "0: parallel_data initially not null");
+  parallel_data->value = ompt_get_unique_id();
+  printf("%" PRIu64 ": ompt_event_parallel_begin: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, parallel_id=%" PRIu64 ", requested_team_size=%" PRIu32 ", codeptr_ra=%p, invoker=%d\n", ompt_get_thread_data()->value, parent_task_data->value, parent_task_frame->exit_runtime_frame, parent_task_frame->reenter_runtime_frame, parallel_data->value, requested_team_size, codeptr_ra, invoker);
 }
 
 static void
-on_ompt_event_parallel_end(
-  ompt_parallel_id_t parallel_id,
-  ompt_task_id_t task_id,
-  ompt_invoker_t invoker)
+on_ompt_callback_parallel_end(
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  ompt_invoker_t invoker,
+  const void *codeptr_ra)
 {
-  printf("%" PRIu64 ": ompt_event_parallel_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", invoker=%d\n", ompt_get_thread_id(), parallel_id, task_id, invoker);
+  printf("%" PRIu64 ": ompt_event_parallel_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", invoker=%d, codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, invoker, codeptr_ra);
 }
 
+static void
+on_ompt_callback_task_create(
+    ompt_data_t *parent_task_data,     /* id of parent task            */
+    const ompt_frame_t *parent_frame,  /* frame data for parent task   */
+    ompt_data_t* new_task_data,        /* id of created task           */
+    int type,
+    int has_dependences,
+    const void *codeptr_ra)            /* return address of the runtime call */
+{
+  if(new_task_data->ptr)
+    printf("%s\n", "0: new_task_data initially not null");
+  new_task_data->value = ompt_get_unique_id();
+  char buffer[2048];
+
+  format_task_type(type, buffer);
 
-void ompt_initialize(
+  //there is no parallel_begin callback for the implicit parallel region,
+  //so its parallel_data is initialized here, in the initial task
+  if(type & ompt_task_initial)
+  {
+    ompt_data_t *parallel_data;
+    ompt_get_parallel_info(0, &parallel_data, NULL);
+    if(parallel_data->ptr)
+      printf("%s\n", "0: parallel_data initially not null");
+    parallel_data->value = ompt_get_unique_id();
+  }
+
+  printf("%" PRIu64 ": ompt_event_task_create: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, new_task_id=%" PRIu64 ", codeptr_ra=%p, task_type=%s=%d, has_dependences=%s\n", ompt_get_thread_data()->value, parent_task_data ? parent_task_data->value : 0, parent_frame ? parent_frame->exit_runtime_frame : NULL, parent_frame ? parent_frame->reenter_runtime_frame : NULL, new_task_data->value, codeptr_ra, buffer, type, has_dependences ? "yes" : "no");
+}
+
+static void
+on_ompt_callback_task_schedule(
+    ompt_data_t *first_task_data,
+    ompt_task_status_t prior_task_status,
+    ompt_data_t *second_task_data)
+{
+  printf("%" PRIu64 ": ompt_event_task_schedule: first_task_id=%" PRIu64 ", second_task_id=%" PRIu64 ", prior_task_status=%s=%d\n", ompt_get_thread_data()->value, first_task_data->value, second_task_data->value, ompt_task_status_t_values[prior_task_status], prior_task_status);
+  if(prior_task_status == ompt_task_complete)
+  {
+    printf("%" PRIu64 ": ompt_event_task_end: task_id=%" PRIu64 "\n", ompt_get_thread_data()->value, first_task_data->value);
+  }
+}
+
+static void
+on_ompt_callback_task_dependences(
+  ompt_data_t *task_data,
+  const ompt_task_dependence_t *deps,
+  int ndeps)
+{
+  printf("%" PRIu64 ": ompt_event_task_dependences: task_id=%" PRIu64 ", deps=%p, ndeps=%d\n", ompt_get_thread_data()->value, task_data->value, (void *)deps, ndeps);
+}
+
+static void
+on_ompt_callback_task_dependence(
+  ompt_data_t *first_task_data,
+  ompt_data_t *second_task_data)
+{
+  printf("%" PRIu64 ": ompt_event_task_dependence_pair: first_task_id=%" PRIu64 ", second_task_id=%" PRIu64 "\n", ompt_get_thread_data()->value, first_task_data->value, second_task_data->value);
+}
+
+static void
+on_ompt_callback_thread_begin(
+  ompt_thread_type_t thread_type,
+  ompt_data_t *thread_data)
+{
+  if(thread_data->ptr)
+    printf("%s\n", "0: thread_data initially not null");
+  thread_data->value = ompt_get_unique_id();
+  printf("%" PRIu64 ": ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, ompt_thread_type_t_values[thread_type], thread_type, thread_data->value);
+}
+
+static void
+on_ompt_callback_thread_end(
+  ompt_data_t *thread_data)
+{
+  printf("%" PRIu64 ": ompt_event_thread_end: thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, thread_data->value);
+}
+
+static int
+on_ompt_callback_control_tool(
+  uint64_t command,
+  uint64_t modifier,
+  void *arg,
+  const void *codeptr_ra)
+{
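+  // look up the innermost task's frame so its exit/reenter addresses can be printed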
+  ompt_frame_t* omptTaskFrame;
+  ompt_get_task_info(0, NULL, (ompt_data_t**) NULL, &omptTaskFrame, NULL, NULL);
+  printf("%" PRIu64 ": ompt_event_control_tool: command=%" PRIu64 ", modifier=%" PRIu64 ", arg=%p, codeptr_ra=%p, current_task_frame.exit=%p, current_task_frame.reenter=%p \n", ompt_get_thread_data()->value, command, modifier, arg, codeptr_ra, omptTaskFrame->exit_runtime_frame, omptTaskFrame->reenter_runtime_frame);
+  return 0; //success
+}
+
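+// helper macros: register a callback with ompt_set_callback and print a
+// warning if the runtime reports it will never be invoked (ompt_set_never)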
+#define register_callback_t(name, type)                       \
+do{                                                           \
+  type f_##name = &on_##name;                                 \
+  if (ompt_set_callback(name, (ompt_callback_t)f_##name) ==   \
+      ompt_set_never)                                         \
+    printf("0: Could not register callback '" #name "'\n");   \
+}while(0)
+
+#define register_callback(name) register_callback_t(name, name##_t)
+
+int ompt_initialize(
   ompt_function_lookup_t lookup,
-  const char *runtime_version,
-  unsigned int ompt_version)
+  ompt_fns_t* fns)
 {
-  ompt_set_callback_t ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
-  ompt_get_task_id = (ompt_get_task_id_t) lookup("ompt_get_task_id");
-  ompt_get_task_frame = (ompt_get_task_frame_t) lookup("ompt_get_task_frame");
-  ompt_get_thread_id = (ompt_get_thread_id_t) lookup("ompt_get_thread_id");
-  ompt_get_parallel_id = (ompt_get_parallel_id_t) lookup("ompt_get_parallel_id");
+  ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
+  ompt_get_task_info = (ompt_get_task_info_t) lookup("ompt_get_task_info");
+  ompt_get_thread_data = (ompt_get_thread_data_t) lookup("ompt_get_thread_data");
+  ompt_get_parallel_info = (ompt_get_parallel_info_t) lookup("ompt_get_parallel_info");
+  ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id");
 
-  ompt_set_callback(ompt_event_barrier_begin, (ompt_callback_t) &on_ompt_event_barrier_begin);
-  ompt_set_callback(ompt_event_barrier_end, (ompt_callback_t) &on_ompt_event_barrier_end);
-  ompt_set_callback(ompt_event_implicit_task_begin, (ompt_callback_t) &on_ompt_event_implicit_task_begin);
-  ompt_set_callback(ompt_event_implicit_task_end, (ompt_callback_t) &on_ompt_event_implicit_task_end);
-  ompt_set_callback(ompt_event_loop_begin, (ompt_callback_t) &on_ompt_event_loop_begin);
-  ompt_set_callback(ompt_event_loop_end, (ompt_callback_t) &on_ompt_event_loop_end);
-  ompt_set_callback(ompt_event_parallel_begin, (ompt_callback_t) &on_ompt_event_parallel_begin);
-  ompt_set_callback(ompt_event_parallel_end, (ompt_callback_t) &on_ompt_event_parallel_end);
-  printf("0: NULL_POINTER=%p\n", NULL);
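+  // additional OMPT entry points: places/affinity queries and state/mutex-implementation enumeration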
+  ompt_get_num_places = (ompt_get_num_places_t) lookup("ompt_get_num_places");
+  ompt_get_place_proc_ids = (ompt_get_place_proc_ids_t) lookup("ompt_get_place_proc_ids");
+  ompt_get_place_num = (ompt_get_place_num_t) lookup("ompt_get_place_num");
+  ompt_get_partition_place_nums = (ompt_get_partition_place_nums_t) lookup("ompt_get_partition_place_nums");
+  ompt_get_proc_id = (ompt_get_proc_id_t) lookup("ompt_get_proc_id");
+  ompt_enumerate_states = (ompt_enumerate_states_t) lookup("ompt_enumerate_states");
+  ompt_enumerate_mutex_impls = (ompt_enumerate_mutex_impls_t) lookup("ompt_enumerate_mutex_impls");
+
+  register_callback(ompt_callback_mutex_acquire);
+  register_callback_t(ompt_callback_mutex_acquired, ompt_callback_mutex_t);
+  register_callback_t(ompt_callback_mutex_released, ompt_callback_mutex_t);
+  register_callback(ompt_callback_nest_lock);
+  register_callback(ompt_callback_sync_region);
+  register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+  register_callback(ompt_callback_control_tool);
+  register_callback(ompt_callback_flush);
+  register_callback(ompt_callback_cancel);
+  register_callback(ompt_callback_idle);
+  register_callback(ompt_callback_implicit_task);
+  register_callback_t(ompt_callback_lock_init, ompt_callback_mutex_acquire_t);
+  register_callback_t(ompt_callback_lock_destroy, ompt_callback_mutex_t);
+  register_callback(ompt_callback_work);
+  register_callback(ompt_callback_master);
+  register_callback(ompt_callback_parallel_begin);
+  register_callback(ompt_callback_parallel_end);
+  register_callback(ompt_callback_task_create);
+  register_callback(ompt_callback_task_schedule);
+  register_callback(ompt_callback_task_dependences);
+  register_callback(ompt_callback_task_dependence);
+  register_callback(ompt_callback_thread_begin);
+  register_callback(ompt_callback_thread_end);
+  printf("0: NULL_POINTER=%p\n", (void*)NULL);
+  return 1; //success
+}
+
+void ompt_finalize(ompt_fns_t* fns)
+{
+  printf("0: ompt_event_runtime_shutdown\n");
 }
 
-ompt_initialize_t ompt_tool()
+ompt_fns_t* ompt_start_tool(
+  unsigned int omp_version,
+  const char *runtime_version)
 {
-  return &ompt_initialize;
+  static ompt_fns_t ompt_fns = {&ompt_initialize,&ompt_finalize};
+  return &ompt_fns;
 }
Index: runtime/test/ompt/cancel/cancel_parallel.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/cancel/cancel_parallel.c
@@ -0,0 +1,42 @@
+// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// Current GOMP interface implementation does not support cancellation
+// XFAIL: gcc
+
+#include "callback.h"
+#include "omp.h"
+
+int main()
+{
+  #pragma omp parallel num_threads(2)
+  {
+    if(omp_get_thread_num() == 0)
+    {
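+      // derive a fuzzy return address from the label: the CHECK line matches the cancel callback's codeptr_ra except for its last byte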
+      printf("%" PRIu64 ": fuzzy_address=0x%lx or 0x%lx\n", ompt_get_thread_data()->value, ((uint64_t)(char*)(&& ompt_label_1))/256-1, ((uint64_t)(char*)(&& ompt_label_1))/256);
+      #pragma omp cancel parallel
+      print_fuzzy_address(1); //does not actually print the address but provides a label
+    }
+    else
+    {
+      delay(100);
+      printf("%" PRIu64 ": fuzzy_address=0x%lx or 0x%lx\n", ompt_get_thread_data()->value, ((uint64_t)(char*)(&& ompt_label_2))/256-1, ((uint64_t)(char*)(&& ompt_label_2))/256);
+      #pragma omp cancellation point parallel
+      print_fuzzy_address(2); //does not actually print the address but provides a label
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_parallel|ompt_cancel_activated=17, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: fuzzy_address={{.*}}[[OTHER_RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_parallel|ompt_cancel_detected=33, codeptr_ra=[[OTHER_RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+  return 0;
+}
Index: runtime/test/ompt/cancel/cancel_taskgroup.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/cancel/cancel_taskgroup.c
@@ -0,0 +1,88 @@
+// RUN:  %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// Current GOMP interface implementation does not support cancellation
+// XFAIL: gcc
+
+#include "callback.h"
+#include <unistd.h>
+#include <stdio.h>
+
+int main()
+{
+  int condition=0;
+  #pragma omp parallel num_threads(2)
+  {}
+
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      #pragma omp taskgroup
+      {
+        #pragma omp task shared(condition)
+        {
+          printf("start execute task 1\n");
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+          #pragma omp cancellation point taskgroup
+          printf("end execute task 1\n");
+        }
+        #pragma omp task shared(condition)
+        {
+          printf("start execute task 2\n");
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+          #pragma omp cancellation point taskgroup
+          printf("end execute task 2\n");
+        }
+        #pragma omp task shared(condition)
+        {
+          printf("start execute task 3\n");
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+          #pragma omp cancellation point taskgroup
+          printf("end execute task 3\n");
+        }
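+        // undeferred task (if(0)): runs immediately and triggers the taskgroup cancellation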
+        #pragma omp task if(0) shared(condition)
+        {
+          printf("start execute task 4\n");
+          OMPT_WAIT(condition,1);
+          #pragma omp cancel taskgroup
+          printf("end execute task 4\n");
+        }
+        OMPT_SIGNAL(condition);
+      }
+    }
+    #pragma omp barrier
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[FIRST_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[SECOND_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[THIRD_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[CANCEL_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[PARENT_TASK_ID]], second_task_id=[[CANCEL_TASK_ID]], prior_task_status=ompt_task_others=4
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[CANCEL_TASK_ID]], flags=ompt_cancel_taskgroup|ompt_cancel_activated=24, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[CANCEL_TASK_ID]], second_task_id=[[PARENT_TASK_ID]], prior_task_status=ompt_task_cancel=3
+
+  // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_discarded_task=72, codeptr_ra=[[NULL]]
+  // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_discarded_task=72, codeptr_ra=[[NULL]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_detected=40, codeptr_ra={{0x[0-f]*}}
+
+  return 0;
+}
Index: runtime/test/ompt/cancel/cancel_worksharing.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/cancel/cancel_worksharing.c
@@ -0,0 +1,68 @@
+// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// Current GOMP interface implementation does not support cancellation
+// XFAIL: gcc
+
+
+#include "callback.h"
+#include <unistd.h>
+
+int main()
+{
+  int condition=0;
+  #pragma omp parallel num_threads(2)
+  {
+    int x = 0;
+    int i;
+    #pragma omp for
+    for(i = 0; i < 2; i++)
+    {
+      if(i == 0)
+      {
+        x++;
+        OMPT_SIGNAL(condition);
+        #pragma omp cancel for
+      }
+      else
+      {
+        x++;
+        OMPT_WAIT(condition,1);
+        delay(10000);
+        #pragma omp cancellation point for
+      }
+    }
+  }
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp sections
+    {
+      #pragma omp section
+      {
+        OMPT_SIGNAL(condition);
+        #pragma omp cancel sections
+      }
+      #pragma omp section
+      {
+        OMPT_WAIT(condition,2);
+        delay(10000);
+        #pragma omp cancellation point sections
+      }
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no
+
+  // cancel for and sections
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_do|ompt_cancel_activated=20, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_sections|ompt_cancel_activated=18, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[OTHER_THREAD_ID:[0-9]+]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_do|ompt_cancel_detected=36, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[OTHER_THREAD_ID:[0-9]+]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_sections|ompt_cancel_detected=34, codeptr_ra={{0x[0-f]*}}
+
+  return 0;
+}
Index: runtime/test/ompt/loadtool/tool_available.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/loadtool/tool_available.c
@@ -0,0 +1,59 @@
+// RUN: %libomp-compile -DCODE && %libomp-compile -DTOOL -o%T/tool.so -shared -fPIC && env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s
+// REQUIRES: ompt
+
+/*
+ *  This file contains both the code for an OMPT shared-library tool to be
+ *  loaded and the code for the OpenMP executable.
+ *  -DTOOL enables the code for the tool during compilation.
+ *  -DCODE enables the code for the executable during compilation.
+ *  The RUN line compiles the two binaries and then tries to load
+ *  the tool via the OMP_TOOL_LIBRARIES environment variable.
+ */
+
+#ifdef CODE
+#include "omp.h"
+
+int main()
+{
+  #pragma omp parallel num_threads(2)
+  {
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}0: ompt_event_runtime_shutdown
+
+  return 0;
+}
+
+#endif /* CODE */
+
+#ifdef TOOL
+
+#include <stdio.h>
+#include <ompt.h>
+
+int ompt_initialize(
+  ompt_function_lookup_t lookup,
+  ompt_fns_t* fns)
+{
+  printf("0: NULL_POINTER=%p\n", (void*)NULL);
+  return 1; //success
+}
+
+void ompt_finalize(ompt_fns_t* fns)
+{
+  printf("%d: ompt_event_runtime_shutdown\n", omp_get_thread_num());
+}
+
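+// ompt_start_tool is the entry point the runtime looks up when it loads the
+// library named in OMP_TOOL_LIBRARIES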
+ompt_fns_t* ompt_start_tool(
+  unsigned int omp_version,
+  const char *runtime_version)
+{
+  static ompt_fns_t ompt_fns = {&ompt_initialize,&ompt_finalize};
+  return &ompt_fns;
+}
+#endif /* TOOL */
Index: runtime/test/ompt/misc/api_calls.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/misc/api_calls.c
@@ -0,0 +1,66 @@
+// RUN: %libomp-compile && env OMP_PLACES=cores %libomp-run | FileCheck %s
+// REQUIRES: ompt, linux
+#include "callback.h"
+#include <omp.h>
+#define __USE_GNU
+#include <sched.h>
+#undef __USE_GNU
+
+void print_list(char* function_name, int list[])
+{
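+  // print the proc ids of place 0 as a comma-separated list so the omp_* and ompt_* results can be compared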
+  printf("%" PRIu64 ": %s(0)=(%d", ompt_get_thread_data()->value, function_name, list[0]);
+  int i;
+  for(i = 1; i < omp_get_place_num_procs(0); i++)
+  {
+    printf(",%d", list[i]);
+  }
+  printf(")\n");
+}
+
+int main()
+{
+  #pragma omp parallel num_threads(1)
+  {
+    printf("%" PRIu64 ": omp_get_num_places()=%d\n", ompt_get_thread_data()->value, omp_get_num_places());
+    printf("%" PRIu64 ": ompt_get_num_places()=%d\n", ompt_get_thread_data()->value, ompt_get_num_places());
+
+    int omp_ids[omp_get_place_num_procs(0)];
+    omp_get_place_proc_ids(0, omp_ids);
+    print_list("omp_get_place_proc_ids", omp_ids);
+    int ompt_ids[omp_get_place_num_procs(0)];
+    ompt_get_place_proc_ids(0, omp_get_place_num_procs(0), ompt_ids);
+    print_list("ompt_get_place_proc_ids", ompt_ids);
+
+    printf("%" PRIu64 ": omp_get_place_num()=%d\n", ompt_get_thread_data()->value, omp_get_place_num());
+    printf("%" PRIu64 ": ompt_get_place_num()=%d\n", ompt_get_thread_data()->value, ompt_get_place_num());
+
+    int omp_nums[omp_get_partition_num_places()];
+    omp_get_partition_place_nums(omp_nums);
+    print_list("omp_get_partition_place_nums", omp_nums);
+    int ompt_nums[omp_get_partition_num_places()];
+    ompt_get_partition_place_nums(omp_get_partition_num_places(), ompt_nums);
+    print_list("ompt_get_partition_place_nums", ompt_nums);
+
+    printf("%" PRIu64 ": sched_getcpu()=%d\n", ompt_get_thread_data()->value, sched_getcpu());
+    printf("%" PRIu64 ": ompt_get_proc_id()=%d\n", ompt_get_thread_data()->value, ompt_get_proc_id());
+  }
+
+  // Check if libomp supports the callbacks for this test.
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: omp_get_num_places()=[[NUM_PLACES:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_places()=[[NUM_PLACES]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: omp_get_place_proc_ids(0)=([[PROC_IDS:[0-9\,]+]])
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_proc_ids(0)=([[PROC_IDS]])
+
+  // CHECK: {{^}}[[MASTER_ID]]: omp_get_place_num()=[[PLACE_NUM:[-]?[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_num()=[[PLACE_NUM]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: sched_getcpu()=[[CPU_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_proc_id()=[[CPU_ID]]
+
+
+  return 0;
+}
Index: runtime/test/ompt/misc/control_tool.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/misc/control_tool.c
@@ -0,0 +1,27 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel num_threads(1)
+  {
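+    // record the current frame addresses; the control_tool callback should report the same exit/reenter frames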
+    print_frame(1);
+    print_frame(0);
+    omp_control_tool(omp_control_tool_flush, 1, NULL);
+    print_current_address(0);
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_control_tool'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(1)=[[EXIT_FRAME:0x[0-f]*]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER_FRAME:0x[0-f]*]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_control_tool: command=3, modifier=1, arg=[[NULL]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]*]], current_task_frame.exit=[[EXIT_FRAME]], current_task_frame.reenter=[[REENTER_FRAME]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/misc/control_tool_no_ompt_support.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/misc/control_tool_no_ompt_support.c
@@ -0,0 +1,12 @@
+// RUN: %libomp-compile-and-run
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel num_threads(1)
+  {
+    omp_control_tool(omp_control_tool_flush, 1, NULL);
+  }
+
+  return 0;
+}
Index: runtime/test/ompt/misc/idle.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/misc/idle.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+  #pragma omp parallel num_threads(3)
+  {
+    #pragma omp atomic
+    x++;
+  }
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp atomic
+    x++;
+  }
+
+
+  printf("x=%d\n", x);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_idle'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_idle_begin:
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_idle_end:
+
+  return 0;
+}
Index: runtime/test/ompt/ompt-signal.h
===================================================================
--- /dev/null
+++ runtime/test/ompt/ompt-signal.h
@@ -0,0 +1,31 @@
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay(t) Sleep(1); /* the delay argument is ignored on Windows */
+#else
+#include <unistd.h>
+#define delay(t) usleep(t);
+#endif
+
+// These functions provide a signal-wait mechanism to enforce the expected
+// scheduling for the test cases.
+// The condition variable (s) needs to be shared between the threads!
+// Initialize it to 0.
+
+#define OMPT_SIGNAL(s) ompt_signal(&s)
+//inline
+void ompt_signal(int* s)
+{
+  #pragma omp atomic
+  (*s)++;
+}
+
+#define OMPT_WAIT(s,v) ompt_wait(&s,v)
+// wait for s >= v
+//inline
+void ompt_wait(int *s, int v)
+{
+  int wait=0;
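+  // busy-wait with a short delay, atomically re-reading *s until it reaches v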
+  do {
+    delay(10);
+    #pragma omp atomic read
+    wait = (*s);
+  } while(wait < v);
+}
Index: runtime/test/ompt/parallel/dynamic_enough_threads.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/parallel/dynamic_enough_threads.c
@@ -0,0 +1,43 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+  omp_set_dynamic(1);
+
+  #pragma omp parallel num_threads(4)
+  {
+    print_ids(0);
+    print_ids(1);
+  }
+  print_fuzzy_address(1);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+  // a team size between 1 and 4 is expected
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[1-4]}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/parallel/dynamic_not_enough_threads.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/parallel/dynamic_not_enough_threads.c
@@ -0,0 +1,43 @@
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+  omp_set_dynamic(1);
+
+  #pragma omp parallel num_threads(4)
+  {
+    print_ids(0);
+    print_ids(1);
+  }
+  print_fuzzy_address(1);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+  // a team size between 1 and 4 is expected
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[1-4]}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/parallel/max_active_levels_serialized.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/parallel/max_active_levels_serialized.c
@@ -0,0 +1,72 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_set_nested(1);
+  omp_set_max_active_levels(1);
+
+  #pragma omp parallel num_threads(2)
+  {
+    print_ids(0);
+    print_ids(1);
+    #pragma omp parallel num_threads(2)
+    {
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+    }
+    print_fuzzy_address(1);
+  }
+  print_fuzzy_address(2);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
Index: runtime/test/ompt/parallel/nested.c
===================================================================
--- runtime/test/ompt/parallel/nested.c
+++ runtime/test/ompt/parallel/nested.c
@@ -3,9 +3,11 @@
 // REQUIRES: ompt
 #include "callback.h"
 #include <omp.h>
+#include <unistd.h>
 
 int main()
 {
+  int condition=0;
   omp_set_nested(1);
   print_frame(0);
 
@@ -15,6 +17,10 @@
     print_ids(0);
     print_ids(1);
     print_frame(0);
+
+    // get all implicit-task events before starting the nested parallel region:
+    #pragma omp barrier
+
     #pragma omp parallel num_threads(4)
     {
       print_frame(1);
@@ -22,17 +28,38 @@
       print_ids(1);
       print_ids(2);
       print_frame(0);
+      OMPT_SIGNAL(condition);
+      OMPT_WAIT(condition,16);
       #pragma omp barrier
+      print_fuzzy_address(1);
       print_ids(0);
     }
+    print_fuzzy_address(2);
     print_ids(0);
   }
+  print_fuzzy_address(3);
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
 
   // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
-  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
 
@@ -46,219 +73,224 @@
   // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
 
   // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
 
 
   // THREADS: {{^}}0: NULL_POINTER=[[NULL:.*$]]
   // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // nested parallel masters
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
   // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
   // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION:0x[0-f]+]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
   // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(1)=[[NESTED_EXIT:0x[0-f]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
   // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[NESTED_REENTER:0x[0-f]+]]
   // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
   // explicit barrier
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NESTED_REENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NESTED_REENTER]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
   // implicit barrier
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
   // implicit barrier
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // nested parallel worker threads
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   return 0;
 }
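
The "task level N: parallel_id=..., task_id=..." lines matched by the THREADS checks above are produced by the print_ids helper from the included callback.h, which queries the task ancestry through the OMPT inquiry functions. As a hedged illustration only (not the actual helper), a walk over the ancestor levels could look like the sketch below, assuming the OpenMP 5.0 omp-tools.h declarations (the header in this patch series may still be named ompt.h) and that the inquiry entry points are function pointers filled in from the lookup function passed to the tool's ompt_initialize:

#include <stdio.h>
#include <inttypes.h>
#include <omp-tools.h>

/* Hypothetical sketch, not the callback.h implementation: these pointers are
 * assumed to be set from the ompt_function_lookup_t argument of
 * ompt_initialize (omitted here). */
static ompt_get_task_info_t ompt_get_task_info;
static ompt_get_thread_data_t ompt_get_thread_data;

static void print_task_levels(void) {
  for (int level = 0;; level++) {
    int flags, thread_num;
    ompt_data_t *task_data, *parallel_data;
    ompt_frame_t *frame;
    /* ompt_get_task_info returns 0 once no task exists at this ancestor
     * level, which bounds the walk. */
    if (!ompt_get_task_info(level, &flags, &task_data, &frame, &parallel_data,
                            &thread_num))
      break;
    ompt_data_t *td = ompt_get_thread_data();
    printf("%" PRIu64 ": task level %d: parallel_id=%" PRIu64
           ", task_id=%" PRIu64 "\n",
           td ? td->value : 0, level,
           parallel_data ? parallel_data->value : 0,
           task_data ? task_data->value : 0);
  }
}

Each iteration prints one ancestry level in the format the checks match; the innermost nested region therefore reports levels 0 through 2 (or 3 in the doubly nested test below), with the outermost level carrying the implicit parallel id and the initial task id.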
Index: runtime/test/ompt/parallel/nested_lwt.c
===================================================================
--- runtime/test/ompt/parallel/nested_lwt.c
+++ runtime/test/ompt/parallel/nested_lwt.c
@@ -3,35 +3,59 @@
 // REQUIRES: ompt
 #include "callback.h"
 #include <omp.h>
+#include <unistd.h>
 
 
 int main()
 {
   omp_set_nested(1);
+  int condition = 0;
 
   #pragma omp parallel num_threads(4)
   {
     print_ids(0);
     print_ids(1);
+    // get all implicit task events before entering the nested parallel region:
+    #pragma omp barrier
     #pragma omp parallel num_threads(1)
     {
       print_ids(0);
       print_ids(1);
       print_ids(2);
+      // get all implicit task events before entering the nested parallel region:
+      #pragma omp barrier
       #pragma omp parallel num_threads(4)
       {
         print_ids(0);
         print_ids(1);
         print_ids(2);
         print_ids(3);
+        OMPT_SIGNAL(condition);
+        OMPT_WAIT(condition,16);
       }
+      print_fuzzy_address(1);
     }
+    print_fuzzy_address(2);
   }
+  print_fuzzy_address(3);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
 
   // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
-  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
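
The new barriers and the OMPT_SIGNAL(condition)/OMPT_WAIT(condition,16) pair address exactly this: the barriers flush the implicit-task events of the enclosing teams before the nested regions start, and the signal/wait is presumably intended to keep all 16 innermost threads (4 outer threads, each serializing through a 1-thread team that spawns a 4-thread team) inside the innermost region until every one of them has arrived, so the per-thread event sequences become checkable. Because the ordering of barrier_end/implicit_task_end against parallel_end still cannot be pinned down, those checks now accept any numeric parallel_id. As a hedged sketch only (the real macros are defined in callback.h and may differ), a counter-based signal/wait consistent with the newly added #include <unistd.h> could look like:

#include <unistd.h>

/* Increment the shared counter atomically: one signal per arriving thread. */
static void ompt_signal(int *s) {
#pragma omp atomic
  (*s)++;
}

/* Poll until the counter has reached the expected number of signals. */
static void ompt_wait(int *s, int v) {
  int current;
  do {
    usleep(10); /* back off so waiting threads do not spin at full speed */
#pragma omp atomic read
    current = *s;
  } while (current < v);
}

#define OMPT_SIGNAL(s) ompt_signal(&(s))
#define OMPT_WAIT(s, v) ompt_wait(&(s), (v))

This is why the test declares and initializes the shared counter (int condition = 0) before the outer parallel region: every innermost thread increments it once and then waits for it to reach 16.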
 
@@ -48,251 +72,261 @@
 
 
   // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
-  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // nested parallel masters
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
 
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, parallel_function=[[NESTED_PARALLEL_FUNCTION:0x[0-f]+]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
 
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_NESTED_PARALLEL_FUNCTION:0x[0-f]+]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // nested parallel worker threads
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
   // can't reliably tell which parallel region is the parent...
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
-  // THREADS: {{^}}[[THREAD_ID]]: level 3: parallel_id=0, task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
 
   return 0;
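
  For reference, the "task level N" lines these tests check are printed when the test calls print_ids() from callback.h. A minimal sketch of such a helper, assuming the OpenMP 5.0 omp-tools.h header and an ompt_get_task_info entry point captured at tool initialization (my_get_task_info and print_task_levels are illustrative names, not the actual callback.h code, which also prefixes the thread id and prints the task frames):

  #include <inttypes.h>
  #include <stdio.h>
  #include <omp-tools.h> // assumption: 5.0 header name; earlier drafts shipped ompt.h

  static ompt_get_task_info_t my_get_task_info; // looked up in the tool initializer

  static void print_task_levels(void) {
    for (int level = 0; ; ++level) {
      int flags = 0, thread_num = 0;
      ompt_data_t *task_data = NULL, *parallel_data = NULL;
      ompt_frame_t *frame = NULL;
      if (!my_get_task_info(level, &flags, &task_data, &frame,
                            &parallel_data, &thread_num))
        break; // no ancestor task at this level
      printf("task level %d: parallel_id=%" PRIu64 ", task_id=%" PRIu64 "\n",
             level,
             parallel_data ? parallel_data->value : 0,
             task_data ? task_data->value : 0);
    }
  }

  Because every ancestor level reports the id of its enclosing parallel region, the outermost (initial) task prints a real id as well, which is why the checks above capture [[IMPLICIT_PARALLEL_ID]] instead of hard-coding parallel_id=0.
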
Index: runtime/test/ompt/parallel/nested_serialized.c
===================================================================
--- runtime/test/ompt/parallel/nested_serialized.c
+++ runtime/test/ompt/parallel/nested_serialized.c
@@ -18,13 +18,29 @@
       print_ids(1);
       print_ids(2);
     }
+    print_fuzzy_address(1);
   }
+  print_fuzzy_address(2);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
 
   // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
-  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
 
@@ -41,67 +57,71 @@
 
 
   // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
-  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION:0x[0-f]+]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
   // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=[[NESTED_PARALLEL_FUNCTION]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 2: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   return 0;
 }
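
  The fuzzy_address checks added in these tests work together with the way the parallel_begin lines capture codeptr_ra: a pattern such as [[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}} stores the runtime's return address minus its last two hex digits, and print_fuzzy_address() must later emit an address that shares that prefix. One possible shape for such a helper (an assumption for illustration, not the actual callback.h code; the real helper also prefixes the line with the current thread id, as every checked line does):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  static void print_fuzzy_address_sketch(const void *addr) {
    // Mask the low byte so the printed value shares its leading hex digits
    // with any return address in the same 256-byte block.
    uintptr_t block = (uintptr_t)addr & ~(uintptr_t)0xff;
    // Also print the neighbouring block in case the call straddles a block
    // boundary; FileCheck's fuzzy_address={{.*}}[[RETURN_ADDRESS]] accepts
    // the prefix appearing anywhere in the line.
    printf("fuzzy_address=0x%" PRIxPTR " or 0x%" PRIxPTR "\n",
           block, block + 0x100);
  }
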
Index: runtime/test/ompt/parallel/no_thread_num_clause.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/parallel/no_thread_num_clause.c
@@ -0,0 +1,95 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+  omp_set_num_threads(4);
+  #pragma omp parallel
+  {
+    print_ids(0);
+    print_ids(1);
+  }
+  print_fuzzy_address(1);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+
+
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=0, parent_task_frame.exit=(nil), parent_task_frame.reenter=(nil), new_task_id=281474976710658, codeptr_ra=(nil), task_type=ompt_task_initial=1, has_dependences=no
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{[0-9]+}}
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
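
  The CHECK-NOT lines at the top of this new test rely on the tool printing a diagnostic whenever a callback cannot be registered. A sketch of a tool entry point that behaves this way, written against the OpenMP 5.0 omp-tools.h interface (the header name, prototypes, and the names my_initialize/my_finalize/on_thread_begin are assumptions; callback.h may register callbacks differently):

  #include <stdio.h>
  #include <omp-tools.h> // assumption: 5.0 header name

  static void on_thread_begin(ompt_thread_t thread_type,
                              ompt_data_t *thread_data) {
    // ompt_thread_initial == 1 and ompt_thread_worker == 2, matching the
    // thread_type values checked in the THREADS lines above.
    (void)thread_type;
    (void)thread_data;
  }

  static int my_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
    (void)initial_device_num;
    (void)tool_data;
    ompt_set_callback_t set_cb =
        (ompt_set_callback_t)lookup("ompt_set_callback");
    if (set_cb(ompt_callback_thread_begin,
               (ompt_callback_t)on_thread_begin) != ompt_set_always)
      printf("0: Could not register callback 'ompt_callback_thread_begin'\n");
    return 1; // non-zero keeps the tool active
  }

  static void my_finalize(ompt_data_t *tool_data) { (void)tool_data; }

  ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                            const char *runtime_version) {
    (void)omp_version;
    (void)runtime_version;
    static ompt_start_tool_result_t result = {&my_initialize, &my_finalize, {0}};
    return &result;
  }
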
Index: runtime/test/ompt/parallel/normal.c
===================================================================
--- runtime/test/ompt/parallel/normal.c
+++ runtime/test/ompt/parallel/normal.c
@@ -10,12 +10,27 @@
     print_ids(0);
     print_ids(1);
   }
+  print_fuzzy_address(1);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
 
   // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
-  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
 
@@ -28,43 +43,48 @@
   // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
   // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
 
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
 
 
   // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
-  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker={{.*}}
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{.*}}
 
   // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
-  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
-  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
-  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
   // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
   // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   return 0;
 }
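
  The pair of RUN lines used throughout these tests drives the same binary twice: once checking the raw, interleaved output with the default CHECK prefix (where ordering across threads is only constrained via CHECK-DAG), and once piping the output through %sort-threads so that each thread's lines become contiguous and the stricter THREADS prefix can verify per-thread ordering. A stripped-down illustration of the pattern (not one of the tests added by this patch; the region body and thread count are chosen only for the example):

  // RUN: %libomp-compile-and-run | FileCheck %s
  // RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
  // REQUIRES: ompt
  #include "callback.h"

  int main()
  {
    #pragma omp parallel num_threads(2)
    {
      print_ids(0);
    }
    // Interleaved stream: only claim that both implicit tasks begin somewhere
    // after the parallel_begin line, in either order (CHECK-DAG matches must
    // not overlap, so two identical directives need two distinct lines).
    // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: {{.*}}parallel_id=[[PARALLEL_ID:[0-9]+]]
    // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]]
    // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]]

    // Sorted stream: within one thread the order is deterministic, so plain
    // THREADS checks can reuse the captured ids line after line.
    // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
    // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
    return 0;
  }
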
Index: runtime/test/ompt/parallel/not_enough_threads.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/parallel/not_enough_threads.c
@@ -0,0 +1,76 @@
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | FileCheck %s
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+  #pragma omp parallel num_threads(4)
+  {
+    print_ids(0);
+    print_ids(1);
+  }
+  print_fuzzy_address(1);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+
+
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{[0-9]+}}
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
Index: runtime/test/ompt/parallel/parallel_if0.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/parallel/parallel_if0.c
@@ -0,0 +1,75 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+//  print_frame(0);
+  #pragma omp parallel if(0)
+  {
+//    print_frame(1);
+    print_ids(0);
+    print_ids(1);
+//    print_frame(0);
+    #pragma omp parallel if(0)
+    {
+//      print_frame(1);
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+//      print_frame(0);
+      #pragma omp task
+      {
+//        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+        print_ids(3);
+      }
+    }
+    print_fuzzy_address(1);
+  }
+  print_fuzzy_address(2);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_end'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[NESTED_IMPLICIT_TASK_ID]], second_task_id=[[EXPLICIT_TASK_ID]], prior_task_status=ompt_task_others=4
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[EXPLICIT_TASK_ID]], second_task_id=[[NESTED_IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[EXPLICIT_TASK_ID]]
+
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/parallel/serialized.c
===================================================================
--- runtime/test/ompt/parallel/serialized.c
+++ runtime/test/ompt/parallel/serialized.c
@@ -4,21 +4,73 @@
 
 int main()
 {
+//  print_frame(0);
   #pragma omp parallel num_threads(1)
   {
+//    print_frame(1);
     print_ids(0);
     print_ids(1);
+//    print_frame(0);
+    #pragma omp parallel num_threads(1)
+    {
+//      print_frame(1);
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+//      print_frame(0);
+      #pragma omp task
+      {
+//        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+        print_ids(3);
+      }
+    }
+    print_fuzzy_address(1);
   }
+  print_fuzzy_address(2);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_event_implicit_task_end'
 
   // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
-  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, parallel_function=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:.+]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[OUTER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
 
   // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK: {{^}}[[MASTER_ID]]: level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // CHECK: {{^}}[[MASTER_ID]]: level 1: parallel_id=0, task_id=[[PARENT_TASK_ID]]
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[INNER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[NESTED_IMPLICIT_TASK_ID]], second_task_id=[[EXPLICIT_TASK_ID]], prior_task_status=ompt_task_others=4
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[EXPLICIT_TASK_ID]], second_task_id=[[NESTED_IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[EXPLICIT_TASK_ID]]
+
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[INNER_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[INNER_RETURN_ADDRESS]]
 
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[OUTER_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[OUTER_RETURN_ADDRESS]]
 
   return 0;
 }
Index: runtime/test/ompt/synchronization/barrier/explicit.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/barrier/explicit.c
@@ -0,0 +1,57 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp atomic
+    x++;
+
+    #pragma omp barrier
+    print_current_address();
+
+    #pragma omp atomic
+    x++;
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread explicit barrier 
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+
+
+  // worker thread explicit barrier 
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[THREAD_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/barrier/for_loop.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/barrier/for_loop.c
@@ -0,0 +1,55 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int y[] = {0,1,2,3};
+
+  #pragma omp parallel num_threads(2)
+  {
+    //implicit barrier at end of for loop
+    int i;
+    #pragma omp for
+    for (i = 0; i < 4; i++)
+    {
+      y[i]++;
+    }
+    print_current_address();
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at loop end 
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+
+  // worker thread implicit barrier at loop end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // worker thread implicit barrier after parallel
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/barrier/for_simd.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/barrier/for_simd.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int y[] = {0,1,2,3};
+
+  int i;
+  #pragma omp for simd
+  for (i = 0; i < 4; i++)
+  {
+    y[i]++;
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at simd loop end 
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/barrier/parallel_region.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/barrier/parallel_region.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+
+  //implicit barrier at end of a parallel region
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp atomic
+    x++;
+  }
+  print_fuzzy_address();
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/barrier/sections.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/barrier/sections.c
@@ -0,0 +1,63 @@
+// RUN: %libomp-compile-and-run | %sort-threads  | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+
+  #pragma omp parallel num_threads(2)
+  {
+    //no nowait clause on the sections construct, so the threads
+    //synchronize at the implicit barrier at the end of sections
+    #pragma omp sections
+    {
+      #pragma omp section 
+      {
+        #pragma omp atomic
+        x++;
+      }
+      
+      #pragma omp section 
+      {
+        #pragma omp atomic
+        x++;
+      }
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at sections end
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+
+  // worker thread implicit barrier at sections end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/barrier/single.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/barrier/single.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+
+  #pragma omp parallel num_threads(2)
+  {
+    //implicit barrier at end of single
+    #pragma omp single
+    {
+      x++;
+    }
+    print_fuzzy_address();
+    //critical section to prevent the two barriers from being merged into one
+    #pragma omp critical
+    {
+      x++;
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at single end
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+
+  // worker thread implicit barrier at single end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/critical.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/critical.c
@@ -0,0 +1,31 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp critical
+  {
+    print_current_address(1);
+    print_ids(0);
+  }
+  print_current_address(2);
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_critical: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_critical: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_critical: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/flush.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/flush.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// GCC generates code that does not call the runtime for the flush construct
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel num_threads(2)
+  {
+    int tid = omp_get_thread_num();
+    
+    #pragma omp flush
+    print_current_address(1);
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_flush'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_flush: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: current_address=[[RETURN_ADDRESS]]
+  //
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_flush: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: current_address=[[RETURN_ADDRESS]]
+
+
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/lock.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/lock.c
@@ -0,0 +1,44 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  //need to use an OpenMP construct so that OMPT will be initialized
+  #pragma omp parallel num_threads(1)
+    print_ids(0);
+
+  omp_lock_t lock;
+  printf("%" PRIu64 ": &lock: %lli\n", ompt_get_thread_data()->value, (long long) &lock);
+  omp_init_lock(&lock);
+  print_current_address(1);
+  omp_set_lock(&lock);
+  print_current_address(2);
+  omp_unset_lock(&lock);
+  print_current_address(3);
+  omp_destroy_lock(&lock);
+  print_current_address(4);
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: &lock: [[WAIT_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_init_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+ 
+  return 0;
+}
Index: runtime/test/ompt/synchronization/master.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/master.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// GCC generates code that does not call the runtime for the master construct
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      print_fuzzy_address(1);
+      x++;
+    }
+    print_current_address(2);
+  }
+
+  printf("%" PRIu64 ": x=%d\n", ompt_get_thread_data()->value, x);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_master_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS_END:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: current_address=[[RETURN_ADDRESS_END]]
+
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/nest_lock.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/nest_lock.c
@@ -0,0 +1,52 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  //need to use an OpenMP construct so that OMPT will be initialized
+  #pragma omp parallel num_threads(1)
+    print_ids(0);
+
+  omp_nest_lock_t nest_lock;
+  printf("%" PRIu64 ": &nest_lock: %lli\n", ompt_get_thread_data()->value, (long long) &nest_lock);
+  omp_init_nest_lock(&nest_lock);
+  print_current_address(1);
+  omp_set_nest_lock(&nest_lock);
+  print_current_address(2);
+  omp_set_nest_lock(&nest_lock);
+  print_current_address(3);
+  omp_unset_nest_lock(&nest_lock);
+  print_current_address(4);
+  omp_unset_nest_lock(&nest_lock);
+  print_current_address(5);
+  omp_destroy_nest_lock(&nest_lock);
+  print_current_address(6);
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_nest_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/ordered.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/ordered.c
@@ -0,0 +1,31 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp ordered
+  {
+    print_current_address(1);
+    print_ids(0);
+  }
+  print_current_address(2);
+  
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_ordered: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_ordered: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_ordered: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  
+  return 0;
+}
Index: runtime/test/ompt/synchronization/taskgroup.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/taskgroup.c
@@ -0,0 +1,48 @@
+// RUN:  %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <unistd.h>  
+#include <stdio.h>
+
+int main()
+{
+  int condition=0;
+  int x=0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      #pragma omp taskgroup
+      {
+        print_current_address(1);
+        #pragma omp task
+        {
+          #pragma omp atomic
+          x++;
+        }
+      }
+      print_current_address(2);
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskgroup_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/taskwait.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/taskwait.c
@@ -0,0 +1,35 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      #pragma omp task
+      {
+        x++;
+      }
+      #pragma omp taskwait
+      print_current_address(1);
+    }
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskwait_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskwait_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskwait_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: ompt_event_taskwait_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/test_lock.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/test_lock.c
@@ -0,0 +1,54 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_lock_t lock;
+  omp_init_lock(&lock);
+  print_current_address(1);
+
+  omp_test_lock(&lock);
+  print_current_address(2);
+  omp_unset_lock(&lock);
+  print_current_address(3);
+
+  omp_set_lock(&lock);
+  print_current_address(4);
+  omp_test_lock(&lock);
+  print_current_address(5);
+  omp_unset_lock(&lock);
+  print_current_address(6);
+
+  omp_destroy_lock(&lock);
+  print_current_address(7);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] 
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]  
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]  
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]  
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]  
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]  
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]  
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/test_nest_lock.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/test_nest_lock.c
@@ -0,0 +1,42 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_nest_lock_t nest_lock;
+  omp_init_nest_lock(&nest_lock);
+
+  omp_test_nest_lock(&nest_lock);
+  omp_unset_nest_lock(&nest_lock);
+
+  omp_set_nest_lock(&nest_lock);
+  omp_test_nest_lock(&nest_lock);
+  omp_unset_nest_lock(&nest_lock);
+  omp_unset_nest_lock(&nest_lock);
+
+  omp_destroy_nest_lock(&nest_lock);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+
+  return 0;
+}
Index: runtime/test/ompt/synchronization/test_nest_lock_parallel.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/synchronization/test_nest_lock_parallel.c
@@ -0,0 +1,59 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_nest_lock_t nest_lock;
+  omp_init_nest_lock(&nest_lock);
+
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      omp_set_nest_lock(&nest_lock);
+      print_current_address(1);
+    }
+    #pragma omp barrier
+    omp_test_nest_lock(&nest_lock); //should fail for non-master
+    print_current_address(2);
+    #pragma omp barrier
+    #pragma omp master
+    {
+      omp_unset_nest_lock(&nest_lock);
+      print_current_address(3);
+      omp_unset_nest_lock(&nest_lock);
+      print_current_address(4);
+    }
+  }
+
+  omp_destroy_nest_lock(&nest_lock);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] 
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]] 
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] 
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]] 
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] 
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] 
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_nest_lock: wait_id=[[WAIT_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]] 
+  // CHECK-NOT: {{^}}[[THREAD_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]]
+  // CHECK-NEXT: {{^}}[[THREAD_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
Index: runtime/test/ompt/tasks/dependences.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/dependences.c
@@ -0,0 +1,53 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>   
+#include <math.h>
+#include <unistd.h>
+
+int main()
+{
+  int x = 0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {  
+      #pragma omp task depend(out:x)
+      {
+        x++;
+        delay(100);
+      }
+      print_fuzzy_address(1);
+    
+      #pragma omp task depend(in:x)
+      {
+        x = -1;
+      }
+    }
+  }
+
+  x++;
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependences'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependence'
+  
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+
+  // CHECK: {{^}}{{[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter=[[NULL]], new_task_id=[[FIRST_TASK:[0-f]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, task_type=ompt_task_explicit=4, has_dependences=yes
+  // CHECK: {{^}}{{[0-9]+}}: ompt_event_task_dependences: task_id=[[FIRST_TASK]], deps={{0x[0-f]+}}, ndeps=1
+  // CHECK: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // CHECK: {{^}}{{[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter=[[NULL]], new_task_id=[[SECOND_TASK:[0-f]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=yes
+  // CHECK: {{^}}{{[0-9]+}}: ompt_event_task_dependences: task_id=[[SECOND_TASK]], deps={{0x[0-f]+}}, ndeps=1
+  // CHECK: {{^}}{{[0-9]+}}: ompt_event_task_dependence_pair: first_task_id=[[FIRST_TASK]], second_task_id=[[SECOND_TASK]]
+
+
+  return 0;
+}
Index: runtime/test/ompt/tasks/explicit_task.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/explicit_task.c
@@ -0,0 +1,100 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h> 
+
+int main()
+{
+  int condition=0;
+  omp_set_nested(0);
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    print_frame(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+    #pragma omp master
+    {
+      print_ids(0);
+      #pragma omp task shared(condition)
+      {
+        OMPT_SIGNAL(condition);
+        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      print_fuzzy_address(1);
+      OMPT_WAIT(condition,1);
+      print_ids(0);
+    }
+    #pragma omp barrier
+    print_ids(0);
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+  
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+  // nested parallel masters
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // <- ompt_event_task_create would be expected here
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // explicit barrier after master
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // this is expected to happen earlier and on the MASTER thread:
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+
+  return 0;
+}
Index: runtime/test/ompt/tasks/serialized.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/serialized.c
@@ -0,0 +1,93 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>   
+#include <math.h>
+
+int main()
+{
+  omp_set_nested(0);
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    print_frame(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+    #pragma omp master
+    {
+      print_ids(0);
+      int t = (int)sin(0.1);
+      #pragma omp task if(t)
+      {
+        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      print_fuzzy_address(1);
+      print_ids(0);
+    }
+    print_ids(0);
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+  
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+  
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id={{[0-9]+}}, codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // nested parallel masters
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // <- ompt_event_task_schedule ([[IMPLICIT_TASK_ID]], [[TASK_ID]]) would be expected here
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // <- ompt_event_task_schedule ([[TASK_ID]], [[IMPLICIT_TASK_ID]]) would be expected here
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reen
+
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+
+  return 0;
+}
Index: runtime/test/ompt/tasks/task_in_joinbarrier.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/task_in_joinbarrier.c
@@ -0,0 +1,90 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h> 
+
+int main()
+{
+  int condition=0;
+  omp_set_nested(0);
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    print_frame(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+    #pragma omp master
+    {
+      print_ids(0);
+      #pragma omp task shared(condition)
+      {
+        OMPT_SIGNAL(condition);
+        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      OMPT_WAIT(condition,1);
+      print_ids(0);
+    }
+    print_ids(0);
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+  // nested parallel masters
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // <- ompt_event_task_create would be expected here
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+
+  return 0;
+}
Index: runtime/test/ompt/tasks/task_types.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/task_types.c
@@ -0,0 +1,112 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+#include <math.h>
+
+
+void print_task_type(int id)
+{
+  #pragma omp critical
+  {
+    int task_type;
+    char buffer[2048];
+    ompt_get_task_info(0, &task_type, NULL, NULL, NULL, NULL);
+    format_task_type(task_type, buffer);
+    printf("%" PRIu64 ": id=%d task_type=%s=%d\n", ompt_get_thread_data()->value, id, buffer, task_type);
+  }
+}
+
+int main()
+{
+  //initial task
+  print_task_type(0);
+
+  int x = 0;
+  //implicit task
+  #pragma omp parallel num_threads(1)
+  {
+    print_task_type(1);
+    x++;
+  }
+
+  #pragma omp parallel num_threads(2)
+  #pragma omp master
+  {
+    //explicit task
+    #pragma omp task
+    {
+      print_task_type(2);
+      x++;
+    }
+
+    //explicit task, undeferred because of if(0)
+    #pragma omp task if(0)
+    {
+      print_task_type(3);
+      x++;
+    }
+
+    //explicit task with untied
+    #pragma omp task untied
+    {
+      print_task_type(4);
+      x++;
+    }
+
+    //explicit task with final
+    #pragma omp task final(1)
+    {
+      print_task_type(5);
+      x++;
+      //nested explicit task; final and undeferred because it is nested in a final task
+      #pragma omp task
+      {
+        print_task_type(6);
+        x++;
+      }
+    }
+
+    //Mergeable task test deactivated for now
+    //explicit task with mergeable
+    /*
+    #pragma omp task mergeable if((int)sin(0))
+    {
+      print_task_type(7);
+      x++;
+    }
+    */
+
+    //TODO: merged task
+  }
+
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=0, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id={{[0-9]+}}, codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK: {{^}}[[MASTER_ID]]: id=0 task_type=ompt_task_initial=1
+  // CHECK: {{^}}[[MASTER_ID]]: id=1 task_type=ompt_task_implicit|ompt_task_undeferred=134217730
+
+  // CHECK-DAG: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // CHECK-DAG: {{^[0-9]+}}: id=2 task_type=ompt_task_explicit=4
+
+  // CHECK-DAG: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // CHECK-DAG: {{^[0-9]+}}: id=3 task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+
+  // CHECK-DAG: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_untied=268435460, has_dependences=no
+  // CHECK-DAG: {{^[0-9]+}}: id=4 task_type=ompt_task_explicit|ompt_task_untied=268435460
+
+  // CHECK-DAG: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_final=536870916, has_dependences=no
+  // CHECK-DAG: {{^[0-9]+}}: id=5 task_type=ompt_task_explicit|ompt_task_final=536870916
+
+  // CHECK-DAG: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644, has_dependences=no
+  // CHECK-DAG: {{^[0-9]+}}: id=6 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644
+
+  return 0;
+}
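
The task_type values matched above are bit-wise combinations of single flags; a sketch that reconstructs them from the numbers printed by the test itself (the enum below is a local illustration derived from those CHECK lines, not the definitions in the patched headers):

#include <stdio.h>

/* Flag values recovered from the CHECK lines above (illustration only). */
enum {
  sketch_task_initial    = 0x00000001, /* ompt_task_initial = 1 */
  sketch_task_implicit   = 0x00000002, /* ompt_task_implicit = 2 */
  sketch_task_explicit   = 0x00000004, /* ompt_task_explicit = 4 */
  sketch_task_undeferred = 0x08000000, /* 134217728 */
  sketch_task_untied     = 0x10000000, /* 268435456 */
  sketch_task_final      = 0x20000000  /* 536870912 */
};

int main(void) {
  /* id=4 above: explicit | untied = 268435460 */
  printf("%d\n", sketch_task_explicit | sketch_task_untied);
  /* id=6 above: explicit | undeferred | final = 671088644 */
  printf("%d\n", sketch_task_explicit | sketch_task_undeferred | sketch_task_final);
  return 0;
}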
Index: runtime/test/ompt/tasks/task_types_serialized.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/task_types_serialized.c
@@ -0,0 +1,112 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>
+
+void print_task_type(int id)
+{
+  #pragma omp critical
+  {
+    int task_type;
+    char buffer[2048];
+    ompt_get_task_info(0, &task_type, NULL, NULL, NULL, NULL);
+    format_task_type(task_type, buffer);
+    printf("%" PRIu64 ": id=%d task_type=%s=%d\n", ompt_get_thread_data()->value, id, buffer, task_type);
+  }
+}
+
+int main()
+{
+  //initial task
+  print_task_type(0);
+
+  int x = 0;
+  //implicit task
+  #pragma omp parallel num_threads(1)
+  {
+    print_task_type(1);
+    x++;
+  }
+
+  #pragma omp parallel num_threads(1)
+  #pragma omp master
+  {
+    //explicit task
+    #pragma omp task
+    {
+      print_task_type(2);
+      x++;
+    }
+
+    //explicit task, undeferred because of if(0)
+    #pragma omp task if(0)
+    {
+      print_task_type(3);
+      x++;
+    }
+
+    //explicit task with untied
+    #pragma omp task untied
+    {
+      print_task_type(4);
+      x++;
+    }
+
+    //explicit task with final
+    #pragma omp task final(1)
+    {
+      print_task_type(5);
+      x++;
+      //nested explicit task; final and undeferred because it is nested in a final task
+      #pragma omp task
+      {
+        print_task_type(6);
+        x++;
+      }
+    }
+
+/*
+    //TODO: not working
+    //explicit task with mergeable
+    #pragma omp task mergeable
+    {
+      print_task_type(7);
+      x++;
+    }
+*/
+
+    //TODO: merged task
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_task_create: parent_task_id=0, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[NULL]], new_task_id={{[0-9]+}}, codeptr_ra=[[NULL]], task_type=ompt_task_initial=1, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: id=0 task_type=ompt_task_initial=1
+  // CHECK: {{^}}[[MASTER_ID]]: id=1 task_type=ompt_task_implicit|ompt_task_undeferred=134217730
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=2 task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=3 task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_untied=402653188, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=4 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_untied=402653188
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=5 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=6 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644
+
+  // ___CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // ___CHECK: {{^[0-9]+}}: id=7 task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+
+  return 0;
+}
Index: runtime/test/ompt/tasks/taskyield.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/taskyield.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// The current GOMP interface implements taskyield as a stub
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>   
+#include <unistd.h>
+
+int main()
+{
+  int condition=0, x=0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+        #pragma omp task shared(condition)
+        {
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+        }
+        OMPT_WAIT(condition,1);
+        #pragma omp task shared(x)
+        {
+          x++;
+        }
+        printf("%" PRIu64 ": before yield\n", ompt_get_thread_data()->value);
+        #pragma omp taskyield
+        printf("%" PRIu64 ": after yield\n", ompt_get_thread_data()->value);
+        OMPT_SIGNAL(condition);
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+  
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[0-9]+}}, thread_num={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[WORKER_TASK:[0-9]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[MAIN_TASK:[0-9]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=no
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[MAIN_TASK]], prior_task_status=ompt_task_yield=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[MAIN_TASK]], second_task_id=[[IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_task_schedule: first_task_id={{[0-9]+}}, second_task_id=[[WORKER_TASK]], prior_task_status=ompt_task_others=4
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[WORKER_TASK]], second_task_id={{[0-9]+}}, prior_task_status=ompt_task_complete=1
+
+
+
+
+
+  return 0;
+}
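
The ordering in the CHECK lines above depends on the OMPT_SIGNAL/OMPT_WAIT handshake: the first task keeps the worker thread busy until after the master has yielded, so the taskyield on the master finds the second task ready and switches to it. OMPT_SIGNAL and OMPT_WAIT are defined in callback.h; the stand-ins below only mimic the behaviour the test assumes (atomic increment plus busy-wait):

#include <stdio.h>

/* Stand-ins for callback.h's macros; the real implementation may differ. */
#define SKETCH_SIGNAL(var)      __atomic_fetch_add(&(var), 1, __ATOMIC_SEQ_CST)
#define SKETCH_WAIT(var, value) while (__atomic_load_n(&(var), __ATOMIC_SEQ_CST) < (value)) {}

int main(void) {
  int condition = 0;
  SKETCH_SIGNAL(condition);  /* worker task starts: condition -> 1           */
  SKETCH_WAIT(condition, 1); /* master: worker is now pinned on its thread   */
  /* master creates the x++ task and executes the taskyield here             */
  SKETCH_SIGNAL(condition);  /* master after the yield: condition -> 2       */
  SKETCH_WAIT(condition, 2); /* worker task returns once released            */
  printf("condition=%d\n", condition); /* prints 2 */
  return 0;
}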
Index: runtime/test/ompt/tasks/untied_task.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/tasks/untied_task.c
@@ -0,0 +1,107 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h> 
+
+int main()
+{
+  int condition=0;
+  omp_set_nested(0);
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    print_frame(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+    #pragma omp master
+    {
+      print_ids(0);
+      #pragma omp task untied shared(condition)
+      {
+        OMPT_SIGNAL(condition);
+        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+        #pragma omp task if(0)
+        {
+          print_ids(0);
+          print_ids(1);
+          print_ids(2);
+        }
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      OMPT_WAIT(condition,1);
+      print_ids(0);
+    }
+    #pragma omp barrier
+    print_ids(0);
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+  
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[MAIN_REENTER]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+  // nested parallel masters
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // <- ompt_event_task_create would be expected here
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=[[REENTER]], new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // explicit barrier after master
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // this is expected to happen earlier and on the MASTER thread:
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[MAIN_REENTER]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+
+  return 0;
+}
Index: runtime/test/ompt/worksharing/for/auto_split.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/for/auto_split.c
@@ -0,0 +1,8 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+// GCC doesn't call the runtime for schedule(auto), which it maps to static
+// XFAIL: gcc
+
+#define SCHEDULE auto
+#include "base_split.h"
Index: runtime/test/ompt/worksharing/for/base.h
===================================================================
--- runtime/test/ompt/worksharing/for/base.h
+++ runtime/test/ompt/worksharing/for/base.h
@@ -9,28 +9,35 @@
   for (i = 0; i < 4; i++) {
   }
 
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+
   // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
-  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, parallel_function=0x{{[0-f]+}}, invoker={{.*}}
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}}
 
   // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], workshare_function=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=0x{{[0-f]+}}
   // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], workshare_function=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=0x{{[0-f]+}}
   // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], workshare_function=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=0x{{[0-f]+}}
   // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], workshare_function=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=0x{{[0-f]+}}
   // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   return 0;
 }
Index: runtime/test/ompt/worksharing/for/base_serialized.h
===================================================================
--- runtime/test/ompt/worksharing/for/base_serialized.h
+++ runtime/test/ompt/worksharing/for/base_serialized.h
@@ -8,14 +8,21 @@
   #pragma omp parallel for num_threads(1) schedule(SCHEDULE)
   for (i = 0; i < 1; i++) {
   }
+  
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
 
   // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
-  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, parallel_function=0x{{[0-f]+}}, invoker={{.+}}
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}}
 
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], workshare_function=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=0x{{[0-f]+}}
   // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
-  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
 
   return 0;
 }
Index: runtime/test/ompt/worksharing/for/base_split.h
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/for/base_split.h
@@ -0,0 +1,66 @@
+#include "callback.h"
+#include <omp.h>
+
+/* With the combined parallel-for construct (base.h), the return addresses are hard to compare.
+   With separate parallel and for-nowait constructs, the addresses become more predictable,
+   but the beginning of the for loop still generates additional code, so the offset from the
+   loop-begin event to the label is more than 4 bytes.
+*/
+
+int main()
+{
+  unsigned int i;
+
+  #pragma omp parallel num_threads(4) 
+  {
+    print_current_address(0);
+    #pragma omp for schedule(SCHEDULE) nowait
+    for (i = 0; i < 4; i++) {
+      print_fuzzy_address(1);
+    }
+    print_fuzzy_address(2);
+  }
+  print_fuzzy_address(3);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[PARALLEL_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=[[LOOP_BEGIN_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, invoker={{[0-9]+}}, codeptr_ra=[[PARALLEL_RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[PARALLEL_RETURN_ADDRESS]]
+  
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+
+  // CHECK-LOOP: 0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK-LOOP: 0: ompt_event_runtime_shutdown
+  // CHECK-LOOP: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra={{0x[0-f]+}}, invoker={{[0-9]+}}
+  // CHECK-LOOP: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=[[LOOP_BEGIN_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+
+
+  return 0;
+}
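
The fuzzy-address checks above capture each codeptr_ra without its last two hex digits, so two addresses match when they agree in everything but the low byte. A stand-alone sketch of a comparison with that tolerance (the real print_fuzzy_address helper lives in callback.h and may be implemented differently):

#include <stdint.h>
#include <stdio.h>

/* Same comparison the FileCheck pattern encodes: equal after dropping the
   low 8 bits, i.e. the last two hex digits of the printed address. */
static int sketch_fuzzy_equal(uintptr_t a, uintptr_t b) {
  return (a >> 8) == (b >> 8);
}

int main(void) {
  /* Example addresses only: 0x401a10 and 0x401a3c share the 0x401a prefix. */
  printf("%d\n", sketch_fuzzy_equal(0x401a10, 0x401a3c)); /* prints 1 */
  printf("%d\n", sketch_fuzzy_equal(0x401a10, 0x401b10)); /* prints 0 */
  return 0;
}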
Index: runtime/test/ompt/worksharing/for/dynamic_split.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/for/dynamic_split.c
@@ -0,0 +1,6 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+
+#define SCHEDULE dynamic
+#include "base_split.h"
Index: runtime/test/ompt/worksharing/for/guided_split.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/for/guided_split.c
@@ -0,0 +1,6 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+
+#define SCHEDULE guided
+#include "base_split.h"
Index: runtime/test/ompt/worksharing/for/runtime_split.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/for/runtime_split.c
@@ -0,0 +1,6 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+
+#define SCHEDULE runtime
+#include "base_split.h"
Index: runtime/test/ompt/worksharing/for/static_split.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/for/static_split.c
@@ -0,0 +1,8 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+// GCC doesn't call the runtime for static schedule
+// XFAIL: gcc
+
+#define SCHEDULE static
+#include "base_split.h"
Index: runtime/test/ompt/worksharing/sections.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/sections.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// GCC generates code that does not distinguish between sections and loops
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel sections num_threads(2)
+  {
+    #pragma omp section
+    {
+      printf("%lu: section 1\n", ompt_get_thread_data()->value);
+    }
+    #pragma omp section
+    {
+      printf("%lu: section 2\n", ompt_get_thread_data()->value);
+    }
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_sections_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[SECT_BEGIN:0x[0-f]+]], count=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_sections_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[SECT_END:0x[0-f]+]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_sections_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[SECT_BEGIN]], count=2
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_sections_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[SECT_END]]
+
+
+  return 0;
+}
Index: runtime/test/ompt/worksharing/single.c
===================================================================
--- /dev/null
+++ runtime/test/ompt/worksharing/single.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// GCC generates code that does not call the runtime for the single construct
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp single
+    {
+      x++;
+    }
+  }
+
+  printf("x=%d\n", x);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[THREAD_ID_1:[0-9]+]]: ompt_event_single_in_block_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]+}}, count=1
+  // CHECK: {{^}}[[THREAD_ID_1]]: ompt_event_single_in_block_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]+}}, count=1
+
+  // CHECK: {{^}}[[THREAD_ID_2:[0-9]+]]: ompt_event_single_others_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]+}}, count=1
+  // CHECK: {{^}}[[THREAD_ID_2]]: ompt_event_single_others_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]+}}, count=1
+
+
+
+  return 0;
+}