Index: openmp/trunk/runtime/src/kmp.h =================================================================== --- openmp/trunk/runtime/src/kmp.h +++ openmp/trunk/runtime/src/kmp.h @@ -788,8 +788,8 @@ // KMP_HW_SUBSET support: typedef struct kmp_hws_item { - int num; - int offset; + int num; + int offset; } kmp_hws_item_t; extern kmp_hws_item_t __kmp_hws_socket; @@ -1533,9 +1533,9 @@ kmp_uint32 ordered_lower; kmp_uint32 ordered_upper; #if KMP_OS_WINDOWS -// This var can be placed in the hole between 'tc' and 'parm1', instead of -// 'static_steal_counter'. It would be nice to measure execution times. -// Conditional if/endif can be removed at all. + // This var can be placed in the hole between 'tc' and 'parm1', instead of + // 'static_steal_counter'. It would be nice to measure execution times. + // Conditional if/endif can be removed at all. kmp_int32 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info32_t; @@ -1568,9 +1568,9 @@ kmp_uint64 ordered_lower; kmp_uint64 ordered_upper; #if KMP_OS_WINDOWS -// This var can be placed in the hole between 'tc' and 'parm1', instead of -// 'static_steal_counter'. It would be nice to measure execution times. -// Conditional if/endif can be removed at all. + // This var can be placed in the hole between 'tc' and 'parm1', instead of + // 'static_steal_counter'. It would be nice to measure execution times. + // Conditional if/endif can be removed at all. kmp_int64 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info64_t; @@ -2109,7 +2109,7 @@ #if OMP_40_ENABLED typedef struct kmp_taskgroup { - kmp_uint32 count; // number of allocated and not yet complete tasks + kmp_int32 count; // number of allocated and not yet complete tasks kmp_int32 cancel_request; // request for cancellation of this taskgroup struct kmp_taskgroup *parent; // parent taskgroup // TODO: change to OMP_50_ENABLED, need to change build tools for this to work @@ -2250,10 +2250,10 @@ kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */ KMP_ALIGN_CACHE kmp_internal_control_t td_icvs; /* Internal control variables for the task */ - KMP_ALIGN_CACHE volatile kmp_uint32 + KMP_ALIGN_CACHE volatile kmp_int32 td_allocated_child_tasks; /* Child tasks (+ current task) not yet deallocated */ - volatile kmp_uint32 + volatile kmp_int32 td_incomplete_child_tasks; /* Child tasks not yet complete */ #if OMP_40_ENABLED kmp_taskgroup_t @@ -2328,7 +2328,7 @@ #endif KMP_ALIGN_CACHE - volatile kmp_uint32 tt_unfinished_threads; /* #threads still active */ + volatile kmp_int32 tt_unfinished_threads; /* #threads still active */ KMP_ALIGN_CACHE volatile kmp_uint32 @@ -2402,7 +2402,6 @@ kmp_uint64 th_team_bt_intervals; #endif - #if KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */ #endif @@ -3787,7 +3786,6 @@ extern int _You_must_link_with_Microsoft_OpenMP_library; #endif - // The routines below are not exported. // Consider making them 'static' in corresponding source files. 
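The kmp.h hunks above flip several task-bookkeeping counters from kmp_uint32 to kmp_int32 (the taskgroup count, td_allocated_child_tasks, td_incomplete_child_tasks, tt_unfinished_threads). Those counters are updated with fetch-and-add/sub intrinsics, and debug builds assert that the post-decrement value is still non-negative, which only means something on a signed type. A minimal stand-alone sketch of that pattern, assuming GCC-style __sync builtins (the counter and function names are illustrative, not the runtime's):

#include <cassert>
#include <cstdint>

// Illustrative counter; in the runtime this role is played by fields such as
// kmp_taskdata_t::td_incomplete_child_tasks.
static volatile int32_t incomplete_child_tasks = 0;

static void child_task_created() {
  __sync_fetch_and_add(const_cast<int32_t *>(&incomplete_child_tasks), 1);
}

static void child_task_finished() {
  // "Predecrement simulated by - 1": the builtin returns the old value, so
  // subtract one to see the value after the decrement, as the runtime does.
  int32_t children =
      __sync_fetch_and_sub(const_cast<int32_t *>(&incomplete_child_tasks), 1) - 1;
  // Meaningful only because the counter is signed; an unsigned counter would
  // wrap to a huge value on an over-decrement instead of tripping this check.
  assert(children >= 0);
}

int main() {
  child_task_created();
  child_task_finished();
  return 0;
}
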
void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr, Index: openmp/trunk/runtime/src/kmp_affinity.h =================================================================== --- openmp/trunk/runtime/src/kmp_affinity.h +++ openmp/trunk/runtime/src/kmp_affinity.h @@ -618,8 +618,10 @@ }; static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) { - const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first); - const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first); + const Address *aa = + (const Address *)&(((AddrUnsPair *)CCAST(void *, a))->first); + const Address *bb = + (const Address *)&(((AddrUnsPair *)CCAST(void *, b))->first); unsigned depth = aa->depth; unsigned i; KMP_DEBUG_ASSERT(depth == bb->depth); @@ -765,7 +767,6 @@ skipPerLevel[i] = 2 * skipPerLevel[i - 1]; uninitialized = initialized; // One writer - } // Resize the hierarchy if nproc changes to something larger than before @@ -832,7 +833,6 @@ base_num_threads = nproc; resizing = 0; // One writer - } }; #endif // KMP_AFFINITY_H Index: openmp/trunk/runtime/src/kmp_affinity.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_affinity.cpp +++ openmp/trunk/runtime/src/kmp_affinity.cpp @@ -405,10 +405,9 @@ __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0; for (socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0); - socket != NULL; - socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, - HWLOC_OBJ_PACKAGE, socket), - socket_identifier++) { + socket != NULL; socket = hwloc_get_next_obj_by_type( + __kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, socket), + socket_identifier++) { int core_identifier = 0; int num_active_cores = 0; for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, @@ -419,7 +418,7 @@ core) == socket; core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core), - core_identifier++) { + core_identifier++) { int pu_identifier = 0; int num_active_threads = 0; for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, @@ -430,14 +429,14 @@ pu) == core; pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu), - pu_identifier++) { + pu_identifier++) { Address addr(3); - if(!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask)) + if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask)) continue; // skip inactive (inaccessible) unit KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n", socket->os_index, socket->logical_index, core->os_index, - core->logical_index, pu->os_index,pu->logical_index)); + core->logical_index, pu->os_index, pu->logical_index)); addr.labels[0] = socket_identifier; // package addr.labels[1] = core_identifier; // core addr.labels[2] = pu_identifier; // pu @@ -1692,8 +1691,8 @@ static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) { unsigned i; - const unsigned *aa = *((const unsigned **)a); - const unsigned *bb = *((const unsigned **)b); + const unsigned *aa = *(RCAST(unsigned **, CCAST(void *, a))); + const unsigned *bb = *(RCAST(unsigned **, CCAST(void *, b))); for (i = maxIndex;; i--) { if (aa[i] < bb[i]) return -1; @@ -3037,7 +3036,7 @@ #if KMP_USE_HWLOC static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, hwloc_obj_type_t type, - hwloc_obj_t* f) { + hwloc_obj_t *f) { if (!hwloc_compare_types(o->type, type)) { if (*f == NULL) *f = o; // output first descendant found @@ -3051,7 +3050,7 @@ static int 
__kmp_hwloc_count_children_by_depth(hwloc_topology_t t, hwloc_obj_t o, unsigned depth, - hwloc_obj_t* f) { + hwloc_obj_t *f) { if (o->depth == depth) { if (*f == NULL) *f = o; // output first descendant found @@ -3099,16 +3098,17 @@ static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { AddrUnsPair *newAddr; if (__kmp_hws_requested == 0) - goto _exit; // no topology limiting actions requested, exit + goto _exit; // no topology limiting actions requested, exit #if KMP_USE_HWLOC if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { // Number of subobjects calculated dynamically, this works fine for // any non-uniform topology. // L2 cache objects are determined by depth, other objects - by type. hwloc_topology_t tp = __kmp_hwloc_topology; - int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped - int nCr=0, nTr=0; // number of requested units - int nPkg=0, nCo=0, n_new=0, n_old = 0, nCpP=0, nTpC=0; // counters + int nS = 0, nN = 0, nL = 0, nC = 0, + nT = 0; // logical index including skipped + int nCr = 0, nTr = 0; // number of requested units + int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) int L2depth, idx; @@ -3136,8 +3136,8 @@ // check L2 cahce, get object by depth because of multiple caches L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); - if (hL != NULL && __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, - &hC) > 1) { + if (hL != NULL && + __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { tile_support = 1; // no sense to count L2 if it includes single core } else if (__kmp_hws_tile.num > 0) { if (__kmp_hws_core.num == 0) { @@ -3153,7 +3153,7 @@ // fill in unset items, validate settings ----------------------- if (__kmp_hws_socket.num == 0) - __kmp_hws_socket.num = nPackages; // use all available sockets + __kmp_hws_socket.num = nPackages; // use all available sockets if (__kmp_hws_socket.offset >= nPackages) { KMP_WARNING(AffHWSubsetManySockets); goto _exit; @@ -3180,7 +3180,7 @@ int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores + __kmp_hws_core.num = NC; // use all available cores if (__kmp_hws_core.offset >= NC) { KMP_WARNING(AffHWSubsetManyCores); goto _exit; @@ -3189,7 +3189,7 @@ int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in node if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores + __kmp_hws_core.num = NC; // use all available cores if (__kmp_hws_core.offset >= NC) { KMP_WARNING(AffHWSubsetManyCores); goto _exit; @@ -3208,7 +3208,7 @@ int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores + __kmp_hws_core.num = NC; // use all available cores if (__kmp_hws_core.offset >= NC) { KMP_WARNING(AffHWSubsetManyCores); goto _exit; @@ -3217,7 +3217,7 @@ int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = NC; // use all available cores + __kmp_hws_core.num = NC; // use all available cores if (__kmp_hws_core.offset >= NC) { KMP_WARNING(AffHWSubsetManyCores); goto _exit; @@ -3256,8 +3256,8 @@ nN = 0; hN = NULL; // num nodes in current 
socket - int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, - &hN); + int NN = + __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); for (int n = 0; n < NN; ++n) { // Check NUMA Node ---------------------------------------- if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { @@ -3357,8 +3357,8 @@ nC = 0; hC = NULL; // num cores in current node - int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, - &hC); + int NC = + __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); for (int c = 0; c < NC; ++c) { // Check Core --------------------------------------- if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { @@ -3377,8 +3377,8 @@ nT = 0; nTr = 0; hT = NULL; - int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, - &hT); + int NT = + __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); for (int t = 0; t < NT; ++t) { // Check PU --------------------------------------- idx = hT->os_index; @@ -3439,8 +3439,8 @@ nC = 0; hC = NULL; // num cores per tile - int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, - &hC); + int NC = + __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); for (int c = 0; c < NC; ++c) { // Check Core --------------------------------------- if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { @@ -3460,8 +3460,8 @@ nTr = 0; hT = NULL; // num procs per core - int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, - &hT); + int NT = + __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); for (int t = 0; t < NT; ++t) { // Check PU --------------------------------------- idx = hT->os_index; @@ -3501,8 +3501,8 @@ nC = 0; hC = NULL; // num cores in socket - int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, - &hC); + int NC = + __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); for (int c = 0; c < NC; ++c) { // Check Core ------------------------------------------- if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { @@ -3522,8 +3522,8 @@ nTr = 0; hT = NULL; // num procs per core - int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, - &hT); + int NT = + __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); for (int t = 0; t < NT; ++t) { // Check PU --------------------------------------- idx = hT->os_index; @@ -3576,11 +3576,11 @@ KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); - nPackages = nPkg; // correct num sockets - nCoresPerPkg = nCpP; // correct num cores per socket + nPackages = nPkg; // correct num sockets + nCoresPerPkg = nCpP; // correct num cores per socket __kmp_nThreadsPerCore = nTpC; // correct num threads per core - __kmp_avail_proc = n_new; // correct num procs - __kmp_ncores = nCo; // correct num cores + __kmp_avail_proc = n_new; // correct num procs + __kmp_ncores = nCo; // correct num cores // hwloc topology method end } else #endif // KMP_USE_HWLOC @@ -3591,34 +3591,32 @@ goto _exit; } if (__kmp_hws_socket.num == 0) - __kmp_hws_socket.num = nPackages; // use all available sockets + __kmp_hws_socket.num = nPackages; // use all available sockets if (__kmp_hws_core.num == 0) - __kmp_hws_core.num = nCoresPerPkg; // use all available cores - if (__kmp_hws_proc.num == 0 || - __kmp_hws_proc.num > __kmp_nThreadsPerCore) + __kmp_hws_core.num = nCoresPerPkg; // use all available cores + if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts - if ( !__kmp_affinity_uniform_topology() ) { - KMP_WARNING( 
AffHWSubsetNonUniform ); + if (!__kmp_affinity_uniform_topology()) { + KMP_WARNING(AffHWSubsetNonUniform); goto _exit; // don't support non-uniform topology } - if ( depth > 3 ) { - KMP_WARNING( AffHWSubsetNonThreeLevel ); + if (depth > 3) { + KMP_WARNING(AffHWSubsetNonThreeLevel); goto _exit; // don't support not-3-level topology } if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { KMP_WARNING(AffHWSubsetManySockets); goto _exit; } - if ( __kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg ) { - KMP_WARNING( AffHWSubsetManyCores ); + if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { + KMP_WARNING(AffHWSubsetManyCores); goto _exit; } // Form the requested subset if (pAddr) // pAddr is NULL in case of affinity_none - newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * - __kmp_hws_socket.num * - __kmp_hws_core.num * - __kmp_hws_proc.num); + newAddr = (AddrUnsPair *)__kmp_allocate( + sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * + __kmp_hws_proc.num); for (int i = 0; i < nPackages; ++i) { if (i < __kmp_hws_socket.offset || i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { @@ -3637,16 +3635,16 @@ // walk through requested socket for (int j = 0; j < nCoresPerPkg; ++j) { if (j < __kmp_hws_core.offset || - j >= __kmp_hws_core.offset + __kmp_hws_core.num) - { // skip not-requested core - n_old += __kmp_nThreadsPerCore; - if (__kmp_pu_os_idx != NULL) { - for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { - KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); - ++proc_num; - } + j >= __kmp_hws_core.offset + + __kmp_hws_core.num) { // skip not-requested core + n_old += __kmp_nThreadsPerCore; + if (__kmp_pu_os_idx != NULL) { + for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { + KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); + ++proc_num; } - } else { + } + } else { // walk through requested core for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { if (k < __kmp_hws_proc.num) { @@ -3665,21 +3663,23 @@ } } KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); - KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num * - __kmp_hws_proc.num); - nPackages = __kmp_hws_socket.num; // correct nPackages - nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg + KMP_DEBUG_ASSERT(n_new == + __kmp_hws_socket.num * __kmp_hws_core.num * + __kmp_hws_proc.num); + nPackages = __kmp_hws_socket.num; // correct nPackages + nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore - __kmp_avail_proc = n_new; // correct avail_proc + __kmp_avail_proc = n_new; // correct avail_proc __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores } // non-hwloc topology method if (pAddr) { - __kmp_free( *pAddr ); - *pAddr = newAddr; // replace old topology with new one + __kmp_free(*pAddr); + *pAddr = newAddr; // replace old topology with new one } if (__kmp_affinity_verbose) { char m[KMP_AFFIN_MASK_PRINT_LEN]; - __kmp_affinity_print_mask(m,KMP_AFFIN_MASK_PRINT_LEN,__kmp_affin_fullMask); + __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN, + __kmp_affin_fullMask); if (__kmp_affinity_respect_mask) { KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); } else { @@ -3693,7 +3693,7 @@ __kmp_nThreadsPerCore, __kmp_ncores); __kmp_str_buf_free(&buf); } - _exit: +_exit: if (__kmp_pu_os_idx != NULL) { __kmp_free(__kmp_pu_os_idx); __kmp_pu_os_idx = NULL; @@ -3750,7 +3750,8 @@ static int __kmp_affinity_find_core(const 
AddrUnsPair *address2os, int proc, int bottom_level, int core_level) { return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, - core_level) - 1; + core_level) - + 1; } // This function finds maximal number of processing units bound to a @@ -3785,8 +3786,10 @@ return; static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { - const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first); - const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first); + const Address *aa = + (const Address *)&(((AddrUnsPair *)CCAST(void *, a))->first); + const Address *bb = + (const Address *)&(((AddrUnsPair *)CCAST(void *, b))->first); unsigned depth = aa->depth; unsigned i; KMP_DEBUG_ASSERT(depth == bb->depth); Index: openmp/trunk/runtime/src/kmp_alloc.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_alloc.cpp +++ openmp/trunk/runtime/src/kmp_alloc.cpp @@ -298,12 +298,12 @@ #if USE_CMP_XCHG_FOR_BGET { volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list); - while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, old_value, - NULL)) { + while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, + CCAST(void *, old_value), NULL)) { KMP_CPU_PAUSE(); old_value = TCR_SYNC_PTR(th->th.th_local.bget_list); } - p = (void *)old_value; + p = CCAST(void *, old_value); } #else /* ! USE_CMP_XCHG_FOR_BGET */ #ifdef USE_QUEUING_LOCK_FOR_BGET @@ -362,15 +362,15 @@ volatile void *old_value = TCR_PTR(th->th.th_local.bget_list); /* the next pointer must be set before setting bget_list to buf to avoid exposing a broken list to other threads, even for an instant. */ - b->ql.flink = BFH(old_value); + b->ql.flink = BFH(CCAST(void *, old_value)); - while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, old_value, - buf)) { + while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, + CCAST(void *, old_value), buf)) { KMP_CPU_PAUSE(); old_value = TCR_PTR(th->th.th_local.bget_list); /* the next pointer must be set before setting bget_list to buf to avoid exposing a broken list to other threads, even for an instant. */ - b->ql.flink = BFH(old_value); + b->ql.flink = BFH(CCAST(void *, old_value)); } } #else /* ! USE_CMP_XCHG_FOR_BGET */ @@ -607,7 +607,7 @@ if (thr->acqfcn != 0) { if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) { /* Request is too large to fit in a single expansion block. - Try to satisy it by a direct buffer acquisition. */ + Try to satisy it by a direct buffer acquisition. 
*/ bdhead_t *bdh; size += sizeof(bdhead_t) - sizeof(bhead_t); Index: openmp/trunk/runtime/src/kmp_barrier.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_barrier.cpp +++ openmp/trunk/runtime/src/kmp_barrier.cpp @@ -883,9 +883,9 @@ ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } - (void)KMP_TEST_THEN_AND64( - (volatile kmp_int64 *)&thr_bar->b_arrived, - ~(thr_bar->leaf_state)); // clear leaf_state bits + // clear leaf_state bits + KMP_TEST_THEN_AND64(CCAST(kmp_uint64 *, &thr_bar->b_arrived), + ~(thr_bar->leaf_state)); } // Next, wait for higher level children on each child's b_arrived flag for (kmp_uint32 d = 1; d < thr_bar->my_level; @@ -1035,7 +1035,8 @@ TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time } else { // Reset my bits on parent's b_go flag - ((char *)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0; + (RCAST(volatile char *, + &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0; } } thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; @@ -1210,7 +1211,6 @@ gtid, team->t.t_id, tid, bt)); } - // End of Barrier Algorithms // Internal function to do a barrier. @@ -1347,7 +1347,7 @@ if (KMP_MASTER_TID(tid)) { status = 0; if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); + __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); } #if USE_DEBUGGER // Let the debugger know: All threads are arrived and starting leaving the Index: openmp/trunk/runtime/src/kmp_csupport.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_csupport.cpp +++ openmp/trunk/runtime/src/kmp_csupport.cpp @@ -3092,8 +3092,8 @@ // __kmp_dispatch_num_buffers) if (idx != sh_buf->doacross_buf_idx) { // Shared buffer is occupied, wait for it to be free - __kmp_wait_yield_4((kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, - NULL); + __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, + __kmp_eq_4, NULL); } // Check if we are the first thread. After the CAS the first thread gets 0, // others get 1 if initialization is in progress, allocated pointer otherwise. 
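The kmp_alloc.cpp hunks above touch BGET's per-thread free list: a freed buffer is pushed by writing its forward link first and then compare-and-swapping the list head, retrying if another thread intervened; the new CCAST(void *, old_value) wrappers only strip the volatile qualifier before the value reaches KMP_COMPARE_AND_STORE_PTR. A self-contained sketch of the same push loop using __sync_bool_compare_and_swap directly (Node, free_list and push_free_buffer are illustrative names, not the runtime's):

struct Node {
  Node *flink; // forward link, the counterpart of b->ql.flink in BGET
};

static Node *volatile free_list = nullptr; // counterpart of th_local.bget_list

static void push_free_buffer(Node *buf) {
  Node *old_head = free_list;
  // The link must be written before the head is published, so other threads
  // never see a broken list, even for an instant (same rule as in BGET).
  buf->flink = old_head;
  while (!__sync_bool_compare_and_swap(&free_list, old_head, buf)) {
    // Another thread changed the head first: reload, re-link, retry.
    old_head = free_list;
    buf->flink = old_head;
  }
}

int main() {
  Node a = {nullptr}, b = {nullptr};
  push_free_buffer(&a);
  push_free_buffer(&b);
  return 0;
}
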
@@ -3258,8 +3258,8 @@ iter_number >>= 5; // divided by 32 flag = 1 << shft; if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) - KMP_TEST_THEN_OR32((kmp_int32 *)&pr_buf->th_doacross_flags[iter_number], - (kmp_int32)flag); + KMP_TEST_THEN_OR32( + CCAST(kmp_uint32 *, &pr_buf->th_doacross_flags[iter_number]), flag); KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, (iter_number << 5) + shft)); } @@ -3285,7 +3285,7 @@ (kmp_int64)&sh_buf->doacross_num_done); KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done); KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); - __kmp_thread_free(th, (void *)sh_buf->doacross_flags); + __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags)); sh_buf->doacross_flags = NULL; sh_buf->doacross_num_done = 0; sh_buf->doacross_buf_idx += Index: openmp/trunk/runtime/src/kmp_dispatch.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_dispatch.cpp +++ openmp/trunk/runtime/src/kmp_dispatch.cpp @@ -172,7 +172,7 @@ __forceinline kmp_int32 test_then_add(volatile kmp_int32 *p, kmp_int32 d) { kmp_int32 r; - r = KMP_TEST_THEN_ADD32(p, d); + r = KMP_TEST_THEN_ADD32(CCAST(kmp_int32 *, p), d); return r; } @@ -180,7 +180,7 @@ __forceinline kmp_int64 test_then_add(volatile kmp_int64 *p, kmp_int64 d) { kmp_int64 r; - r = KMP_TEST_THEN_ADD64(p, d); + r = KMP_TEST_THEN_ADD64(CCAST(kmp_int64 *, p), d); return r; } @@ -190,14 +190,14 @@ template <> __forceinline kmp_int32 test_then_inc_acq(volatile kmp_int32 *p) { kmp_int32 r; - r = KMP_TEST_THEN_INC_ACQ32(p); + r = KMP_TEST_THEN_INC_ACQ32(CCAST(kmp_int32 *, p)); return r; } template <> __forceinline kmp_int64 test_then_inc_acq(volatile kmp_int64 *p) { kmp_int64 r; - r = KMP_TEST_THEN_INC_ACQ64(p); + r = KMP_TEST_THEN_INC_ACQ64(CCAST(kmp_int64 *, p)); return r; } @@ -207,14 +207,14 @@ template <> __forceinline kmp_int32 test_then_inc(volatile kmp_int32 *p) { kmp_int32 r; - r = KMP_TEST_THEN_INC32(p); + r = KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, p)); return r; } template <> __forceinline kmp_int64 test_then_inc(volatile kmp_int64 *p) { kmp_int64 r; - r = KMP_TEST_THEN_INC64(p); + r = KMP_TEST_THEN_INC64(CCAST(kmp_int64 *, p)); return r; } @@ -262,7 +262,7 @@ register kmp_uint32 (*f)(UT, UT) = pred; register UT r; - KMP_FSYNC_SPIN_INIT(obj, (void *)spin); + KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); KMP_INIT_YIELD(spins); // main wait spin loop while (!f(r = *spin, check)) { @@ -440,7 +440,7 @@ th->th.th_dispatch->th_dispatch_pr_current); } - KMP_FSYNC_RELEASING(&sh->u.s.ordered_iteration); + KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration)); #if !defined(KMP_GOMP_COMPAT) if (__kmp_env_consistency_check) { if (pr->ordered_bumped != 0) { @@ -1162,7 +1162,9 @@ gtid, my_buffer_index, sh->buffer_index)); th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; - th->th.th_dispatch->th_dispatch_sh_current = (dispatch_shared_info_t *)sh; + th->th.th_dispatch->th_dispatch_sh_current = + RCAST(dispatch_shared_info_t *, + CCAST(dispatch_shared_info_template *, sh)); #if USE_ITT_BUILD if (pr->ordered) { __kmp_itt_ordered_init(gtid); @@ -1978,7 +1980,8 @@ pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default // use dynamic-style shcedule // atomically inrement iterations, get old value - init = test_then_add((ST *)&sh->u.s.iteration, (ST)chunkspec); + init = test_then_add( + RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), (ST)chunkspec); remaining = trip - init; if (remaining <= 0) { status = 0; // 
all iterations got by other threads @@ -1995,8 +1998,8 @@ } // if limit = init + (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc - if (compare_and_swap((ST *)&sh->u.s.iteration, (ST)init, - (ST)limit)) { + if (compare_and_swap(RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), + (ST)init, (ST)limit)) { // CAS was successful, chunk obtained status = 1; --limit; @@ -2056,7 +2059,8 @@ if ((T)remaining < pr->u.p.parm2) { // use dynamic-style shcedule // atomically inrement iterations, get old value - init = test_then_add((ST *)&sh->u.s.iteration, (ST)chunk); + init = test_then_add( + RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), (ST)chunk); remaining = trip - init; if (remaining <= 0) { status = 0; // all iterations got by other threads @@ -2078,8 +2082,8 @@ if (rem) // adjust so that span%chunk == 0 span += chunk - rem; limit = init + span; - if (compare_and_swap((ST *)&sh->u.s.iteration, (ST)init, - (ST)limit)) { + if (compare_and_swap(RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), + (ST)init, (ST)limit)) { // CAS was successful, chunk obtained status = 1; --limit; @@ -2716,7 +2720,7 @@ register kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; register kmp_uint32 r; - KMP_FSYNC_SPIN_INIT(obj, (void *)spin); + KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); KMP_INIT_YIELD(spins); // main wait spin loop while (!f(r = TCR_4(*spin), check)) { Index: openmp/trunk/runtime/src/kmp_environment.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_environment.cpp +++ openmp/trunk/runtime/src/kmp_environment.cpp @@ -147,7 +147,7 @@ void __kmp_env_free(char const **value) { KMP_DEBUG_ASSERT(value != NULL); - KMP_INTERNAL_FREE((void *)*value); + KMP_INTERNAL_FREE(CCAST(char *, *value)); *value = NULL; } // func __kmp_env_free @@ -475,7 +475,8 @@ kmp_env_blk_t *block // M: Block of environment variables to sort. ) { - qsort((void *)block->vars, block->count, sizeof(kmp_env_var_t), + qsort(CCAST(kmp_env_var_t *, block->vars), block->count, + sizeof(kmp_env_var_t), (int (*)(void const *, void const *)) & ___kmp_env_var_cmp); } // __kmp_env_block_sort @@ -484,7 +485,7 @@ kmp_env_blk_t *block // M: Block of environment variables to free. ) { - KMP_INTERNAL_FREE((void *)block->vars); + KMP_INTERNAL_FREE(CCAST(kmp_env_var_t *, block->vars)); __kmp_str_free(&(block->bulk)); block->count = 0; Index: openmp/trunk/runtime/src/kmp_error.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_error.cpp +++ openmp/trunk/runtime/src/kmp_error.cpp @@ -114,7 +114,7 @@ ) { char const *construct = __kmp_pragma(ct, ident); __kmp_msg(kmp_ms_fatal, __kmp_msg_format(id, construct), __kmp_msg_null); - KMP_INTERNAL_FREE((void *)construct); + KMP_INTERNAL_FREE(CCAST(char *, construct)); } void __kmp_error_construct2(kmp_i18n_id_t id, // Message identifier. 
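The kmp_dispatch.cpp hunks above are the guided-schedule chunk grab: a thread reads the shared iteration counter, computes the half-open range it would like to take, and claims it with a compare-and-swap, retrying from the updated counter if another thread won the race; the RCAST/CCAST wrapping merely reconciles the signed template type ST with the unsigned volatile counter. A simplified version of that claim loop, with an illustrative halve-the-remainder policy rather than the runtime's parm3-based formula:

#include <cstdint>
#include <cstdio>

static volatile uint64_t iteration = 0; // shared counter, like sh->u.s.iteration

// Try to claim the next guided chunk out of `trip` total iterations.
// Returns false once every iteration has been handed out.
static bool claim_chunk(uint64_t trip, uint64_t min_chunk,
                        uint64_t *init, uint64_t *limit) {
  while (true) {
    uint64_t start = iteration; // volatile load
    if (start >= trip)
      return false; // all iterations taken by other threads
    uint64_t remaining = trip - start;
    uint64_t size = remaining / 2; // proportional chunk (illustrative policy)
    if (size < min_chunk)
      size = min_chunk;
    uint64_t end = start + size;
    if (end > trip)
      end = trip;
    // Claim [start, end); retry if another thread moved the counter first.
    if (__sync_bool_compare_and_swap(&iteration, start, end)) {
      *init = start;
      *limit = end;
      return true;
    }
  }
}

int main() {
  uint64_t lo, hi;
  while (claim_chunk(100, 4, &lo, &hi))
    std::printf("chunk [%llu, %llu)\n", (unsigned long long)lo,
                (unsigned long long)hi);
  return 0;
}
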
@@ -126,8 +126,8 @@ char const *construct2 = __kmp_pragma(cons->type, cons->ident); __kmp_msg(kmp_ms_fatal, __kmp_msg_format(id, construct1, construct2), __kmp_msg_null); - KMP_INTERNAL_FREE((void *)construct1); - KMP_INTERNAL_FREE((void *)construct2); + KMP_INTERNAL_FREE(CCAST(char *, construct1)); + KMP_INTERNAL_FREE(CCAST(char *, construct2)); } struct cons_header *__kmp_allocate_cons_stack(int gtid) { Index: openmp/trunk/runtime/src/kmp_i18n.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_i18n.cpp +++ openmp/trunk/runtime/src/kmp_i18n.cpp @@ -169,7 +169,7 @@ KMP_MSG(WrongMessageCatalog, name, version.str, expected), KMP_HNT(CheckEnvVar, name, nlspath), __kmp_msg_null); KMP_INFORM(WillUseDefaultMessages); - KMP_INTERNAL_FREE((void *)nlspath); + KMP_INTERNAL_FREE(CCAST(char *, nlspath)); } // __kmp_generate_warnings }; // if __kmp_str_buf_free(&version); Index: openmp/trunk/runtime/src/kmp_itt.inl =================================================================== --- openmp/trunk/runtime/src/kmp_itt.inl +++ openmp/trunk/runtime/src/kmp_itt.inl @@ -369,7 +369,7 @@ char *s_col; KMP_DEBUG_ASSERT(loc->psource); #ifdef __cplusplus - s_line = strchr((char *)loc->psource, ';'); + s_line = strchr(CCAST(char *, loc->psource), ';'); #else s_line = strchr(loc->psource, ';'); #endif Index: openmp/trunk/runtime/src/kmp_lock.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_lock.cpp +++ openmp/trunk/runtime/src/kmp_lock.cpp @@ -950,9 +950,8 @@ void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) { __kmp_init_ticket_lock(lck); std::atomic_store_explicit(&lck->lk.depth_locked, 0, - std::memory_order_relaxed); // >= 0 for nestable - // locks, -1 for simple - // locks + std::memory_order_relaxed); + // >= 0 for nestable locks, -1 for simple locks } static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) { @@ -1468,9 +1467,9 @@ KMP_DEBUG_ASSERT(head > 0); /* try (h,h)->(-1,0) */ - dequeued = KMP_COMPARE_AND_STORE_REL64((kmp_int64 *)tail_id_p, - KMP_PACK_64(head, head), - KMP_PACK_64(-1, 0)); + dequeued = KMP_COMPARE_AND_STORE_REL64( + RCAST(kmp_int64 *, CCAST(kmp_int32 *, tail_id_p)), + KMP_PACK_64(head, head), KMP_PACK_64(-1, 0)); #ifdef DEBUG_QUEUING_LOCKS TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)"); #endif @@ -2290,11 +2289,10 @@ __forceinline static int __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) { - kmp_uint64 ticket = KMP_TEST_THEN_INC64((kmp_int64 *)&lck->lk.next_ticket); + kmp_uint64 ticket = KMP_TEST_THEN_INC64( + RCAST(kmp_int64 *, CCAST(kmp_uint64 *, &lck->lk.next_ticket))); kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load - volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = - (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR( - lck->lk.polls); // volatile load + volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls; #ifdef USE_LOCK_PROFILE if (TCR_8(polls[ticket & mask].poll) != ticket) @@ -2331,8 +2329,7 @@ // values, and we get the new value of mask and the old polls pointer, we // could access memory beyond the end of the old polling area. mask = TCR_8(lck->lk.mask); // volatile load - polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR( - lck->lk.polls); // volatile load + polls = lck->lk.polls; // volatile load } // Critical section starts here @@ -2347,7 +2344,7 @@ // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup // ticket. 
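The kmp_lock.cpp hunks below simplify the DRDPA ticket lock, whose acquire path is: fetch-and-increment next_ticket to obtain a ticket, then spin on your own slot of the polls[] array until it holds that ticket; release publishes the next ticket value into the corresponding slot. A heavily trimmed sketch of that idea with a fixed-size polls array (memory fences, lock profiling, and the dynamic resizing/cleanup of old polls arrays that the real lock performs are all omitted):

#include <cstdint>

enum { NSLOTS = 8 }; // number of poll slots; must be a power of two

static volatile uint64_t next_ticket = 0;     // like lk.next_ticket
static uint64_t now_serving = 0;              // like lk.now_serving (owner only)
static volatile uint64_t polls[NSLOTS] = {0}; // like lk.polls[i].poll

static void lock_acquire() {
  uint64_t ticket = __sync_fetch_and_add(&next_ticket, 1ULL);
  // Each waiter watches only its own slot, so waiters do not contend on one
  // shared cache line the way a plain ticket lock's now_serving word would.
  while (polls[ticket & (NSLOTS - 1)] != ticket) {
    // spin
  }
  now_serving = ticket; // remember which ticket currently owns the lock
}

static void lock_release() {
  uint64_t ticket = now_serving + 1;
  polls[ticket & (NSLOTS - 1)] = ticket; // hand the lock to the next ticket
}

int main() {
  lock_acquire();
  lock_release();
  return 0;
}
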
if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) { - __kmp_free((void *)lck->lk.old_polls); + __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls)); lck->lk.old_polls = NULL; lck->lk.cleanup_ticket = 0; } @@ -2462,13 +2459,11 @@ // First get a ticket, then read the polls pointer and the mask. // The polls pointer must be read before the mask!!! (See above) kmp_uint64 ticket = TCR_8(lck->lk.next_ticket); // volatile load - volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = - (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR( - lck->lk.polls); // volatile load + volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls; kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load if (TCR_8(polls[ticket & mask].poll) == ticket) { kmp_uint64 next_ticket = ticket + 1; - if (KMP_COMPARE_AND_STORE_ACQ64((kmp_int64 *)&lck->lk.next_ticket, ticket, + if (KMP_COMPARE_AND_STORE_ACQ64(&lck->lk.next_ticket, ticket, next_ticket)) { KMP_FSYNC_ACQUIRED(lck); KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n", @@ -2509,9 +2504,7 @@ // Read the ticket value from the lock data struct, then the polls pointer and // the mask. The polls pointer must be read before the mask!!! (See above) kmp_uint64 ticket = lck->lk.now_serving + 1; // non-volatile load - volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = - (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR( - lck->lk.polls); // volatile load + volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls; kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n", ticket - 1, lck)); @@ -2568,11 +2561,11 @@ lck->lk.initialized = NULL; lck->lk.location = NULL; if (lck->lk.polls != NULL) { - __kmp_free((void *)lck->lk.polls); + __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.polls)); lck->lk.polls = NULL; } if (lck->lk.old_polls != NULL) { - __kmp_free((void *)lck->lk.old_polls); + __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls)); lck->lk.old_polls = NULL; } lck->lk.mask = 0; Index: openmp/trunk/runtime/src/kmp_os.h =================================================================== --- openmp/trunk/runtime/src/kmp_os.h +++ openmp/trunk/runtime/src/kmp_os.h @@ -184,6 +184,12 @@ #define KMP_INT_MIN ((kmp_int32)0x80000000) #ifdef __cplusplus +#define CAST_FLT_INT(a) \ + reinterpret_cast(const_cast(a)) +#define CAST_DBL_INT(a) \ + reinterpret_cast(const_cast(a)) +#define CCAST(type, var) const_cast(var) +#define RCAST(type, var) reinterpret_cast(var) //------------------------------------------------------------------------- // template for debug prints specification ( d, u, lld, llu ), and to obtain // signed/unsigned flavors of a type @@ -229,6 +235,11 @@ static const int type_size = sizeof(unsigned_t); }; //------------------------------------------------------------------------- +#else +#define CAST_FLT_INT(a) (kmp_int32 *)(a) +#define CAST_DBL_INT(a) (kmp_int64 *)(a) +#define CCAST(type, var) (type)(var) +#define RCAST(type, var) (type)(var) #endif // __cplusplus #define KMP_EXPORT extern /* export declaration in guide libraries */ @@ -416,12 +427,12 @@ #if KMP_ARCH_X86 #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ - __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ - (kmp_int32)(sv)) + __kmp_compare_and_store32(RCAST(volatile kmp_int32 *, p), \ + RCAST(kmp_int32, cv), RCAST(kmp_int32, sv)) #else /* 64 
bit pointers */ #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ - __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ - (kmp_int64)(sv)) + __kmp_compare_and_store64(RCAST(volatile kmp_int64 *, p), \ + RCAST(kmp_int64, cv), RCAST(kmp_int64, sv)) #endif /* KMP_ARCH_X86 */ #define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ @@ -436,11 +447,9 @@ #define KMP_XCHG_FIXED8(p, v) \ __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v)); #define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v)); -//# define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32( (p), (v) -//); -//# define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64( (p), (v) -//); -//# define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32( (p), (v) ); +//#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v)); +//#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v)); +//#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v)); #define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)); #elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64) @@ -464,10 +473,10 @@ #define KMP_TEST_THEN_ADD32(p, v) __sync_fetch_and_add((kmp_int32 *)(p), (v)) #define KMP_TEST_THEN_ADD64(p, v) __sync_fetch_and_add((kmp_int64 *)(p), (v)) -#define KMP_TEST_THEN_OR32(p, v) __sync_fetch_and_or((kmp_int32 *)(p), (v)) -#define KMP_TEST_THEN_AND32(p, v) __sync_fetch_and_and((kmp_int32 *)(p), (v)) -#define KMP_TEST_THEN_OR64(p, v) __sync_fetch_and_or((kmp_int64 *)(p), (v)) -#define KMP_TEST_THEN_AND64(p, v) __sync_fetch_and_and((kmp_int64 *)(p), (v)) +#define KMP_TEST_THEN_OR32(p, v) __sync_fetch_and_or((kmp_uint32 *)(p), (v)) +#define KMP_TEST_THEN_AND32(p, v) __sync_fetch_and_and((kmp_uint32 *)(p), (v)) +#define KMP_TEST_THEN_OR64(p, v) __sync_fetch_and_or((kmp_uint64 *)(p), (v)) +#define KMP_TEST_THEN_AND64(p, v) __sync_fetch_and_and((kmp_uint64 *)(p), (v)) #define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \ @@ -494,7 +503,7 @@ __sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ (kmp_uint64)(sv)) #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ - __sync_bool_compare_and_swap((volatile void **)(p), (void *)(cv), \ + __sync_bool_compare_and_swap((void *volatile *)(p), (void *)(cv), \ (void *)(sv)) #define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ @@ -523,12 +532,12 @@ extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v); inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) { - kmp_int32 tmp = __sync_lock_test_and_set((kmp_int32 *)p, *(kmp_int32 *)&v); + kmp_int32 tmp = __sync_lock_test_and_set(CAST_FLT_INT(p), *(kmp_int32 *)&v); return *(kmp_real32 *)&tmp; } inline kmp_real64 KMP_XCHG_REAL64(volatile kmp_real64 *p, kmp_real64 v) { - kmp_int64 tmp = __sync_lock_test_and_set((kmp_int64 *)p, *(kmp_int64 *)&v); + kmp_int64 tmp = __sync_lock_test_and_set(CAST_DBL_INT(p), *(kmp_int64 *)&v); return *(kmp_real64 *)&tmp; } @@ -607,12 +616,12 @@ #if KMP_ARCH_X86 #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ - __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ - (kmp_int32)(sv)) + __kmp_compare_and_store32(RCAST(volatile kmp_int32 *, p), \ + RCAST(kmp_int32, cv), RCAST(kmp_int32, sv)) #else /* 64 bit pointers */ #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ - __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ - (kmp_int64)(sv)) + __kmp_compare_and_store64(RCAST(volatile kmp_int64 *, p), \ + 
RCAST(kmp_int64, cv), RCAST(kmp_int64, sv)) #endif /* KMP_ARCH_X86 */ #define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ Index: openmp/trunk/runtime/src/kmp_runtime.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_runtime.cpp +++ openmp/trunk/runtime/src/kmp_runtime.cpp @@ -1888,8 +1888,7 @@ // we were called from GNU native code KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); return FALSE; - } - else { + } else { KMP_ASSERT2(call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter"); } @@ -3378,9 +3377,9 @@ __kmp_printf("\n------------------------------\nPools\n----------------------" "--------\n"); __kmp_print_structure_thread("Thread pool: ", - (kmp_info_t *)__kmp_thread_pool); + CCAST(kmp_info_t *, __kmp_thread_pool)); __kmp_print_structure_team("Team pool: ", - (kmp_team_t *)__kmp_team_pool); + CCAST(kmp_team_t *, __kmp_team_pool)); __kmp_printf("\n"); // Free team list. @@ -4148,7 +4147,7 @@ /* first, try to get one from the thread pool */ if (__kmp_thread_pool) { - new_thr = (kmp_info_t *)__kmp_thread_pool; + new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; if (new_thr == __kmp_thread_pool_insert_pt) { __kmp_thread_pool_insert_pt = NULL; @@ -5097,7 +5096,7 @@ /* next, let's try to take one from the team pool */ KMP_MB(); - for (team = (kmp_team_t *)__kmp_team_pool; (team);) { + for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */ if (team->t.t_max_nproc >= max_nproc) { @@ -5322,7 +5321,7 @@ /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ - team->t.t_next_pool = (kmp_team_t *)__kmp_team_pool; + team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); __kmp_team_pool = (volatile kmp_team_t *)team; } @@ -5420,7 +5419,7 @@ if (__kmp_thread_pool_insert_pt != NULL) { scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); } else { - scan = (kmp_info_t **)&__kmp_thread_pool; + scan = CCAST(kmp_info_t **, &__kmp_thread_pool); } for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); scan = &((*scan)->th.th_next_pool)) @@ -5704,7 +5703,7 @@ // so there are no harmful side effects. if (thread->th.th_active_in_pool) { thread->th.th_active_in_pool = FALSE; - KMP_TEST_THEN_DEC32((kmp_int32 *)&__kmp_thread_pool_active_nth); + KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, &__kmp_thread_pool_active_nth)); KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); } @@ -5839,7 +5838,7 @@ // This is valid for now, but be careful if threads are reaped sooner. while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. // Get the next thread from the pool. - kmp_info_t *thread = (kmp_info_t *)__kmp_thread_pool; + kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); __kmp_thread_pool = thread->th.th_next_pool; // Reap it. KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); @@ -5852,7 +5851,7 @@ // Reap teams. while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. // Get the next team from the pool. - kmp_team_t *team = (kmp_team_t *)__kmp_team_pool; + kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); __kmp_team_pool = team->t.t_next_pool; // Reap it. 
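The kmp_os.h hunks a little further up introduce the CCAST/RCAST helpers used throughout this patch: in C++ they are checked const_cast/reinterpret_cast, in C they fall back to plain casts. The angle-bracket template arguments were lost in the listing above, so the definitions below are a presumed reconstruction; the usage example mirrors lines such as CCAST(kmp_info_t *, __kmp_thread_pool) in kmp_runtime.cpp, with illustrative type names:

// Presumed reconstruction of the new kmp_os.h helpers (template arguments
// stripped by formatting above); the C branch is simply (type)(var).
#define CCAST(type, var) const_cast<type>(var)
#define RCAST(type, var) reinterpret_cast<type>(var)

// Illustrative stand-ins for kmp_info_t and the volatile __kmp_thread_pool.
struct thread_rec {
  thread_rec *next;
};
static volatile thread_rec *thread_pool = nullptr;

int main() {
  // Old style: (thread_rec *)thread_pool, a C cast that would also accept a
  // completely unrelated type. CCAST can only drop cv-qualifiers, so e.g.
  // CCAST(int *, thread_pool) fails to compile instead of converting silently;
  // RCAST plays the same role where a reinterpretation is genuinely intended.
  thread_rec *head = CCAST(thread_rec *, thread_pool);
  (void)head;
  return 0;
}
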
team->t.t_next_pool = NULL; @@ -7234,7 +7233,7 @@ #endif #if KMP_AFFINITY_SUPPORTED - KMP_INTERNAL_FREE((void *)__kmp_cpuinfo_file); + KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); __kmp_cpuinfo_file = NULL; #endif /* KMP_AFFINITY_SUPPORTED */ Index: openmp/trunk/runtime/src/kmp_settings.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_settings.cpp +++ openmp/trunk/runtime/src/kmp_settings.cpp @@ -910,14 +910,13 @@ // default setting __kmp_generate_warnings = kmp_warnings_explicit; } -} // __kmp_env_parse_warnings +} // __kmp_stg_parse_warnings static void __kmp_stg_print_warnings(kmp_str_buf_t *buffer, char const *name, void *data) { - __kmp_stg_print_bool( - buffer, name, __kmp_generate_warnings); // AC: TODO: change to print_int? -} // __kmp_env_print_warnings // (needs - // documentation change)... + // AC: TODO: change to print_int? (needs documentation change) + __kmp_stg_print_bool(buffer, name, __kmp_generate_warnings); +} // __kmp_stg_print_warnings // ----------------------------------------------------------------------------- // OMP_NESTED, OMP_NUM_THREADS @@ -1386,7 +1385,7 @@ if ((strcmp(var, name) == 0) && (value != 0)) { char *comma; - comma = (char *)strchr(value, ','); + comma = CCAST(char *, strchr(value, ',')); __kmp_barrier_gather_branch_bits[i] = (kmp_uint32)__kmp_str_to_int(value, ','); /* is there a specified release parameter? */ @@ -1451,7 +1450,7 @@ if ((strcmp(var, name) == 0) && (value != 0)) { int j; - char *comma = (char *)strchr(value, ','); + char *comma = CCAST(char *, strchr(value, ',')); /* handle first parameter: gather pattern */ for (j = bp_linear_bar; j < bp_last_bar; j++) { @@ -1948,82 +1947,86 @@ while (*buf != '\0') { start = next = buf; - if (__kmp_match_str("none", buf, (const char **)&next)) { + if (__kmp_match_str("none", buf, CCAST(const char **, &next))) { set_type(affinity_none); #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; #endif buf = next; - } else if (__kmp_match_str("scatter", buf, (const char **)&next)) { + } else if (__kmp_match_str("scatter", buf, CCAST(const char **, &next))) { set_type(affinity_scatter); #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; #endif buf = next; - } else if (__kmp_match_str("compact", buf, (const char **)&next)) { + } else if (__kmp_match_str("compact", buf, CCAST(const char **, &next))) { set_type(affinity_compact); #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; #endif buf = next; - } else if (__kmp_match_str("logical", buf, (const char **)&next)) { + } else if (__kmp_match_str("logical", buf, CCAST(const char **, &next))) { set_type(affinity_logical); #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; #endif buf = next; - } else if (__kmp_match_str("physical", buf, (const char **)&next)) { + } else if (__kmp_match_str("physical", buf, CCAST(const char **, &next))) { set_type(affinity_physical); #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; #endif buf = next; - } else if (__kmp_match_str("explicit", buf, (const char **)&next)) { + } else if (__kmp_match_str("explicit", buf, CCAST(const char **, &next))) { set_type(affinity_explicit); #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; #endif buf = next; - } else if (__kmp_match_str("balanced", buf, (const char **)&next)) { + } else if (__kmp_match_str("balanced", buf, CCAST(const char **, &next))) { set_type(affinity_balanced); #if 
OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; #endif buf = next; - } else if (__kmp_match_str("disabled", buf, (const char **)&next)) { + } else if (__kmp_match_str("disabled", buf, CCAST(const char **, &next))) { set_type(affinity_disabled); #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; #endif buf = next; - } else if (__kmp_match_str("verbose", buf, (const char **)&next)) { + } else if (__kmp_match_str("verbose", buf, CCAST(const char **, &next))) { set_verbose(TRUE); buf = next; - } else if (__kmp_match_str("noverbose", buf, (const char **)&next)) { + } else if (__kmp_match_str("noverbose", buf, CCAST(const char **, &next))) { set_verbose(FALSE); buf = next; - } else if (__kmp_match_str("warnings", buf, (const char **)&next)) { + } else if (__kmp_match_str("warnings", buf, CCAST(const char **, &next))) { set_warnings(TRUE); buf = next; - } else if (__kmp_match_str("nowarnings", buf, (const char **)&next)) { + } else if (__kmp_match_str("nowarnings", buf, + CCAST(const char **, &next))) { set_warnings(FALSE); buf = next; - } else if (__kmp_match_str("respect", buf, (const char **)&next)) { + } else if (__kmp_match_str("respect", buf, CCAST(const char **, &next))) { set_respect(TRUE); buf = next; - } else if (__kmp_match_str("norespect", buf, (const char **)&next)) { + } else if (__kmp_match_str("norespect", buf, CCAST(const char **, &next))) { set_respect(FALSE); buf = next; - } else if (__kmp_match_str("duplicates", buf, (const char **)&next) || - __kmp_match_str("dups", buf, (const char **)&next)) { + } else if (__kmp_match_str("duplicates", buf, + CCAST(const char **, &next)) || + __kmp_match_str("dups", buf, CCAST(const char **, &next))) { set_dups(TRUE); buf = next; - } else if (__kmp_match_str("noduplicates", buf, (const char **)&next) || - __kmp_match_str("nodups", buf, (const char **)&next)) { + } else if (__kmp_match_str("noduplicates", buf, + CCAST(const char **, &next)) || + __kmp_match_str("nodups", buf, CCAST(const char **, &next))) { set_dups(FALSE); buf = next; - } else if (__kmp_match_str("granularity", buf, (const char **)&next) || - __kmp_match_str("gran", buf, (const char **)&next)) { + } else if (__kmp_match_str("granularity", buf, + CCAST(const char **, &next)) || + __kmp_match_str("gran", buf, CCAST(const char **, &next))) { SKIP_WS(next); if (*next != '=') { EMIT_WARN(TRUE, (AffInvalidParam, name, start)); @@ -2033,23 +2036,23 @@ SKIP_WS(next); buf = next; - if (__kmp_match_str("fine", buf, (const char **)&next)) { + if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) { set_gran(affinity_gran_fine, -1); buf = next; - } else if (__kmp_match_str("thread", buf, (const char **)&next)) { + } else if (__kmp_match_str("thread", buf, CCAST(const char **, &next))) { set_gran(affinity_gran_thread, -1); buf = next; - } else if (__kmp_match_str("core", buf, (const char **)&next)) { + } else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) { set_gran(affinity_gran_core, -1); buf = next; - } else if (__kmp_match_str("package", buf, (const char **)&next)) { + } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) { set_gran(affinity_gran_package, -1); buf = next; - } else if (__kmp_match_str("node", buf, (const char **)&next)) { + } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) { set_gran(affinity_gran_node, -1); buf = next; #if KMP_GROUP_AFFINITY - } else if (__kmp_match_str("group", buf, (const char **)&next)) { + } else if (__kmp_match_str("group", buf, 
CCAST(const char **, &next))) { set_gran(affinity_gran_group, -1); buf = next; #endif /* KMP_GROUP AFFINITY */ @@ -2065,7 +2068,7 @@ EMIT_WARN(TRUE, (AffInvalidParam, name, start)); continue; } - } else if (__kmp_match_str("proclist", buf, (const char **)&next)) { + } else if (__kmp_match_str("proclist", buf, CCAST(const char **, &next))) { char *temp_proclist; SKIP_WS(next); @@ -2081,8 +2084,8 @@ } next++; // skip '[' buf = next; - if (!__kmp_parse_affinity_proc_id_list(name, buf, (const char **)&next, - &temp_proclist)) { + if (!__kmp_parse_affinity_proc_id_list( + name, buf, CCAST(const char **, &next), &temp_proclist)) { // warning already emitted. SKIP_TO(next, ']'); if (*next == ']') @@ -2138,7 +2141,7 @@ #undef set_respect #undef set_granularity - __kmp_str_free((const char **)&buffer); + __kmp_str_free(CCAST(const char **, &buffer)); if (proclist) { if (!type) { @@ -2932,11 +2935,11 @@ break; #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -# if KMP_USE_HWLOC +#if KMP_USE_HWLOC case affinity_top_method_hwloc: value = "hwloc"; break; -# endif +#endif case affinity_top_method_cpuinfo: value = "cpuinfo"; @@ -3269,9 +3272,9 @@ do { char sentinel; - semicolon = (char *)strchr(value, ';'); + semicolon = CCAST(char *, strchr(value, ';')); if (*value && semicolon != value) { - char *comma = (char *)strchr(value, ','); + char *comma = CCAST(char *, strchr(value, ',')); if (comma) { ++comma; @@ -3336,7 +3339,7 @@ if (value) { length = KMP_STRLEN(value); if (length) { - char *comma = (char *)strchr(value, ','); + char *comma = CCAST(char *, strchr(value, ',')); if (value[length - 1] == '"' || value[length - 1] == '\'') KMP_WARNING(UnbalancedQuotes, name); /* get the specified scheduling style */ @@ -4079,9 +4082,9 @@ // Value example: 1s,5c@3,2T // Which means "use 1 socket, 5 cores with offset 3, 2 threads per core" static int parsed = 0; - if( strcmp(name, "KMP_PLACE_THREADS") == 0 ) { - KMP_INFORM(EnvVarDeprecated,name,"KMP_HW_SUBSET"); - if( parsed == 1 ) { + if (strcmp(name, "KMP_PLACE_THREADS") == 0) { + KMP_INFORM(EnvVarDeprecated, name, "KMP_HW_SUBSET"); + if (parsed == 1) { return; // already parsed KMP_HW_SUBSET } } @@ -4093,7 +4096,7 @@ size_t len = 0, mlen = MAX_STR_LEN; int level = 0; // Canonize the string (remove spaces, unify delimiters, etc.) - char *pos = (char *)value; + char *pos = CCAST(char *, value); while (*pos && mlen) { if (*pos != ' ') { // skip spaces if (len == 0 && *pos == ':') { @@ -4212,7 +4215,7 @@ } static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, - void *data ) { + void *data) { if (__kmp_hws_requested) { int comma = 0; kmp_str_buf_t buf; @@ -4228,26 +4231,26 @@ comma = 1; } if (__kmp_hws_node.num) { - __kmp_str_buf_print(&buf, "%s%dn", comma?",":"", __kmp_hws_node.num); + __kmp_str_buf_print(&buf, "%s%dn", comma ? "," : "", __kmp_hws_node.num); if (__kmp_hws_node.offset) __kmp_str_buf_print(&buf, "@%d", __kmp_hws_node.offset); comma = 1; } if (__kmp_hws_tile.num) { - __kmp_str_buf_print(&buf, "%s%dL2", comma?",":"", __kmp_hws_tile.num); + __kmp_str_buf_print(&buf, "%s%dL2", comma ? "," : "", __kmp_hws_tile.num); if (__kmp_hws_tile.offset) __kmp_str_buf_print(&buf, "@%d", __kmp_hws_tile.offset); comma = 1; } if (__kmp_hws_core.num) { - __kmp_str_buf_print(&buf, "%s%dc", comma?",":"", __kmp_hws_core.num); + __kmp_str_buf_print(&buf, "%s%dc", comma ? 
"," : "", __kmp_hws_core.num); if (__kmp_hws_core.offset) __kmp_str_buf_print(&buf, "@%d", __kmp_hws_core.offset); comma = 1; } if (__kmp_hws_proc.num) - __kmp_str_buf_print(&buf, "%s%dt", comma?",":"", __kmp_hws_proc.num); - __kmp_str_buf_print(buffer, "%s'\n", buf.str ); + __kmp_str_buf_print(&buf, "%s%dt", comma ? "," : "", __kmp_hws_proc.num); + __kmp_str_buf_print(buffer, "%s'\n", buf.str); __kmp_str_buf_free(&buf); } } @@ -4582,8 +4585,8 @@ } // __kmp_stg_find static int __kmp_stg_cmp(void const *_a, void const *_b) { - kmp_setting_t *a = (kmp_setting_t *)_a; - kmp_setting_t *b = (kmp_setting_t *)_b; + kmp_setting_t *a = RCAST(kmp_setting_t *, CCAST(void *, _a)); + kmp_setting_t *b = RCAST(kmp_setting_t *, CCAST(void *, _b)); // Process KMP_AFFINITY last. // It needs to come after OMP_PLACES and GOMP_CPU_AFFINITY. @@ -4623,11 +4626,13 @@ // assignments // !!! rivals[ i ++ ] = ...; static kmp_setting_t *volatile rivals[4]; - static kmp_stg_ss_data_t kmp_data = {1, (kmp_setting_t **)rivals}; + static kmp_stg_ss_data_t kmp_data = {1, CCAST(kmp_setting_t **, rivals)}; #ifdef KMP_GOMP_COMPAT - static kmp_stg_ss_data_t gomp_data = {1024, (kmp_setting_t **)rivals}; + static kmp_stg_ss_data_t gomp_data = {1024, + CCAST(kmp_setting_t **, rivals)}; #endif - static kmp_stg_ss_data_t omp_data = {1024, (kmp_setting_t **)rivals}; + static kmp_stg_ss_data_t omp_data = {1024, + CCAST(kmp_setting_t **, rivals)}; int i = 0; rivals[i++] = kmp_stacksize; @@ -4656,8 +4661,8 @@ // !!! volatile keyword is Intel (R) C Compiler bug CQ49908 workaround. static kmp_setting_t *volatile rivals[3]; - static kmp_stg_wp_data_t kmp_data = {0, (kmp_setting_t **)rivals}; - static kmp_stg_wp_data_t omp_data = {1, (kmp_setting_t **)rivals}; + static kmp_stg_wp_data_t kmp_data = {0, CCAST(kmp_setting_t **, rivals)}; + static kmp_stg_wp_data_t omp_data = {1, CCAST(kmp_setting_t **, rivals)}; int i = 0; rivals[i++] = kmp_library; @@ -4690,11 +4695,10 @@ rivals[i++] = omp_thread_limit; }; // if rivals[i++] = NULL; - - kmp_all_threads->data = (void *)&rivals; - kmp_max_threads->data = (void *)&rivals; + kmp_all_threads->data = CCAST(kmp_setting_t **, rivals); + kmp_max_threads->data = CCAST(kmp_setting_t **, rivals); if (omp_thread_limit != NULL) { - omp_thread_limit->data = (void *)&rivals; + omp_thread_limit->data = CCAST(kmp_setting_t **, rivals); }; // if } @@ -4722,11 +4726,11 @@ #ifdef KMP_GOMP_COMPAT rivals[i++] = gomp_cpu_affinity; - gomp_cpu_affinity->data = (void *)&rivals; + gomp_cpu_affinity->data = CCAST(kmp_setting_t **, rivals); #endif rivals[i++] = omp_proc_bind; - omp_proc_bind->data = (void *)&rivals; + omp_proc_bind->data = CCAST(kmp_setting_t **, rivals); rivals[i++] = NULL; #if OMP_40_ENABLED @@ -4741,7 +4745,7 @@ places_rivals[i++] = gomp_cpu_affinity; #endif places_rivals[i++] = omp_places; - omp_places->data = (void *)&places_rivals; + omp_places->data = CCAST(kmp_setting_t **, places_rivals); places_rivals[i++] = NULL; #endif } @@ -4758,8 +4762,10 @@ // !!! volatile keyword is Intel (R) C Compiler bug CQ49908 workaround. 
static kmp_setting_t *volatile rivals[3]; - static kmp_stg_fr_data_t force_data = {1, (kmp_setting_t **)rivals}; - static kmp_stg_fr_data_t determ_data = {0, (kmp_setting_t **)rivals}; + static kmp_stg_fr_data_t force_data = {1, + CCAST(kmp_setting_t **, rivals)}; + static kmp_stg_fr_data_t determ_data = {0, + CCAST(kmp_setting_t **, rivals)}; int i = 0; rivals[i++] = kmp_force_red; Index: openmp/trunk/runtime/src/kmp_str.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_str.cpp +++ openmp/trunk/runtime/src/kmp_str.cpp @@ -258,9 +258,9 @@ } // kmp_str_fname_init void __kmp_str_fname_free(kmp_str_fname_t *fname) { - __kmp_str_free((char const **)(&fname->path)); - __kmp_str_free((char const **)(&fname->dir)); - __kmp_str_free((char const **)(&fname->base)); + __kmp_str_free(CCAST(char const **, &fname->path)); + __kmp_str_free(CCAST(char const **, &fname->dir)); + __kmp_str_free(CCAST(char const **, &fname->base)); } // kmp_str_fname_free int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern) { @@ -329,7 +329,7 @@ void __kmp_str_loc_free(kmp_str_loc_t *loc) { __kmp_str_fname_free(&loc->fname); - __kmp_str_free((const char **)&(loc->_bulk)); + __kmp_str_free(CCAST(const char **, &(loc->_bulk))); loc->file = NULL; loc->func = NULL; } // kmp_str_loc_free @@ -430,7 +430,7 @@ void __kmp_str_free(char const **str) { KMP_DEBUG_ASSERT(str != NULL); - KMP_INTERNAL_FREE((void *)*str); + KMP_INTERNAL_FREE(CCAST(char *, *str)); *str = NULL; } // func __kmp_str_free Index: openmp/trunk/runtime/src/kmp_taskdeps.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_taskdeps.cpp +++ openmp/trunk/runtime/src/kmp_taskdeps.cpp @@ -46,7 +46,7 @@ } static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) { - KMP_TEST_THEN_INC32(&node->dn.nrefs); + KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, &node->dn.nrefs)); return node; } @@ -54,7 +54,7 @@ if (!node) return; - kmp_int32 n = KMP_TEST_THEN_DEC32(&node->dn.nrefs) - 1; + kmp_int32 n = KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, &node->dn.nrefs)) - 1; if (n == 0) { KMP_ASSERT(node->dn.nrefs == 0); #if USE_FAST_MEMORY @@ -372,8 +372,10 @@ // Update predecessors and obtain current value to check if there are still // any outstandig dependences (some tasks may have finished while we processed // the dependences) - npredecessors = KMP_TEST_THEN_ADD32(&node->dn.npredecessors, npredecessors) + - npredecessors; + npredecessors = + KMP_TEST_THEN_ADD32(CCAST(kmp_int32 *, &node->dn.npredecessors), + npredecessors) + + npredecessors; KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n", gtid, npredecessors, taskdata)); @@ -410,8 +412,8 @@ for (kmp_depnode_list_t *p = node->dn.successors; p; p = next) { kmp_depnode_t *successor = p->node; kmp_int32 npredecessors = - KMP_TEST_THEN_DEC32(&successor->dn.npredecessors) - 1; - + KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, &successor->dn.npredecessors)) - + 1; // successor task can be NULL for wait_depends or because deps are still // being processed if (npredecessors == 0) { Index: openmp/trunk/runtime/src/kmp_tasking.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_tasking.cpp +++ openmp/trunk/runtime/src/kmp_tasking.cpp @@ -579,9 +579,9 @@ #endif KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); - kmp_int32 children = - KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_allocated_child_tasks)) - - 1; + kmp_int32 children 
= KMP_TEST_THEN_DEC32(CCAST( + kmp_int32 *, &taskdata->td_allocated_child_tasks)) - + 1; KMP_DEBUG_ASSERT(children >= 0); // Now, go up the ancestor tree to see if any ancestors can now be freed. @@ -604,7 +604,7 @@ // Predecrement simulated by "- 1" calculation children = KMP_TEST_THEN_DEC32( - (kmp_int32 *)(&taskdata->td_allocated_child_tasks)) - + CCAST(kmp_int32 *, &taskdata->td_allocated_child_tasks)) - 1; KMP_DEBUG_ASSERT(children >= 0); } @@ -684,8 +684,8 @@ if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { // Predecrement simulated by "- 1" calculation children = - KMP_TEST_THEN_DEC32( - (kmp_int32 *)(&taskdata->td_parent->td_incomplete_child_tasks)) - + KMP_TEST_THEN_DEC32(CCAST( + kmp_int32 *, &taskdata->td_parent->td_incomplete_child_tasks)) - 1; KMP_DEBUG_ASSERT(children >= 0); #if OMP_40_ENABLED @@ -1110,7 +1110,8 @@ if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) #endif { - KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_incomplete_child_tasks)); + KMP_TEST_THEN_INC32( + CCAST(kmp_int32 *, &parent_task->td_incomplete_child_tasks)); #if OMP_40_ENABLED if (parent_task->td_taskgroup) KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count)); @@ -1119,7 +1120,7 @@ // implicit not deallocated if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) { KMP_TEST_THEN_INC32( - (kmp_int32 *)(&taskdata->td_parent->td_allocated_child_tasks)); + CCAST(kmp_int32 *, &taskdata->td_parent->td_allocated_child_tasks)); } } @@ -1511,7 +1512,9 @@ thread->th.th_task_team->tt.tt_found_proxy_tasks); #endif if (must_wait) { - kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U); + kmp_flag_32 flag( + RCAST(volatile kmp_uint32 *, &taskdata->td_incomplete_child_tasks), + 0U); while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), @@ -1845,7 +1848,7 @@ if (!taskdata->td_flags.team_serial) #endif { - kmp_flag_32 flag(&(taskgroup->count), 0U); + kmp_flag_32 flag(RCAST(kmp_uint32 *, &taskgroup->count), 0U); while (TCR_4(taskgroup->count) != 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), @@ -1960,11 +1963,11 @@ // __kmp_steal_task: remove a task from another thread's deque // Assume that calling thread has already checked existence of // task_team thread_data before calling this routine. -static kmp_task_t * -__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team, - volatile kmp_uint32 *unfinished_threads, int *thread_finished, - kmp_int32 is_constrained) -{ +static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid, + kmp_task_team_t *task_team, + volatile kmp_int32 *unfinished_threads, + int *thread_finished, + kmp_int32 is_constrained) { kmp_task_t *task; kmp_taskdata_t *taskdata; kmp_thread_data_t *victim_td, *threads_data; @@ -2052,9 +2055,9 @@ // We need to un-mark this victim as a finished victim. This must be done // before releasing the lock, or else other threads (starting with the // master victim) might be prematurely released from the barrier!!! 
- kmp_uint32 count; + kmp_int32 count; - count = KMP_TEST_THEN_INC32((kmp_int32 *)unfinished_threads); + count = KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, unfinished_threads)); KA_TRACE( 20, @@ -2066,7 +2069,6 @@ TCW_4(victim_td->td.td_deque_ntasks, TCR_4(victim_td->td.td_deque_ntasks) - 1); - __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); KMP_COUNT_BLOCK(TASK_stolen); @@ -2082,7 +2084,6 @@ return task; } - // __kmp_execute_tasks_template: Choose and execute tasks until either the // condition is statisfied (return true) or there are none left (return false). // @@ -2102,7 +2103,7 @@ kmp_task_t *task; kmp_info_t *other_thread; kmp_taskdata_t *current_task = thread->th.th_current_task; - volatile kmp_uint32 *unfinished_threads; + volatile kmp_int32 *unfinished_threads; kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0, tid = thread->th.th_info.ds.ds_tid; @@ -2127,7 +2128,7 @@ #else KMP_DEBUG_ASSERT(nthreads > 1); #endif - KMP_DEBUG_ASSERT((int)(TCR_4(*unfinished_threads)) >= 0); + KMP_DEBUG_ASSERT(TCR_4(*unfinished_threads) >= 0); while (1) { // Outer loop keeps trying to find tasks in case of single thread // getting tasks from target constructs @@ -2171,7 +2172,8 @@ asleep = 0; if ((__kmp_tasking_mode == tskm_task_teams) && (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && - (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) { + (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != + NULL)) { asleep = 1; __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc); @@ -2265,9 +2267,9 @@ // done. This decrement might be to the spin location, and result in the // termination condition being satisfied. if (!*thread_finished) { - kmp_uint32 count; + kmp_int32 count; - count = KMP_TEST_THEN_DEC32((kmp_int32 *)unfinished_threads) - 1; + count = KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, unfinished_threads)) - 1; KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec " "unfinished_threads to %d task_team=%p\n", gtid, count, task_team)); @@ -2388,7 +2390,8 @@ // To work around this, __kmp_execute_tasks_template() periodically checks // see if other threads are sleeping (using the same random mechanism that // is used for task stealing) and awakens them if they are. - if ((sleep_loc = TCR_PTR(thread->th.th_sleep_loc)) != NULL) { + if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != + NULL) { KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n", __kmp_gtid_from_thread(this_thr), __kmp_gtid_from_thread(thread))); @@ -2762,7 +2765,7 @@ // TODO: GEH - this may be is wrong because some sync would be necessary // in case threads are added to the pool during the traversal. Need to // verify that lock for thread pool is held when calling this routine. - for (thread = (kmp_info_t *)__kmp_thread_pool; thread != NULL; + for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; thread = thread->th.th_next_pool) { #if KMP_OS_WINDOWS DWORD exit_val; @@ -2789,7 +2792,8 @@ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { volatile void *sleep_loc; // If the thread is sleeping, awaken it. - if ((sleep_loc = TCR_PTR(thread->th.th_sleep_loc)) != NULL) { + if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != + NULL) { KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", @@ -2914,7 +2918,9 @@ // Worker threads may have dropped through to release phase, but could // still be executing tasks. Wait here for tasks to complete. 
To avoid // memory contention, only master thread checks termination condition. - kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U); + kmp_flag_32 flag( + RCAST(volatile kmp_uint32 *, &task_team->tt.tt_unfinished_threads), + 0U); flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } // Deactivate the old task team, so that the worker threads will stop @@ -2944,8 +2950,9 @@ // barrier. It is a full barrier itself, which unfortunately turns regular // barriers into double barriers and join barriers into 1 1/2 barriers. void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { - volatile kmp_uint32 *spin = - &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads; + volatile kmp_uint32 *spin = RCAST( + volatile kmp_uint32 *, + &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); int flag = FALSE; KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); @@ -2957,7 +2964,7 @@ &flag USE_ITT_BUILD_ARG(NULL), 0)) { #if USE_ITT_BUILD // TODO: What about itt_sync_obj?? - KMP_FSYNC_SPIN_PREPARE(spin); + KMP_FSYNC_SPIN_PREPARE(CCAST(void *, RCAST(volatile void *, spin))); #endif /* USE_ITT_BUILD */ if (TCR_4(__kmp_global.g.g_done)) { @@ -2968,7 +2975,7 @@ KMP_YIELD(TRUE); // GH: We always yield here } #if USE_ITT_BUILD - KMP_FSYNC_SPIN_ACQUIRED((void *)spin); + KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, RCAST(volatile void *, spin))); #endif /* USE_ITT_BUILD */ } @@ -3080,7 +3087,7 @@ taskdata->td_flags.complete = 1; // mark the task as completed if (taskdata->td_taskgroup) - KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count)); + KMP_TEST_THEN_DEC32(&taskdata->td_taskgroup->count); // Create an imaginary children for this task so the bottom half cannot // release the task before we have completed the second top half @@ -3093,7 +3100,7 @@ // Predecrement simulated by "- 1" calculation children = KMP_TEST_THEN_DEC32( - (kmp_int32 *)(&taskdata->td_parent->td_incomplete_child_tasks)) - + CCAST(kmp_int32 *, &taskdata->td_parent->td_incomplete_child_tasks)) - 1; KMP_DEBUG_ASSERT(children >= 0); @@ -3245,14 +3252,15 @@ // Only need to keep track of child task counts if team parallel and tasking // not serialized if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { - KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_incomplete_child_tasks)); + KMP_TEST_THEN_INC32( + CCAST(kmp_int32 *, &parent_task->td_incomplete_child_tasks)); if (parent_task->td_taskgroup) - KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count)); + KMP_TEST_THEN_INC32(&parent_task->td_taskgroup->count); // Only need to keep track of allocated child tasks for explicit tasks since // implicit not deallocated if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) KMP_TEST_THEN_INC32( - (kmp_int32 *)(&taskdata->td_parent->td_allocated_child_tasks)); + CCAST(kmp_int32 *, &taskdata->td_parent->td_allocated_child_tasks)); } KA_TRACE(20, Index: openmp/trunk/runtime/src/kmp_taskq.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_taskq.cpp +++ openmp/trunk/runtime/src/kmp_taskq.cpp @@ -272,7 +272,8 @@ if (in_parallel) { if (queue->tq_taskq_slot != NULL) { __kmp_printf(" TaskQ slot:\n"); - __kmp_dump_thunk(tq, (kmpc_thunk_t *)queue->tq_taskq_slot, global_tid); + __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot), + global_tid); __kmp_printf("\n"); } //__kmp_release_lock(& queue->tq_queue_lck, global_tid); @@ -348,7 +349,7 @@ KMP_MB(); if 
(curr_queue->tq_first_child) { - for (queue = (kmpc_task_queue_t *)curr_queue->tq_first_child; + for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); queue != NULL; queue = queue->tq_next_child) { __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid); } @@ -541,8 +542,8 @@ __kmpc_taskq_free(p->tq_queue, global_tid); /* free shared var structure storage */ - __kmpc_taskq_free((void *)p->tq_shareds[0].ai_data, global_tid); - + __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data), + global_tid); /* free array of pointers to shared vars storage */ __kmpc_taskq_free(p->tq_shareds, global_tid); @@ -798,7 +799,7 @@ (queue->tq_nfull <= queue->tq_hiwat)) { /* if there's enough room in the queue and the dispatcher */ /* (taskq task) is available, schedule more tasks */ - pt = (kmpc_thunk_t *)queue->tq_taskq_slot; + pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot); queue->tq_taskq_slot = NULL; } else if (queue->tq_nfull == 0 || queue->tq_th_thunks[tid].ai_data >= @@ -845,7 +846,7 @@ // Seems to work without this call for digital/alpha, needed for IBM/RS6000 KMP_MB(); - queue = (kmpc_task_queue_t *)curr_queue->tq_first_child; + queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); if (queue == NULL) { __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); return NULL; @@ -1111,7 +1112,7 @@ // Seems to work without this call for digital/alpha, needed for IBM/RS6000 KMP_MB(); - queue = (kmpc_task_queue_t *)curr_queue->tq_first_child; + queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); if (queue != NULL) { __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); return; @@ -1181,7 +1182,7 @@ kmpc_task_queue_t *queue) { kmpc_task_queue_t *next_child; - queue = (kmpc_task_queue_t *)queue->tq_first_child; + queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child); while (queue != NULL) { __kmp_remove_all_child_taskq(tq, global_tid, queue); @@ -1222,7 +1223,7 @@ if (!(thunk->th_flags & TQF_TASKQ_TASK)) { kmp_int32 index = (queue == tq->tq_root) ? tid : 0; thunk->th.th_shareds = - (kmpc_shared_vars_t *)queue->tq_shareds[index].ai_data; + CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data); if (__kmp_env_consistency_check) { __kmp_push_workshare(global_tid, @@ -1343,8 +1344,8 @@ /* enqueued, and the master thread released this barrier. This */ /* worker thread can now proceed and execute tasks. See also the */ /* TQF_RELEASE_WORKERS which is used to handle this case. 
*/ - *shareds = (kmpc_shared_vars_t *)tq->tq_root->tq_shareds[tid].ai_data; - + *shareds = + CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data); KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid)); return NULL; @@ -1418,7 +1419,7 @@ } /* create a new thunk for the taskq_task in the new_queue */ - *shareds = (kmpc_shared_vars_t *)new_queue->tq_shareds[0].ai_data; + *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data); new_taskq_thunk->th.th_shareds = *shareds; new_taskq_thunk->th_task = taskq_task; @@ -1459,7 +1460,7 @@ KMP_MB(); new_queue->tq_next_child = - (struct kmpc_task_queue_t *)curr_queue->tq_first_child; + CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child); if (curr_queue->tq_first_child != NULL) curr_queue->tq_first_child->tq_prev_child = new_queue; @@ -1920,8 +1921,8 @@ if (in_parallel) { #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - - KMP_TEST_THEN_OR32(&queue->tq_flags, (kmp_int32)TQF_ALL_TASKS_QUEUED); + KMP_TEST_THEN_OR32(CCAST(kmp_int32 *, &queue->tq_flags), + (kmp_int32)TQF_ALL_TASKS_QUEUED); #else { __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); @@ -1951,8 +1952,8 @@ queue->tq_flags |= TQF_IS_LAST_TASK; } else { #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - - KMP_TEST_THEN_OR32(&queue->tq_flags, (kmp_int32)TQF_IS_LAST_TASK); + KMP_TEST_THEN_OR32(CCAST(kmp_int32 *, &queue->tq_flags), + (kmp_int32)TQF_IS_LAST_TASK); #else { __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); @@ -2009,7 +2010,8 @@ the next to be enqueued in __kmpc_task(). */ new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid); - new_thunk->th.th_shareds = (kmpc_shared_vars_t *)queue->tq_shareds[0].ai_data; + new_thunk->th.th_shareds = + CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data); new_thunk->th_encl_thunk = NULL; new_thunk->th_task = task; Index: openmp/trunk/runtime/src/kmp_utility.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_utility.cpp +++ openmp/trunk/runtime/src/kmp_utility.cpp @@ -105,7 +105,7 @@ return result; }; // if value = strtod(frequency, - (char **)&unit); // strtod() does not like "char const *". + CCAST(char **, &unit)); // strtod() does not like "const" if (0 < value && value <= DBL_MAX) { // Good value (not overflow, underflow, etc). if (strcmp(unit, "MHz") == 0) { Index: openmp/trunk/runtime/src/kmp_wait_release.h =================================================================== --- openmp/trunk/runtime/src/kmp_wait_release.h +++ openmp/trunk/runtime/src/kmp_wait_release.h @@ -108,7 +108,7 @@ KMP_FSYNC_SPIN_INIT(spin, NULL); if (flag->done_check()) { - KMP_FSYNC_SPIN_ACQUIRED(spin); + KMP_FSYNC_SPIN_ACQUIRED(CCAST(typename C::flag_t *, spin)); return; } th_gtid = this_thr->th.th_info.ds.ds_gtid; @@ -216,7 +216,7 @@ } // if } // if - KMP_FSYNC_SPIN_PREPARE(spin); + KMP_FSYNC_SPIN_PREPARE(CCAST(typename C::flag_t *, spin)); if (TCR_4(__kmp_global.g.g_done)) { if (__kmp_global.g.g_abort) __kmp_abort_thread(); @@ -235,7 +235,7 @@ in_pool = !!TCR_4(this_thr->th.th_in_pool); if (in_pool != !!this_thr->th.th_active_in_pool) { if (in_pool) { // Recently transferred from team to pool - KMP_TEST_THEN_INC32((kmp_int32 *)&__kmp_thread_pool_active_nth); + KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, &__kmp_thread_pool_active_nth)); this_thr->th.th_active_in_pool = TRUE; /* Here, we cannot assert that: KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) <= @@ -245,7 +245,7 @@ inc/dec'd asynchronously by the workers. The two can get out of sync for brief periods of time. 
*/ } else { // Recently transferred from pool to team - KMP_TEST_THEN_DEC32((kmp_int32 *)&__kmp_thread_pool_active_nth); + KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, &__kmp_thread_pool_active_nth)); KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); this_thr->th.th_active_in_pool = FALSE; } @@ -327,7 +327,7 @@ } #endif - KMP_FSYNC_SPIN_ACQUIRED(spin); + KMP_FSYNC_SPIN_ACQUIRED(CCAST(typename C::flag_t *, spin)); } /* Release any threads specified as waiting on the flag by releasing the flag @@ -340,7 +340,7 @@ #endif KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%x)\n", gtid, flag->get())); KMP_DEBUG_ASSERT(flag->get()); - KMP_FSYNC_RELEASING(flag->get()); + KMP_FSYNC_RELEASING(CCAST(typename C::flag_t *, flag->get())); flag->internal_release(); @@ -374,13 +374,13 @@ static const flag_type t = flag32; static inline flag_t tcr(flag_t f) { return TCR_4(f); } static inline flag_t test_then_add4(volatile flag_t *f) { - return KMP_TEST_THEN_ADD4_32((volatile kmp_int32 *)f); + return KMP_TEST_THEN_ADD4_32(RCAST(kmp_int32 *, CCAST(flag_t *, f))); } static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_OR32((volatile kmp_int32 *)f, v); + return KMP_TEST_THEN_OR32(CCAST(flag_t *, f), v); } static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_AND32((volatile kmp_int32 *)f, v); + return KMP_TEST_THEN_AND32(CCAST(flag_t *, f), v); } }; @@ -389,13 +389,13 @@ static const flag_type t = flag64; static inline flag_t tcr(flag_t f) { return TCR_8(f); } static inline flag_t test_then_add4(volatile flag_t *f) { - return KMP_TEST_THEN_ADD4_64((volatile kmp_int64 *)f); + return KMP_TEST_THEN_ADD4_64(RCAST(kmp_int64 *, CCAST(flag_t *, f))); } static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_OR64((volatile kmp_int64 *)f, v); + return KMP_TEST_THEN_OR64(CCAST(flag_t *, f), v); } static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { - return KMP_TEST_THEN_AND64((volatile kmp_int64 *)f, v); + return KMP_TEST_THEN_AND64(CCAST(flag_t *, f), v); } }; @@ -562,7 +562,7 @@ itt_sync_obj; /**< ITT object that must be passed to new flag location. 
*/ #endif unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) { - return ((unsigned char *)loc)[offset]; + return RCAST(unsigned char *, CCAST(kmp_uint64 *, loc))[offset]; } public: @@ -626,15 +626,15 @@ } else { kmp_uint64 mask = 0; byteref(&mask, offset) = 1; - (void)KMP_TEST_THEN_OR64((volatile kmp_int64 *)get(), mask); + KMP_TEST_THEN_OR64(CCAST(kmp_uint64 *, get()), mask); } } kmp_uint64 set_sleeping() { - return KMP_TEST_THEN_OR64((kmp_int64 volatile *)get(), + return KMP_TEST_THEN_OR64(CCAST(kmp_uint64 *, get()), KMP_BARRIER_SLEEP_STATE); } kmp_uint64 unset_sleeping() { - return KMP_TEST_THEN_AND64((kmp_int64 volatile *)get(), + return KMP_TEST_THEN_AND64(CCAST(kmp_uint64 *, get()), ~KMP_BARRIER_SLEEP_STATE); } bool is_sleeping_val(kmp_uint64 old_loc) { @@ -667,7 +667,7 @@ if (!flag) return; - switch (((kmp_flag_64 *)flag)->get_type()) { + switch (RCAST(kmp_flag_64 *, CCAST(void *, flag))->get_type()) { case flag32: __kmp_resume_32(gtid, NULL); break; Index: openmp/trunk/runtime/src/z_Linux_util.cpp =================================================================== --- openmp/trunk/runtime/src/z_Linux_util.cpp +++ openmp/trunk/runtime/src/z_Linux_util.cpp @@ -1460,7 +1460,8 @@ th->th.th_active = FALSE; if (th->th.th_active_in_pool) { th->th.th_active_in_pool = FALSE; - KMP_TEST_THEN_DEC32((kmp_int32 *)&__kmp_thread_pool_active_nth); + KMP_TEST_THEN_DEC32( + CCAST(kmp_int32 *, &__kmp_thread_pool_active_nth)); KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); } deactivated = TRUE; @@ -1516,7 +1517,7 @@ if (deactivated) { th->th.th_active = TRUE; if (TCR_4(th->th.th_in_pool)) { - KMP_TEST_THEN_INC32((kmp_int32 *)&__kmp_thread_pool_active_nth); + KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, &__kmp_thread_pool_active_nth)); th->th.th_active_in_pool = TRUE; } } @@ -1568,7 +1569,7 @@ KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); if (!flag) { // coming from __kmp_null_resume_wrapper - flag = (C *)th->th.th_sleep_loc; + flag = (C *)CCAST(void *, th->th.th_sleep_loc); } // First, check if the flag is null or its type has changed. If so, someone @@ -1801,8 +1802,8 @@ mach_msg_type_number_t num = HOST_BASIC_INFO_COUNT; rc = host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&info, &num); if (rc == 0 && num == HOST_BASIC_INFO_COUNT) { -// Cannot use KA_TRACE() here because this code works before trace support is -// initialized. + // Cannot use KA_TRACE() here because this code works before trace support + // is initialized. r = info.avail_cpus; } else { KMP_WARNING(CantGetNumAvailCPU); @@ -2315,7 +2316,8 @@ #endif // USE_LOAD_BALANCE -#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || KMP_ARCH_PPC64) +#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \ + ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || KMP_ARCH_PPC64) // we really only need the case with 1 argument, because CLANG always build // a struct of pointers to shared variables referenced in the outlined function
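
Throughout this patch the C-style casts on const- and volatile-qualified pointers are replaced by the CCAST and RCAST helpers. Below is a minimal sketch of what those helpers are assumed to expand to (modeled on kmp_os.h; the in-tree definitions may differ in detail) and of the two situations they cover here: dropping qualifiers before calling the KMP_TEST_THEN_* atomics, and retyping a signed counter for code that expects an unsigned spin location. The kmp_int32/kmp_uint32 typedefs are simplified stand-ins.

#include <cstdio>

#define CCAST(type, var) const_cast<type>(var)       // drop const/volatile only
#define RCAST(type, var) reinterpret_cast<type>(var) // change the pointee type only

typedef int kmp_int32;           // simplified stand-ins for the kmp_os.h typedefs
typedef unsigned int kmp_uint32;

// A counter declared volatile in the thread structures but updated through
// atomic primitives that take a plain kmp_int32 *.
static volatile kmp_int32 unfinished_threads = 4;

int main() {
  // CCAST makes the qualifier removal explicit; unlike the old C-style cast it
  // cannot silently also change the pointee type if the declaration drifts.
  kmp_int32 *p = CCAST(kmp_int32 *, &unfinished_threads);
  --(*p); // stand-in for KMP_TEST_THEN_DEC32(p)

  // RCAST is used where a signed counter is handed to code that expects an
  // unsigned spin location (e.g. the kmp_flag_32 constructors in this patch).
  volatile kmp_uint32 *spin = RCAST(volatile kmp_uint32 *, &unfinished_threads);
  std::printf("%u threads still active\n", *spin); // prints 3
  return 0;
}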
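
Several task counters (td_incomplete_child_tasks, td_allocated_child_tasks, tt_unfinished_threads, the taskgroup count) switch from kmp_uint32 to kmp_int32, while the existing "predecrement simulated by '- 1'" idiom is kept: KMP_TEST_THEN_DEC32 returns the value the counter held before the decrement, and subtracting one yields the new value without a second read. A sketch of that idiom, using std::atomic and a hypothetical fetch_and_dec() stand-in in place of the runtime's volatile counters and KMP_TEST_THEN_DEC32:

#include <atomic>
#include <cassert>

typedef int kmp_int32;

// Hypothetical stand-in for KMP_TEST_THEN_DEC32(): atomically decrement and
// return the value the counter held *before* the decrement.
static kmp_int32 fetch_and_dec(std::atomic<kmp_int32> *c) {
  return c->fetch_sub(1);
}

int main() {
  std::atomic<kmp_int32> td_incomplete_child_tasks{1}; // one child still running

  // "Predecrement simulated by '- 1'": the new value is derived from the
  // returned old value, so no second read of the shared counter is needed.
  kmp_int32 children = fetch_and_dec(&td_incomplete_child_tasks) - 1;

  // With a signed counter this assertion can actually fire on an over-release;
  // with the old kmp_uint32 type it could only fail after wrap-around.
  assert(children >= 0);
  return children; // 0: the last child finished, the parent may be released
}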
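
The RCAST to volatile kmp_uint32 * when constructing kmp_flag_32 over the now-signed counters only changes how the waiter reads the 4-byte location; the condition it waits for is "equal to 0U", whose representation is the same for kmp_int32 and kmp_uint32. A stand-in sketch of that wait follows (the real kmp_flag_32 additionally sleeps, executes tasks while waiting, and handles ITT notification):

#include <cstdio>

typedef int kmp_int32;
typedef unsigned int kmp_uint32;

// Stand-in for the kmp_flag_32 wait loop: spin until the location reaches 0.
static void spin_until_zero(volatile kmp_uint32 *spin) {
  while (*spin != 0) {
    // the real waiter yields, sleeps, or executes tasks here
  }
}

int main() {
  volatile kmp_int32 tt_unfinished_threads = 0; // all workers already checked in
  // Viewing the same 4-byte counter as unsigned is harmless for a "== 0" wait:
  // zero has the same representation for kmp_int32 and kmp_uint32.
  spin_until_zero(
      reinterpret_cast<volatile kmp_uint32 *>(&tt_unfinished_threads));
  std::printf("task team drained\n");
  return 0;
}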