Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt
+++ CMakeLists.txt
@@ -1,4 +1,6 @@
 set(PROG miniGMG)
+list(APPEND CFLAGS -D__PRINT_NORM)
 list(APPEND LDFLAGS -lm)
-set(RUN_OPTIONS 6  2 2 2  1 1 1)
+set(FP_TOLERANCE 0.00001)
+set(RUN_OPTIONS 5  2 2 2  1 1 1)
 llvm_multisource()
Index: apply_op.inc
===================================================================
--- apply_op.inc
+++ apply_op.inc
@@ -24,6 +24,7 @@
 
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k,s;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -41,7 +42,9 @@
     double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[ __beta_k] + ghosts*(1+pencil+plane);
     double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[ __lambda] + ghosts*(1+pencil+plane);
 
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k;k++){
     for(j=0;j<dim_j;j++){
     for(i=0;i<dim_i;i++){
@@ -57,7 +60,7 @@
       Ax[ijk] = helmholtz;
     }}}
   }
-#endif
+
   domain->cycles.apply_op[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 //------------------------------------------------------------------------------------------------------------------------------
Index: exchange_boundary.inc
===================================================================
--- exchange_boundary.inc
+++ exchange_boundary.inc
@@ -141,7 +141,34 @@
     if( (domain->bufferCopies[level][buffer].isFace   && exchange_faces  ) || 
         (domain->bufferCopies[level][buffer].isEdge   && exchange_edges  ) ||
         (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){
-        DoBufferCopy(domain,level,grid_id,buffer);
+          DoBufferCopy(domain,level,grid_id,buffer);
+  }}
+  _timeEnd = CycleTime();
+  domain->cycles.pack[level] += (_timeEnd-_timeStart);
+
+  // loop through MPI send buffers and post Isend's...
+  _timeStart = CycleTime();
+  #ifdef __MPI_THREAD_MULTIPLE
+  #pragma omp parallel for schedule(dynamic,1)
+  #endif
+  for(n=nMessages/2;n<nMessages;n++){
+    MPI_Isend(buffers_packed[n],sizes_packed[n],MPI_DOUBLE,ranks_packed[n],tags_packed[n],MPI_COMM_WORLD,&requests_packed[n]);
+  }
+  _timeEnd = CycleTime();
+  domain->cycles.send[level] += (_timeEnd-_timeStart);
+  #endif
+
+
+  // exchange locally... try to hide this within the Isend latency...
+  _timeStart = CycleTime();
+#ifdef OMP
+  #pragma omp parallel for schedule(static,1)
+#endif
+  for(buffer=domain->bufferCopy_Local_Start;buffer<domain->bufferCopy_Local_End;buffer++){
+    if( (domain->bufferCopies[level][buffer].isFace   && exchange_faces  ) || 
+        (domain->bufferCopies[level][buffer].isEdge   && exchange_edges  ) ||
+        (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){
+          DoBufferCopy(domain,level,grid_id,buffer);
   }}
   _timeEnd = CycleTime();
   domain->cycles.grid2grid[level] += (_timeEnd-_timeStart);
@@ -153,7 +180,7 @@
   MPI_Waitall(nMessages,requests_packed,status_packed);
   _timeEnd = CycleTime();
   domain->cycles.wait[level] += (_timeEnd-_timeStart);
-  #endif
+
   // unpack MPI receive buffers 
   _timeStart = CycleTime();
   #pragma omp parallel for schedule(static,1)
Index: mg.h
===================================================================
--- mg.h
+++ mg.h
@@ -125,5 +125,6 @@
 void      MGBuild(domain_type * domain, double a, double b, double h0);
 void      MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm);
 void print_timing(domain_type *domain);
+void DoBufferCopy(domain_type *domain, int level, int grid_id, int buffer);
 //------------------------------------------------------------------------------------------------------------------------------
 #endif
Index: mg.c
===================================================================
--- mg.c
+++ mg.c
@@ -100,7 +100,7 @@
   int di,dj,dk;
   // FIX ghosts = array (varies with level)
   domain->rank = rank;
-  if(domain->rank==0){printf("creating domain...       ");fflush(stdout);}
+  if(domain->rank==0){printf("creating domain.       ");fflush(stdout);}
   //FIX, limit ghosts based on coarsest problem size
   if(ghosts>(subdomain_dim_i>>(numLevels-1))){if(domain->rank==0)printf("#ghosts(%d)>bottom grid size(%d)\n",ghosts,subdomain_dim_i>>(numLevels-1));exit(0);}
 
@@ -492,7 +492,7 @@
 
 
 void destroy_domain(domain_type * domain){
-  if(domain->rank==0){printf("deallocating domain...   ");fflush(stdout);}
+  if(domain->rank==0){printf("deallocating domain.   ");fflush(stdout);}
   int box;for(box=0;box<domain->subdomains_per_rank;box++){
     destroy_subdomain(&domain->subdomains[box]);
   }
@@ -581,7 +581,7 @@
 void MGBuild(domain_type * domain, double a, double b, double h0){
   int box,level,numLevels = domain->numLevels;
 
-  if(domain->rank==0){printf("MGBuild...                      ");fflush(stdout);}
+  if(domain->rank==0){printf("MGBuild.                      ");fflush(stdout);}
 
   // initialize timers...
   MGResetTimers(domain);
@@ -710,7 +710,7 @@
   #endif
 
   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
-  if(domain->rank==0){printf("MGSolve...                      ");fflush(stdout);}
+  if(domain->rank==0){printf("MGSolve.                      ");fflush(stdout);}
   uint64_t _timeStartMGSolve = CycleTime();
 
   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Index: miniGMG.c
===================================================================
--- miniGMG.c
+++ miniGMG.c
@@ -149,7 +149,7 @@
                            boundary_conditions,
                            __NumGrids,1,levels_in_vcycle);
   double h0=1.0/((double)(domain.dim.i));
-  if(MPI_Rank==0){printf("initializing alpha, beta, RHS for the ``hard problem''...");fflush(stdout);}
+  if(MPI_Rank==0){printf("initializing alpha, beta, RHS for the ``hard problem''.");fflush(stdout);}
   double  a=0.9;       // i.e. good Helmholtz
   double  b=0.9;
   initialize_problem(&domain,0,h0,a,b);
Index: miniGMG.reference_output
===================================================================
--- miniGMG.reference_output
+++ miniGMG.reference_output
@@ -1,25 +1,46 @@
 1 MPI Tasks of 1 threads
 truncating the v-cycle at 2^3 subdomains
-creating domain...       done
-  64 x 64 x 64 (per subdomain)
-  128 x 128 x 128 (per process)
-  128 x 128 x 128 (overall)
+creating domain.       done
+  32 x 32 x 32 (per subdomain)
+  64 x 64 x 64 (per process)
+  64 x 64 x 64 (overall)
   1-deep ghost zones
-  allocated 246 MB
-initializing alpha, beta, RHS for the ``hard problem''...
-  average value of f =   0.000000000000e+00
+  allocated 34 MB
+initializing alpha, beta, RHS for the ``hard problem''.
+  average value of f =  -1.110223024625e-15
 done
-MGBuild...                      
-  level= 0, eigenvalue_max ~= 1.000000e+00
-  level= 1, eigenvalue_max ~= -1.000000e+00
-  level= 2, eigenvalue_max ~= -1.000000e+00
-  level= 3, eigenvalue_max ~= -1.000000e+00
-  level= 4, eigenvalue_max ~= -1.000000e+00
-  level= 5, eigenvalue_max ~= -1.000000e+00
+MGBuild.                      
+  level= 0, eigenvalue_max ~= 1.999996e+00
+  level= 1, eigenvalue_max ~= 1.999984e+00
+  level= 2, eigenvalue_max ~= 1.999935e+00
+  level= 3, eigenvalue_max ~= 1.999740e+00
+  level= 4, eigenvalue_max ~= 1.998958e+00
 done
-MGSolve...                      done
-MGSolve...                      done
-Error test: h = 7.812500e-03, max = 0.000000e+00
-Error test: h = 7.812500e-03, L2  = 0.000000e+00
-deallocating domain...   done
+MGSolve.                      
+v-cycle= 1, norm=0.00104306838695688285 (1.043068e-03)
+v-cycle= 2, norm=0.00008508164981969001 (8.508165e-05)
+v-cycle= 3, norm=0.00001867173484947213 (1.867173e-05)
+v-cycle= 4, norm=0.00000204150498158241 (2.041505e-06)
+v-cycle= 5, norm=0.00000036859010963302 (3.685901e-07)
+v-cycle= 6, norm=0.00000004500376124219 (4.500376e-08)
+v-cycle= 7, norm=0.00000000777275418452 (7.772754e-09)
+v-cycle= 8, norm=0.00000000096441180332 (9.644118e-10)
+v-cycle= 9, norm=0.00000000016688265714 (1.668827e-10)
+v-cycle=10, norm=0.00000000002055567632 (2.055568e-11)
+done
+MGSolve.                      
+v-cycle= 1, norm=0.00104306838695688285 (1.043068e-03)
+v-cycle= 2, norm=0.00008508164981969001 (8.508165e-05)
+v-cycle= 3, norm=0.00001867173484947213 (1.867173e-05)
+v-cycle= 4, norm=0.00000204150498158241 (2.041505e-06)
+v-cycle= 5, norm=0.00000036859010963302 (3.685901e-07)
+v-cycle= 6, norm=0.00000004500376124219 (4.500376e-08)
+v-cycle= 7, norm=0.00000000777275418452 (7.772754e-09)
+v-cycle= 8, norm=0.00000000096441180332 (9.644118e-10)
+v-cycle= 9, norm=0.00000000016688265714 (1.668827e-10)
+v-cycle=10, norm=0.00000000002055567632 (2.055568e-11)
+done
+Error test: h = 1.562500e-02, max = 3.529710e-04
+Error test: h = 1.562500e-02, L2  = 5.559935e-05
+deallocating domain.   done
 exit 0
Index: misc.inc
===================================================================
--- misc.inc
+++ misc.inc
@@ -20,6 +20,7 @@
 
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -29,7 +30,9 @@
     int  dim_j = domain->subdomains[box].levels[level].dim.j;
     int  dim_i = domain->subdomains[box].levels[level].dim.i;
     double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane);
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=-ghosts;k<dim_k+ghosts;k++){
      for(j=-ghosts;j<dim_j+ghosts;j++){
       for(i=-ghosts;i<dim_i+ghosts;i++){
@@ -37,7 +40,6 @@
         grid[ijk] = 0.0;
     }}}
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -56,6 +58,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -65,7 +68,9 @@
     int  dim_j = domain->subdomains[box].levels[level].dim.j;
     int  dim_i = domain->subdomains[box].levels[level].dim.i;
     double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane);
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=-ghosts;k<dim_k+ghosts;k++){
      for(j=-ghosts;j<dim_j+ghosts;j++){
       for(i=-ghosts;i<dim_i+ghosts;i++){
@@ -74,7 +79,6 @@
         grid[ijk] = ghostZone ? 0.0 : scalar;
     }}}
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -92,6 +96,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -103,7 +108,9 @@
     double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane);
     double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane);
     double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane);
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k;k++){
      for(j=0;j<dim_j;j++){
       for(i=0;i<dim_i;i++){
@@ -111,7 +118,6 @@
         grid_c[ijk] = scale_a*grid_a[ijk] + scale_b*grid_b[ijk];
     }}}
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -129,6 +135,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -140,7 +147,9 @@
     double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane);
     double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane);
     double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane);
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k;k++){
      for(j=0;j<dim_j;j++){
       for(i=0;i<dim_i;i++){
@@ -148,7 +157,6 @@
         grid_c[ijk] = scale*grid_a[ijk]*grid_b[ijk];
     }}}
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -167,6 +175,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -177,7 +186,9 @@
     int  dim_i = domain->subdomains[box].levels[level].dim.i;
     double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane);
     double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane);
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k;k++){
      for(j=0;j<dim_j;j++){
       for(i=0;i<dim_i;i++){
@@ -185,7 +196,6 @@
         grid_c[ijk] = scale_a*grid_a[ijk];
     }}}
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -206,6 +216,7 @@
 #ifdef OMP
   // FIX, schedule(static) is a stand in to guarantee reproducibility...
   #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:a_dot_b_domain) schedule(static)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -217,7 +228,9 @@
     double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
     double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane);
     double a_dot_b_box = 0.0;
+#ifdef OMP
     #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:a_dot_b_box) schedule(static)
+#endif
     for(k=0;k<dim_k;k++){
     for(j=0;j<dim_j;j++){
     for(i=0;i<dim_i;i++){
@@ -226,7 +239,6 @@
     }}}
     a_dot_b_domain+=a_dot_b_box;
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 
   #ifdef __MPI
@@ -256,6 +268,7 @@
 #ifdef OMP
   // FIX, schedule(static) is a stand in to guarantee reproducibility...
   #pragma omp parallel for private(box) if(omp_across_boxes) reduction(max:max_norm) schedule(static)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -266,7 +279,9 @@
     int  dim_i = domain->subdomains[box].levels[level].dim.i;
     double * __restrict__ grid   = domain->subdomains[box].levels[level].grids[ grid_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
     double box_norm = 0.0;
+#ifdef OMP
     #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(max:box_norm) schedule(static)
+#endif
     for(k=0;k<dim_k;k++){
     for(j=0;j<dim_j;j++){
     for(i=0;i<dim_i;i++){
@@ -276,7 +291,6 @@
     }}}
     if(box_norm>max_norm){max_norm = box_norm;}
   } // box list
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 
   #ifdef __MPI
@@ -306,6 +320,7 @@
   double sum_domain =  0.0;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:sum_domain)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -316,7 +331,9 @@
     int  dim_i = domain->subdomains[box].levels[level].dim.i;
     double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
     double sum_box = 0.0;
+#ifdef OMP
     #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:sum_box)
+#endif
     for(k=0;k<dim_k;k++){
     for(j=0;j<dim_j;j++){
     for(i=0;i<dim_i;i++){
@@ -325,7 +342,6 @@
     }}}
     sum_domain+=sum_box;
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
   double ncells_domain = (double)domain->dim.i*(double)domain->dim.j*(double)domain->dim.k;
 
@@ -356,6 +372,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -367,7 +384,9 @@
     double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
     double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
 
+#ifdef OMP
     #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k;k++){
     for(j=0;j<dim_j;j++){
     for(i=0;i<dim_i;i++){
@@ -375,7 +394,6 @@
       grid_c[ijk] = grid_a[ijk] + shift_a;
     }}}
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -391,6 +409,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -407,7 +426,9 @@
       case 1: stride = pencil;break;//j-direction
       case 2: stride =  plane;break;//k-direction
     }
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<=dim_k;k++){ // <= to ensure you do low and high faces
      for(j=0;j<=dim_j;j++){
       for(i=0;i<=dim_i;i++){
@@ -415,6 +436,5 @@
         grid_face[ijk] = 0.5*(grid_cell[ijk-stride] + grid_cell[ijk]); // simple linear interpolation
     }}}
   }
-#endif
   domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
Index: residual.inc
===================================================================
--- residual.inc
+++ residual.inc
@@ -23,6 +23,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int pencil = domain->subdomains[box].levels[level].pencil;
@@ -40,7 +41,9 @@
     double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane);
     double * __restrict__ res    = domain->subdomains[box].levels[level].grids[  res_id] + ghosts*(1+pencil+plane);
 
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k;k++){
     for(j=0;j<dim_j;j++){
     for(i=0;i<dim_i;i++){
@@ -57,7 +60,6 @@
       res[ijk] = rhs[ijk]-helmholtz;
     }}}
   }
-#endif
   domain->cycles.residual[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -85,6 +87,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int kk,jj;
     int pencil_c = domain->subdomains[box].levels[level_c].pencil;
@@ -110,7 +113,9 @@
     double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f);
     double * __restrict__ res    = domain->subdomains[box].levels[level_c].grids[  res_id] + ghosts_c*(1+pencil_c+plane_c);
 
+#ifdef OMP
     #pragma omp parallel for private(kk,jj) if(omp_within_a_box) collapse(2)
+#endif
     for(kk=0;kk<dim_k_f;kk+=2){
     for(jj=0;jj<dim_j_f;jj+=2){
       int i,j,k;
@@ -136,7 +141,6 @@
       }
     }}}}
   }
-#endif
   domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart);
 }
 #else
@@ -158,7 +162,9 @@
   int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize);
   int box;
 
+#ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int pencil_c = domain->subdomains[box].levels[level_c].pencil;
     int  plane_c = domain->subdomains[box].levels[level_c].plane;
@@ -184,7 +190,9 @@
     double * __restrict__ res    = domain->subdomains[box].levels[level_c].grids[  res_id] + ghosts_c*(1+pencil_c+plane_c);
 
     int kk;
+#ifdef OMP
     #pragma omp parallel for private(kk) if(omp_within_a_box)
+#endif
     for(kk=0;kk<dim_k_f;kk+=2){
       int i,j,k; 
       // zero out the next coarse grid plane
Index: restriction.inc
===================================================================
--- restriction.inc
+++ restriction.inc
@@ -22,6 +22,7 @@
   int box;
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int ghosts_c = domain->subdomains[box].levels[level_c].ghosts;
@@ -37,8 +38,10 @@
   
     double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[id_f] + ghosts_f*(1+pencil_f+plane_f);
     double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[id_c] + ghosts_c*(1+pencil_c+plane_c);
-  
+
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k_c;k++){
     for(j=0;j<dim_j_c;j++){
     for(i=0;i<dim_i_c;i++){
@@ -50,7 +53,6 @@
                         grid_f[ijk_f  +pencil_f+plane_f]+grid_f[ijk_f+1+pencil_f+plane_f] ) * 0.125;
     }}}
   }
-#endif
   domain->cycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart);
 }
 
@@ -71,6 +73,7 @@
 
 #ifdef OMP
   #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
   for(box=0;box<domain->subdomains_per_rank;box++){
     int i,j,k;
     int ghosts_c = domain->subdomains[box].levels[level_c].ghosts;
@@ -103,7 +106,9 @@
     // restrict beta_j (== face in ik)
     beta_f = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f;
     beta_c = domain->subdomains[box].levels[level_c].grids[__beta_j] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c;
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k_c;k++){
     for(j=0;j<dim_j_c;j++){
     for(i=0;i<dim_i_c;i++){
@@ -116,7 +121,9 @@
     // restrict beta_k (== face in ij)
     beta_f = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f;
     beta_c = domain->subdomains[box].levels[level_c].grids[__beta_k] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c;
+#ifdef OMP
     #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
     for(k=0;k<dim_k_c;k++){
     for(j=0;j<dim_j_c;j++){
     for(i=0;i<dim_i_c;i++){
@@ -126,6 +133,5 @@
                         beta_f[ijk_f+pencil_f]+beta_f[ijk_f+1+pencil_f] ) * 0.25;
     }}}
   }
-#endif
   domain->cycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart);
 }