Index: test-suite/trunk/LICENSE.TXT =================================================================== --- test-suite/trunk/LICENSE.TXT +++ test-suite/trunk/LICENSE.TXT @@ -86,6 +86,7 @@ miniAMR: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR Pathfinder: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/Pathfinder XSBench: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench +miniGMG: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG CLAMR: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR HPCCG: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG PENNANT: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(XSBench) add_subdirectory(Pathfinder) add_subdirectory(miniAMR) +add_subdirectory(miniGMG) Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile @@ -3,6 +3,7 @@ LEVEL = ../../.. PARALLEL_DIRS = XSBench \ miniAMR \ - Pathfinder + Pathfinder \ + miniGMG include $(LEVEL)/Makefile.programs Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt @@ -0,0 +1,4 @@ +set(PROG miniGMG) +list(APPEND LDFLAGS -lm) +set(RUN_OPTIONS 6 2 2 2 1 1 1) +llvm_multisource() Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/LICENSE =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/LICENSE +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/LICENSE @@ -0,0 +1,16 @@ +*** License Agreement *** +miniGMG, Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved." + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +(1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +(2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +(3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the features, functionality or performance of the source code ("Enhancements") to anyone; however, if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley National Laboratory, without imposing a separate written license agreement for such Enhancements, then you hereby grant the following license: a non-exclusive, royalty-free perpetual license to install, use, modify, prepare derivative works, incorporate into other computer software, distribute, and sublicense such enhancements or derivative works thereof, in binary and source code form. +********* + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/Makefile =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/Makefile +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/Makefile @@ -0,0 +1,6 @@ +LEVEL = ../../../.. + +PROG = miniGMG +LDFLAGS = -lm +RUN_OPTIONS = 6 2 2 2 1 1 1 +include $(LEVEL)/MultiSource/Makefile.multisrc Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc @@ -0,0 +1,63 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include "timer.h" +#include "mg.h" +#include "box.h" + +void apply_op(domain_type * domain, int level, int Ax_id, int x_id, double a, double b){ // y=Ax or y=D^{-1}Ax = lambda[]Ax + // exchange the boundary of x in preparation for Ax + // note, Ax (one power) with a 7-point stencil requires only the faces + exchange_boundary(domain,level,x_id,1,0,0); + + // now do Ax proper... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; + +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k,s; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ x = domain->subdomains[box].levels[level].grids[ x_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ Ax = domain->subdomains[box].levels[level].grids[ Ax_id] + ghosts*(1+pencil+plane); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[ __alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[ __beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[ __beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[ __beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[ __lambda] + ghosts*(1+pencil+plane); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.apply_op[level] += (uint64_t)(CycleTime()-_timeStart); +} +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/box.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/box.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/box.h @@ -0,0 +1,29 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef BOX_H +#define BOX_H + +typedef struct { + double h; // h at this level (==hLevel == h0 * (double)(1< +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#include "defines.h" +#include "box.h" +//------------------------------------------------------------------------------------------------------------------------------ +int RandomPadding=-1; +//------------------------------------------------------------------------------------------------------------------------------ +int create_box(box_type *box, int numGrids, int low_i, int low_j, int low_k, int dim_i, int dim_j, int dim_k, int ghosts){ + uint64_t memory_allocated = 0; + box->numGrids = numGrids; + box->low.i = low_i; + box->low.j = low_j; + box->low.k = low_k; + box->dim.i = dim_i; + box->dim.j = dim_j; + box->dim.k = 
dim_k; + box->dim_with_ghosts.i = dim_i+2*ghosts; + box->dim_with_ghosts.j = dim_j+2*ghosts; + box->dim_with_ghosts.k = dim_k+2*ghosts; + box->ghosts = ghosts; + box->pencil = (dim_i+2*ghosts); + box->plane = (dim_i+2*ghosts)*(dim_j+2*ghosts); + + // nominally the stencil assumes VL is a perfect multiple of $U and unrolls without cleanup. However, this means you can walk $U-1 beyond the last point you should update and start updating the ghost zone of the next plane. +//int MaxUnrolling = 8; // 2-way SIMD x unroll by 4 = 8/thread + int MaxUnrolling = 16; // 4-way SIMD x unroll by 4 = 16/thread +//int MaxUnrolling = 32; // 8-way SIMD x unroll by 4 = 32/thread + int paddingToAvoidStencilCleanup = 0; + if(box->pencil+1 < (MaxUnrolling-1)){paddingToAvoidStencilCleanup = (MaxUnrolling-1)-(box->pencil+1);} + +// round each plane up to ensure SIMD alignment within each plane (not pencil). Thus if ijk is aligned, ijkk+/-plane is aligned +//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0xF) & ~0xF; // multiple of 128 bytes + box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x7) & ~0x7; // multiple of 64 bytes (required for MIC) +//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x3) & ~0x3; // multiple of 32 bytes (required for AVX/QPX) +//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x1) & ~0x1; // multiple of 16 bytes (required for SSE) +//printf("%2d^2 = %5d -> %5d\n",box->dim_with_ghosts.i,(dim_j+2*ghosts)*box->pencil,box->plane); + + box->volume = (dim_k+2*ghosts)*box->plane; + //if(dim_i>=32){while( ((box->volume % 2048) != 288) )box->volume+=8;} + //if(dim_i>=32){while( ((box->volume % 512) != 72) )box->volume+=8;} // 32KB / 8way / 8bytes / 7 arrays + if(dim_i>=32){while( ((box->volume % 512) != 64) )box->volume+=8;} // 32KB / 8way / 8bytes / 8 arrays + //if(dim_i>=32){while( ((box->volume % 512) != 40) )box->volume+=8;} // 32KB / 8way / 8bytes / 13 arrays + //if(dim_i>=32){while( ((box->volume % 256) != 40) )box->volume+=8;} // 16KB / 8way / 8bytes / 7 arrays + //if(dim_i>=32){while( ((box->volume % 256) != 32) )box->volume+=8;} // 16KB / 8way / 8bytes / 8 arrays + + //if(RandomPadding<0){srand(time(NULL));RandomPadding = 8*((int)rand()%128);} + //if(dim_i>=32){box->volume+=RandomPadding;} + + + // allocate pointers to grids and grids themselves + posix_memalign((void**)&(box->grids),64,box->numGrids*sizeof(double*)); + memory_allocated += box->numGrids*sizeof(double*); +#if 0 + int g;for(g=0;gnumGrids;g++){ + posix_memalign((void**)&(box->grids[g]),64,box->volume*sizeof(double)); + memset(box->grids[g],0,box->volume*sizeof(double)); + memory_allocated += box->volume*sizeof(double); + } +#else + double * tmpbuf; + posix_memalign((void**)&tmpbuf,64,box->volume*box->numGrids*sizeof(double)); + memset( tmpbuf,0,box->volume*box->numGrids*sizeof(double)); + memory_allocated += box->volume*box->numGrids*sizeof(double); + int g;for(g=0;gnumGrids;g++){ + box->grids[g] = tmpbuf + g*box->volume; + //printf("box->grids[%2d] = 0x%016llx\n",g,(uint64_t)box->grids[g] & (0x3<<3)); + } +#endif + + // allocate RedBlackMask array for a plane... 
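
As a rough illustration of the plane rounding above (a worked check with assumed numbers, not part of the patch): for a hypothetical cubic box with dim = 64 and ghosts = 1, and the same MaxUnrolling = 16,

    pencil = 64 + 2*1                      = 66 doubles
    raw    = (64 + 2*1) * 66               = 4356 doubles
    pad    = 0                               (pencil+1 = 67 is not < MaxUnrolling-1 = 15)
    plane  = (4356 + 0 + 0x7) & ~0x7       = 4360 doubles  (a multiple of 8 doubles = 64 bytes)

so every plane starts on a 64-byte boundary. The RedBlack_64bMask and RedBlack_FP arrays allocated next each hold exactly one such plane and are reused for every k.
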
+ posix_memalign((void**)&(box->RedBlack_64bMask),64,box->plane*sizeof(uint64_t)); + memset(box->RedBlack_64bMask, 0,box->plane*sizeof(uint64_t)); + memory_allocated += box->plane*sizeof(uint64_t); + posix_memalign((void**)&(box->RedBlack_FP[0] ),64,box->plane*sizeof(double )); + memset(box->RedBlack_FP[0] , 0,box->plane*sizeof(double )); + memory_allocated += box->plane*sizeof(double ); + posix_memalign((void**)&(box->RedBlack_FP[1] ),64,box->plane*sizeof(double )); + memset(box->RedBlack_FP[1] , 0,box->plane*sizeof(double )); + memory_allocated += box->plane*sizeof(double ); + + // initialize red/black... could do ij loop with ((i%pencil)^(j/pencil)&0x1) + int i,j; + for(j=0-ghosts;jdim.j+ghosts;j++){ + for(i=0-ghosts;idim.i+ghosts;i++){ + int ij = (i+ghosts) + (j+ghosts)*box->pencil; + if((i^j)&0x1)box->RedBlack_64bMask[ij]=~0;else box->RedBlack_64bMask[ij]= 0; + if((i^j)&0x1)box->RedBlack_FP[0][ij]=1.0;else box->RedBlack_FP[0][ij]=0.0; + if((i^j)&0x1)box->RedBlack_FP[1][ij]=0.0;else box->RedBlack_FP[1][ij]=1.0; + }} + // done... + return(memory_allocated); +} + +void destroy_box(box_type *box){ +#if 0 + int g;for(g=0;gnumGrids;g++){ + free(box->grids[g]); + } +#else + free(box->grids[0]); +#endif + free(box->grids); +} + + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/defines.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/defines.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/defines.h @@ -0,0 +1,52 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// Helmholtz ~ Laplacian() = a*alpha*Identity - b*Divergence*beta*Gradient +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef DEFINES_H +#define DEFINES_H +#define __u 0 // = what we're eventually solving for (u), cell centered +#define __f 1 // = original right-hand side (Au=f), cell centered +#define __alpha 2 // cell centered constant +#define __beta 3 // cell centered constant +//------------------------------------------------------------------------------------------------------------------ +#define __lambda 4 // cell centered constant (inverse of the diagonal) +#define __beta_i 5 // face constant (n.b. element 0 is the left face of the ghost zone element) +#define __beta_j 6 // face constant (n.b. element 0 is the back face of the ghost zone element) +#define __beta_k 7 // face constant (n.b. 
element 0 is the bottom face of the ghost zone element) +#define __ee 8 // = used for correction (ee) in residual correction form, cell centered +#define __f_minus_Av 9 // = used for initial right-hand side (f-Av) in residual correction form, cell centered +#define __temp 10 // = used for unrestricted residual (r), cell centered +#define __u_exact 11 +//------------------------------------------------------------------------------------------------------------------ +#define __NumGrids 12 // total number of grids and the starting location for bottom solver grids +//------------------------------------------------------------------------------------------------------------------ + +// FIX, these should be adhoc allocated starting with __NumGrids +// For CG bottom solver +#define __r 13 +#define __p 14 +#define __Ap 16 + +// For BiCGStab bottom solver +#define __r0 12 +#define __r 13 +#define __p 14 +#define __s 15 +#define __Ap 16 +#define __As 17 + +// For CABiCGStab bottom solver +#define __rt 12 +#define __r 13 +#define __p 14 +#define __PRrt 15 // starting location for the 2S+1 extra p[]'s, 2S extra r[]'s, and rt + +// For CACG bottom solver +#define __r0 12 +#define __r 13 +#define __p 14 +#define __PRrt 15 // starting location for the S+1 extra p[]'s, S extra r[]'s, and rt +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc @@ -0,0 +1,172 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +inline void DoBufferCopy(domain_type *domain, int level, int grid_id, int buffer){ + // copy 3D array from read_i,j,k of read[] to write_i,j,k in write[] + int dim_i = domain->bufferCopies[level][buffer].dim.i; + int dim_j = domain->bufferCopies[level][buffer].dim.j; + int dim_k = domain->bufferCopies[level][buffer].dim.k; + + int read_i = domain->bufferCopies[level][buffer].read.i; + int read_j = domain->bufferCopies[level][buffer].read.j; + int read_k = domain->bufferCopies[level][buffer].read.k; + int read_pencil = domain->bufferCopies[level][buffer].read.pencil; + int read_plane = domain->bufferCopies[level][buffer].read.plane; + + int write_i = domain->bufferCopies[level][buffer].write.i; + int write_j = domain->bufferCopies[level][buffer].write.j; + int write_k = domain->bufferCopies[level][buffer].write.k; + int write_pencil = domain->bufferCopies[level][buffer].write.pencil; + int write_plane = domain->bufferCopies[level][buffer].write.plane; + + double * __restrict__ read = domain->bufferCopies[level][buffer].read.ptr; + double * __restrict__ write = domain->bufferCopies[level][buffer].write.ptr; + if(domain->bufferCopies[level][buffer].read.box >=0) read = domain->subdomains[ domain->bufferCopies[level][buffer].read.box].levels[level].grids[grid_id]; + if(domain->bufferCopies[level][buffer].write.box>=0)write = 
domain->subdomains[domain->bufferCopies[level][buffer].write.box].levels[level].grids[grid_id]; + + int i,j,k,read_ijk,write_ijk; + if(dim_i==1){ // be smart and don't have an inner loop from 0 to 1 + for(k=0;ksubdomains_per_rank_in.i*domain->subdomains[0].levels[level].dim.i;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts; + if(dj==0)sizes_all[n]*=domain->subdomains_per_rank_in.j*domain->subdomains[0].levels[level].dim.j;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts; + if(dk==0)sizes_all[n]*=domain->subdomains_per_rank_in.k*domain->subdomains[0].levels[level].dim.k;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts; + }}} + + // enumerate a packed list of messages... starting with receives... + for(n=0;n<27;n++)if(exchange[26-n] && (domain->rank_of_neighbor[26-n] != domain->rank) ){ + buffers_packed[nMessages] = domain->recv_buffer[26-n]; + sizes_packed[nMessages] = sizes_all[26-n]; + ranks_packed[nMessages] = domain->rank_of_neighbor[26-n]; + tags_packed[nMessages] = n; + nMessages++; + } + // enumerate a packed list of messages... continuing with sends... + for(n=0;n<27;n++)if(exchange[n] && (domain->rank_of_neighbor[n] != domain->rank) ){ + buffers_packed[nMessages] = domain->send_buffer[n]; + sizes_packed[nMessages] = sizes_all[n]; + ranks_packed[nMessages] = domain->rank_of_neighbor[n]; + tags_packed[nMessages] = n; + nMessages++; + } + + // loop through packed list of MPI receives and prepost Irecv's... + _timeStart = CycleTime(); + #ifdef __MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ncycles.recv[level] += (_timeEnd-_timeStart); + + // pack MPI send buffers... + _timeStart = CycleTime(); + #pragma omp parallel for schedule(static,1) + for(buffer=domain->bufferCopy_Pack_Start;bufferbufferCopy_Pack_End;buffer++){ + if( (domain->bufferCopies[level][buffer].isFace && exchange_faces ) || + (domain->bufferCopies[level][buffer].isEdge && exchange_edges ) || + (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){ + DoBufferCopy(domain,level,grid_id,buffer); + }} + _timeEnd = CycleTime(); + domain->cycles.grid2grid[level] += (_timeEnd-_timeStart); + + + // wait for MPI to finish... 
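
For orientation, exchange_boundary follows the usual communication/computation overlap pattern; a condensed sketch (the send and local-copy steps are inferred from the buffer and index machinery declared in mg.h rather than taken verbatim from the listing):

    1. compute per-direction message sizes; enumerate receives over the mirrored
       direction (26-n) so each posted tag matches the tag of the neighbor's send
    2. prepost MPI_Irecv for every off-node neighbor
    3. DoBufferCopy over [bufferCopy_Pack_Start,  bufferCopy_Pack_End)   -- pack send buffers
    4. issue the sends and perform the same-rank copies in
       [bufferCopy_Local_Start, bufferCopy_Local_End) while messages are in flight
    5. MPI_Waitall on all posted requests
    6. DoBufferCopy over [bufferCopy_Unpack_Start, bufferCopy_Unpack_End) -- unpack receives

Each phase is accumulated into its own per-level cycle counter in domain->cycles.
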
+ #ifdef __MPI + _timeStart = CycleTime(); + MPI_Waitall(nMessages,requests_packed,status_packed); + _timeEnd = CycleTime(); + domain->cycles.wait[level] += (_timeEnd-_timeStart); + #endif + // unpack MPI receive buffers + _timeStart = CycleTime(); + #pragma omp parallel for schedule(static,1) + for(buffer=domain->bufferCopy_Unpack_Start;bufferbufferCopy_Unpack_End;buffer++){ + if( (domain->bufferCopies[level][buffer].isFace && exchange_faces ) || + (domain->bufferCopies[level][buffer].isEdge && exchange_edges ) || + (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){ + DoBufferCopy(domain,level,grid_id,buffer); + }} + _timeEnd = CycleTime(); + domain->cycles.unpack[level] += (_timeEnd-_timeStart); + #endif + + + domain->cycles.communication[level] += (uint64_t)(CycleTime()-_timeCommunicationStart); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/interpolation.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/interpolation.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/interpolation.inc @@ -0,0 +1,217 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +// piecewise constant interpolation... +// +// +-------+ +---+---+ +// | | | x | x | +// | x | -> +---+---+ +// | | | x | x | +// +-------+ +---+---+ +// +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_constant(domain_type * domain, int level_f, double prescale_f, int id_f, int id_c){ // id_f = prescale*id_f + I_{2h}^{h}(id_c) + int level_c = level_f+1; + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + + double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[id_f] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[id_c] + ghosts_c*(1+pencil_c+plane_c); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + grid_f[ijk_f] = prescale_f*grid_f[ijk_f] + grid_c[ijk_c]; + }}} + } + domain->cycles.interpolation[level_f] += (uint64_t)(CycleTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +// piecewise linear interpolation... +// +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_linear(domain_type * domain, int level_f, double prescale_f, int id_f, int id_c){ // id_f = prescale*id_f + I_{2h}^{h}(id_c) + int level_c = level_f+1; + exchange_boundary(domain,level_c,id_c,1,1,1); // linear needs corners/edges in the coarse grid. + + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + + double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[ id_f] + ghosts_f*(1 + pencil_f + plane_f); // [0] is first non-ghost zone element + double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[ id_c] + ghosts_c*(1 + pencil_c + plane_c); + + // FIX what about dirichlet boundary conditions ??? + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + + // ----------------------------------------------------------------------------------------------------------------------- + // Piecewise Quadratic Interpolation + // ----------------------------------------------------------------------------------------------------------------------- + // define parabola f(x) = ax^2 + bx + c through three coarse grid cells in i-direction... x=-1, x=0, and x=1 + // interpolate to (-0.25,j,k) and (+0.25,j,k) + // combine into 3 dimensions + // + // +-------+-------+-------+ + // | o | o | o | + // +-------+-------+-------+ + // . + // . + // interpolation + // . + // . 
+ // +---+---+---+---+---+---+ + // | | | x | y | | | + // +---+---+---+---+---+---+ + // + #warning using 27pt stencil for piecewise-quadratic interpolation + double xm= 0.156250,x0=0.937500,xp=-0.093750; + double ym= 0.156250,y0=0.937500,yp=-0.093750; + double zm= 0.156250,z0=0.937500,zp=-0.093750; + if(i&0x1){xm=-0.093750;x0=0.937500;xp= 0.156250;} + if(j&0x1){ym=-0.093750;y0=0.937500;yp= 0.156250;} + if(k&0x1){zm=-0.093750;z0=0.937500;zp= 0.156250;} + grid_f[ijk_f] = + prescale_f*grid_f[ijk_f ] + + + zm*ym*xm*grid_c[ijk_c-1-pencil_c-plane_c] + + zm*ym*x0*grid_c[ijk_c -pencil_c-plane_c] + + zm*ym*xp*grid_c[ijk_c+1-pencil_c-plane_c] + + zm*y0*xm*grid_c[ijk_c-1 -plane_c] + + zm*y0*x0*grid_c[ijk_c -plane_c] + + zm*y0*xp*grid_c[ijk_c+1 -plane_c] + + zm*yp*xm*grid_c[ijk_c-1+pencil_c-plane_c] + + zm*yp*x0*grid_c[ijk_c +pencil_c-plane_c] + + zm*yp*xp*grid_c[ijk_c+1+pencil_c-plane_c] + + + z0*ym*xm*grid_c[ijk_c-1-pencil_c ] + + z0*ym*x0*grid_c[ijk_c -pencil_c ] + + z0*ym*xp*grid_c[ijk_c+1-pencil_c ] + + z0*y0*xm*grid_c[ijk_c-1 ] + + z0*y0*x0*grid_c[ijk_c ] + + z0*y0*xp*grid_c[ijk_c+1 ] + + z0*yp*xm*grid_c[ijk_c-1+pencil_c ] + + z0*yp*x0*grid_c[ijk_c +pencil_c ] + + z0*yp*xp*grid_c[ijk_c+1+pencil_c ] + + + zp*ym*xm*grid_c[ijk_c-1-pencil_c+plane_c] + + zp*ym*x0*grid_c[ijk_c -pencil_c+plane_c] + + zp*ym*xp*grid_c[ijk_c+1-pencil_c+plane_c] + + zp*y0*xm*grid_c[ijk_c-1 +plane_c] + + zp*y0*x0*grid_c[ijk_c +plane_c] + + zp*y0*xp*grid_c[ijk_c+1 +plane_c] + + zp*yp*xm*grid_c[ijk_c-1+pencil_c+plane_c] + + zp*yp*x0*grid_c[ijk_c +pencil_c+plane_c] + + zp*yp*xp*grid_c[ijk_c+1+pencil_c+plane_c] ; + /* + // ----------------------------------------------------------------------------------------------------------------------- + // Piecewise-Linear Interpolation + // ----------------------------------------------------------------------------------------------------------------------- + // define line f(x) = bx + c through two coarse grid cells in i-direction... x=+/-1 and x=0 + // interpolate to either (-0.25) or (+0.25) + // combine into 3 dimensions + // + // +-------+-------+ + // | o | o | + // +-------+-------+ + // . + // . + // interpolation + // . + // . + // +---+---+---+---+ + // | | x | y | | + // +---+---+---+---+ + // + #warning using 8pt stencil for piecewise-linear interpolation + int delta_i= -1;if(i&0x1)delta_i= 1; // i.e. even points look backwards while odd points look forward + int delta_j=-pencil_c;if(j&0x1)delta_j=pencil_c; + int delta_k= -plane_c;if(k&0x1)delta_k= plane_c; + grid_f[ijk_f] = + prescale_f*grid_f[ijk_f ] + + 0.421875*grid_c[ijk_c ] + + 0.140625*grid_c[ijk_c +delta_k] + + 0.140625*grid_c[ijk_c +delta_j ] + + 0.046875*grid_c[ijk_c +delta_j+delta_k] + + 0.140625*grid_c[ijk_c+delta_i ] + + 0.046875*grid_c[ijk_c+delta_i +delta_k] + + 0.046875*grid_c[ijk_c+delta_i+delta_j ] + + 0.015625*grid_c[ijk_c+delta_i+delta_j+delta_k] ; + */ + /* + // ----------------------------------------------------------------------------------------------------------------------- + // Piecewise-Linear Interpolation + // ----------------------------------------------------------------------------------------------------------------------- + #warning using 7pt stencil for piecewise-linear interpolation... doesn't given 2nd order FMG?? 
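
For the active 27-point quadratic path above, the weights 0.156250 / 0.937500 / -0.093750 can be checked by fitting a parabola through three coarse cells and evaluating it a quarter cell off the center (a worked derivation, not part of the patch):

    f(x) = a x^2 + b x + c   with  f(-1) = v_m,  f(0) = v_0,  f(+1) = v_p
      =>  a = (v_m + v_p)/2 - v_0,   b = (v_p - v_m)/2,   c = v_0
    f(-1/4) = a/16 - b/4 + c = (5/32) v_m + (15/16) v_0 - (3/32) v_p
            = 0.15625 v_m + 0.9375 v_0 - 0.09375 v_p

The mirrored weights apply at x = +1/4 (the i&0x1 branch swaps xm and xp), and the 3-D weights are the tensor product of the x, y, and z factors, which is exactly the 27-term sum used above.
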
+ double coefi = -0.125;if(i&0x1)coefi = 0.125; + double coefj = -0.125;if(j&0x1)coefj = 0.125; + double coefk = -0.125;if(k&0x1)coefk = 0.125; + grid_f[ijk_f] = prescale_f*grid_f[ijk_f ] + + grid_c[ijk_c ] + + coefi*(grid_c[ijk_c+ 1]-grid_c[ijk_c- 1]) + + coefj*(grid_c[ijk_c+pencil_c]-grid_c[ijk_c-pencil_c]) + + coefk*(grid_c[ijk_c+ plane_c]-grid_c[ijk_c- plane_c]); + */ + }}} + } + domain->cycles.interpolation[level_f] += (uint64_t)(CycleTime()-_timeStart); +} +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/jacobi.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/jacobi.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/jacobi.inc @@ -0,0 +1,86 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +#include "defines.h" +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(domain_type * domain, int level, int phi_id, int rhs_id, double a, double b){ + if(numSmooths&1){ + printf("error - numSmooths must be even...\n"); + exit(0); + } + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box,s; + int ghosts = domain->ghosts; + double TwoThirds = 2.0/3.0; + + // if communication-avoiding, need RHS for stencils in ghost zones + if(ghosts>1)exchange_boundary(domain,level,rhs_id,1,1,1); + + for(s=0;s1,ghosts>1); // corners/edges if doing communication-avoiding... + else exchange_boundary(domain,level,__temp,1,ghosts>1,ghosts>1); // corners/edges if doing communication-avoiding... + + // now do ghosts communication-avoiding smooths on each box... 
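
For reference, the relaxation applied in the loop below is weighted Jacobi: with lambda holding the inverse of the diagonal D^{-1} (see defines.h) and TwoThirds = 2/3, each sub-sweep computes, per cell,

    u_new = u + (2/3) * D^{-1} * (f - A u)

ping-ponging between grids[phi_id] and grids[__temp] (which is why numSmooths must be even), and when ghosts > 1 it also relaxes ghostsToOperateOn cells deep into the ghost region so that several sub-sweeps can be taken per boundary exchange (the communication-avoiding variant).
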
+ uint64_t _timeStart = CycleTime(); + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k,ss; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ rhs = domain->subdomains[box].levels[level].grids[ rhs_id] + ghosts*(1+pencil+plane); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[__lambda] + ghosts*(1+pencil+plane); + + int ghostsToOperateOn=ghosts-1; + for(ss=s;sssubdomains[box].levels[level].grids[ phi_id] + ghosts*(1+pencil+plane); + phi_new= domain->subdomains[box].levels[level].grids[ __temp] + ghosts*(1+pencil+plane);} + else{phi = domain->subdomains[box].levels[level].grids[ __temp] + ghosts*(1+pencil+plane); + phi_new= domain->subdomains[box].levels[level].grids[ phi_id] + ghosts*(1+pencil+plane);} + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0-ghostsToOperateOn;kcycles.smooth[level] += (uint64_t)(CycleTime()-_timeStart); + } // s-loop +} + +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/lambda.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/lambda.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/lambda.inc @@ -0,0 +1,73 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include "timer.h" +#include "mg.h" +#include "defines.h" +//------------------------------------------------------------------------------------------------------------------------------ +void rebuild_lambda(domain_type * domain, int level, double a, double b){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; + + double dominant_eigenvalue = -1.0; + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(max:dominant_eigenvalue) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[__lambda] + ghosts*(1+pencil+plane); + double box_eigenvalue = -1.0; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) reduction(max:box_eigenvalue) + for(k=0;kbox_eigenvalue)box_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue + }}} + if(box_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = box_eigenvalue;} + } + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = dominant_eigenvalue; + MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + if(domain->rank==0){if(level==0)printf("\n");printf(" level=%2d, eigenvalue_max ~= %e\n",level,dominant_eigenvalue);fflush(stdout);} + domain->dominant_eigenvalue_of_DinvA[level] = dominant_eigenvalue; +} + + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/matmul.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/matmul.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/matmul.inc @@ -0,0 +1,71 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void matmul_grids(domain_type * domain, int level, double *C, int * id_A, int * id_B, int rows, int cols, int A_equals_B_transpose){ + // *id_A = m grid_id's 
(conceptually pointers to the rows of a m x domain->subdomains_per_rank*volume matrix) + // *id_B = n grid_id's (conceptually pointers to the columns of a domain->subdomains_per_rank*volume matrix x n) + // *C is a mxn matrix where C[rows][cols] = dot(id_A[rows],id_B[cols]) + + // FIX, id_A and id_B are likely the same and thus C[][] will be symmetric (modulo missing row?) + // if(A_equals_B_transpose && (cols>=rows)) then use id_B and only run for nn>=mm // common case for s-step Krylov methods + // C_is_symmetric && cols< rows (use id_A) + int omp_across_matrix = 1; + int omp_within_a_box = 0; + int mm,nn; + + + uint64_t _timeStart = CycleTime(); + #pragma omp parallel for private(mm,nn) if(omp_across_matrix) collapse(2) schedule(static,1) + for(mm=0;mm=mm){ // upper triangular + int box; + double a_dot_b_domain = 0.0; + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_A[mm]] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_B[nn]] + ghosts*(1+pencil+plane); + double a_dot_b_box = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:a_dot_b_box) + for(k=0;kcycles.blas3[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + double *send_buffer = (double*)malloc(rows*cols*sizeof(double)); + for(mm=0;mmcycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + free(send_buffer); + #endif + +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h @@ -0,0 +1,129 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef MG_H +#define MG_H +#ifdef __MPI +#include +#endif +#include "box.h" +//------------------------------------------------------------------------------------------------------------------------------ +#define MaxLevels 10 +//------------------------------------------------------------------------------------------------------------------------------ +#define __DEFAULT_BOTTOM_NORM 1e-3 +//------------------------------------------------------------------------------------------------------------------------------ +#define numSmooths 4 +//------------------------------------------------------------------------------------------------------------------------------ +#define __BOUNDARY_PERIODIC 0 +#define __BOUNDARY_DIRICHLET 1 +#define __BOUNDARY_NEUMANN 2 
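
Referring back to matmul_grids above: it builds the small Gram-like matrix needed by the s-step (communication-avoiding) Krylov bottom solvers listed in defines.h, roughly

    C[mm][nn] = sum over local boxes and interior cells of  grid_idA[mm][ijk] * grid_idB[nn][ijk]

computed only for the upper triangle when A_equals_B_transpose and cols >= rows, and then summed across ranks with a single element-wise all-reduce over the rows*cols entries so every rank ends up holding the same matrix.
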
+//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { +//double scale; // +1.0 for ghost zone exchanges +// // -1.0 for dirichlet boundary conditions in finite volume +// // 0.0 for dirichlet boundary conditions in finite difference +// // +1.0 for neumann boundary conditions in finite volume or difference + int isFace,isEdge,isCorner; // used to bypass certain computations + struct {int i, j, k;}dim; // dimensions of the buffer to copy + struct {int box, i, j, k, pencil, plane;double * __restrict__ ptr;}read,write; // coordinates in the read grid to extract the buffer, + // and coordinates in the write grid to insert the buffer + // if read/write.box<0, then use write/read.ptr, + // otherwise domain->subdomains[.box].levels[level].grids[grid_id] +} bufferCopy_type; +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + int rank; // MPI rank of remote process + int local_index; // index in subdomains[] on remote process + #ifdef __MPI + struct{int buf;struct{int faces,edges,corners;}offset;}send; // i.e. calculate offset as faceSize*faces + edgeSize*edges + cornerSize*corners + struct{int buf;struct{int faces,edges,corners;}offset;}recv; // i.e. calculate offset as faceSize*faces + edgeSize*edges + cornerSize*corners + #endif +} neighbor_type; + +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + struct {int i, j, k;}low; // global coordinates of the first (non-ghost) element of subdomain at the finest resolution + struct {int i, j, k;}dim; // subdomain dimensions at finest resolution + int numLevels; // number of levels in MG v-cycle. 1=no restrictions + int ghosts; // ghost zone depth + neighbor_type neighbors[27]; // MPI rank and local index (on remote process) of each subdomain neighboring this subdomain + box_type * levels; // pointer to an array of all coarsenings of this box +} subdomain_type; + + +//------------------------------------------------------------------------------------------------------------------------------ + +typedef struct { + // timing information... + struct { + uint64_t smooth[MaxLevels]; + uint64_t apply_op[MaxLevels]; + uint64_t residual[MaxLevels]; + uint64_t restriction[MaxLevels]; + uint64_t interpolation[MaxLevels]; + uint64_t communication[MaxLevels]; + uint64_t pack[MaxLevels]; + uint64_t grid2grid[MaxLevels]; + uint64_t unpack[MaxLevels]; + uint64_t recv[MaxLevels]; + uint64_t send[MaxLevels]; + uint64_t wait[MaxLevels]; + uint64_t blas1[MaxLevels]; + uint64_t blas3[MaxLevels]; + uint64_t collectives[MaxLevels]; + uint64_t Total[MaxLevels]; + uint64_t MGBuild; // total time spent building the coefficients... 
+ uint64_t vcycles; // total time spent in all vcycles (all CycleMG) + uint64_t MGSolve; // total time spent in MGSolve + }cycles; + int vcycles_performed, MGSolves_performed, Krylov_iterations, CAKrylov_formations_of_G; + + int rank_of_neighbor[27]; // = MPI rank of the neighbors of this process's subdomains (presumes rectahedral packing) + #ifdef __MPI + double * __restrict__ send_buffer[27]; // = MPI send buffers (one per neighbor) + double * __restrict__ recv_buffer[27]; // = MPI recieve buffer (one per neighbor) + struct{int faces,edges,corners;} buffer_size[27]; // = MPI buffer size (one per neighbor) in the units of faces/edges/corners +//int buffer_size[MaxLevels][27]; // = MPI buffer size (one per neighbor) in doubles + #endif + + bufferCopy_type * bufferCopies[MaxLevels]; // array of pointers to list of bufferCopy's that must be performed to realize a boundary exchange + int bufferCopy_Pack_Start, bufferCopy_Pack_End; // index into above list to pack the MPI buffers + int bufferCopy_Local_Start, bufferCopy_Local_End; // index into above list to perform a local exchange (hide within waitall) + int bufferCopy_Unpack_Start,bufferCopy_Unpack_End; // index into above list to unpack the MPI buffers + + // n.b. i=unit stride + struct {int i, j, k;}dim; // global dimensions at finest resolution + struct {int i, j, k;}ranks_in; // number of MPI ranks in i,j,k + struct {int i, j, k;}subdomains_per_rank_in; // number of subdomains in i,j,k + struct {int i, j, k;}subdomains_in; // total number of subdomains in i,j,k across the entire domain + struct {int i, j, k;}boundary_condition; // domain boundary condition in i,j,k directions (e.g. __BOUNDARY_PERIODIC) + int rank; // MPI rank of this process + int subdomains_per_rank; // number of subdomains owned by this process + int numLevels; // number of levels in MG v-cycle. 
1=no restrictions + int numGrids; // number of grids (variables) + int ghosts; // ghost zone depth + double h[MaxLevels]; // h at each level + double dominant_eigenvalue_of_DinvA[MaxLevels]; // (estimate) for the dominant(largest) eigenvalue at each level of the operator D^{-1}A = lambda*helmholtz + subdomain_type * subdomains; // pointer to a list of all subdomains owned by this process +} domain_type; + +//------------------------------------------------------------------------------------------------------------------------------ + int create_subdomain(subdomain_type * box, + int subdomain_low_i, int subdomain_low_j, int subdomain_low_k, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int numGrids, int ghosts, int numLevels); +void destroy_domain(domain_type * domain); + int create_domain(domain_type * domain, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int subdomains_per_rank_in_i, int subdomains_per_rank_in_j, int subdomains_per_rank_in_k, + int ranks_in_i, int ranks_in_j, int ranks_in_k, + int rank, + int *boundary_conditions, + int numGrids, int ghosts, int numLevels); +void MGBuild(domain_type * domain, double a, double b, double h0); +void MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm); +void print_timing(domain_type *domain); +//------------------------------------------------------------------------------------------------------------------------------ +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.c @@ -0,0 +1,798 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef __MPI +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timer.h" +#include "defines.h" +#include "box.h" +#include "mg.h" +#include "operators.h" +#include "solver.h" +//------------------------------------------------------------------------------------------------------------------------------ +int create_subdomain(subdomain_type * box, int subdomain_low_i, int subdomain_low_j, int subdomain_low_k, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int numGrids, int ghosts, int numLevels){ + int level; + uint64_t memory_allocated=0; + box->numLevels=numLevels; + box->ghosts=ghosts; + box->low.i = subdomain_low_i; + box->low.j = subdomain_low_j; + box->low.k = subdomain_low_k; + box->dim.i = subdomain_dim_i; + box->dim.j = subdomain_dim_j; + box->dim.k = subdomain_dim_k; + posix_memalign((void**)&(box->levels),64,numLevels*sizeof(box_type)); + memory_allocated += numLevels*sizeof(box_type); + for(level=0;levellevels[level],numGridsActual,subdomain_low_i>>level,subdomain_low_j>>level,subdomain_low_k>>level, + subdomain_dim_i>>level,subdomain_dim_j>>level,subdomain_dim_k>>level,ghosts); + } + 
return(memory_allocated); +} + + +void destroy_subdomain(subdomain_type * box){ + int level; + for(level=0;levelnumLevels;level++){ + destroy_box(&box->levels[level]); + } + free(box->levels); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +int calculate_neighboring_subdomain_index(domain_type *domain, int bi, int bj, int bk, int di, int dj, int dk){ + int ni,nj,nk; + ni=(bi+domain->subdomains_per_rank_in.i+di)%domain->subdomains_per_rank_in.i; + nj=(bj+domain->subdomains_per_rank_in.j+dj)%domain->subdomains_per_rank_in.j; + nk=(bk+domain->subdomains_per_rank_in.k+dk)%domain->subdomains_per_rank_in.k; + int index = ni + nj*domain->subdomains_per_rank_in.i + nk*domain->subdomains_per_rank_in.i*domain->subdomains_per_rank_in.j; + return(index); +} + +int calculate_neighboring_subdomain_rank(domain_type * domain, int bi, int bj, int bk, int di, int dj, int dk, int ri, int rj, int rk){ + if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){ + if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) < 0 )return(-1); + if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) >= domain->subdomains_in.i )return(-1); + } + if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){ + if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) < 0 )return(-1); + if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) >= domain->subdomains_in.j )return(-1); + } + if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){ + if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) < 0 )return(-1); + if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) >= domain->subdomains_in.k )return(-1); + } + // if didn't walk off domain, calculate new rank for periodic boundaries... + if((bi+di)<0)ri--;if((bi+di)>=domain->subdomains_per_rank_in.i)ri++;ri=(ri+domain->ranks_in.i)%domain->ranks_in.i; + if((bj+dj)<0)rj--;if((bj+dj)>=domain->subdomains_per_rank_in.j)rj++;rj=(rj+domain->ranks_in.j)%domain->ranks_in.j; + if((bk+dk)<0)rk--;if((bk+dk)>=domain->subdomains_per_rank_in.k)rk++;rk=(rk+domain->ranks_in.k)%domain->ranks_in.k; + int rank = ri + rj*domain->ranks_in.i + rk*domain->ranks_in.i*domain->ranks_in.j; + return(rank); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +int create_domain(domain_type * domain, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int subdomains_per_rank_in_i, int subdomains_per_rank_in_j, int subdomains_per_rank_in_k, + int ranks_in_i, int ranks_in_j, int ranks_in_k, + int rank, + int *boundary_conditions, + int numGrids, int ghosts, int numLevels + ){ + int level; + int i, j, k; + int di,dj,dk; + // FIX ghosts = array (varies with level) + domain->rank = rank; + if(domain->rank==0){printf("creating domain... 
");fflush(stdout);} + //FIX, limit ghosts based on coarsest problem size + if(ghosts>(subdomain_dim_i>>(numLevels-1))){if(domain->rank==0)printf("#ghosts(%d)>bottom grid size(%d)\n",ghosts,subdomain_dim_i>>(numLevels-1));exit(0);} + + if( (subdomain_dim_i!=subdomain_dim_j)||(subdomain_dim_j!=subdomain_dim_k)||(subdomain_dim_i!=subdomain_dim_k) ){ + if(domain->rank==0)printf("subdomain_dim's must be equal\n");exit(0); + } + uint64_t memory_allocated =0; + // processes are laid out in x, then y, then z + //int ranks = ranks_in_i*ranks_in_j*ranks_in_k; + int my_k_rank = (rank ) / (ranks_in_i*ranks_in_j); + int my_j_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank) ) / (ranks_in_i ); + int my_i_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank) - (ranks_in_i*my_j_rank) ); + //printf("%2d: (%2d,%2d,%2d)\n",domain->rank,my_k_rank,my_j_rank,my_i_rank); + + + domain->ranks_in.i = ranks_in_i; + domain->ranks_in.j = ranks_in_j; + domain->ranks_in.k = ranks_in_k; + domain->subdomains_per_rank_in.i = subdomains_per_rank_in_i; + domain->subdomains_per_rank_in.j = subdomains_per_rank_in_j; + domain->subdomains_per_rank_in.k = subdomains_per_rank_in_k; + domain->subdomains_in.i = subdomains_per_rank_in_i*ranks_in_i; + domain->subdomains_in.j = subdomains_per_rank_in_j*ranks_in_j; + domain->subdomains_in.k = subdomains_per_rank_in_k*ranks_in_k; + domain->subdomains_per_rank = subdomains_per_rank_in_i*subdomains_per_rank_in_j*subdomains_per_rank_in_k; + posix_memalign((void**)&(domain->subdomains),64,domain->subdomains_per_rank*sizeof(subdomain_type)); + memory_allocated+=domain->subdomains_per_rank*sizeof(subdomain_type); + + domain->dim.i = domain->subdomains_in.i * subdomain_dim_i; + domain->dim.j = domain->subdomains_in.j * subdomain_dim_j; + domain->dim.k = domain->subdomains_in.k * subdomain_dim_k; + domain->boundary_condition.i = boundary_conditions[0]; + domain->boundary_condition.j = boundary_conditions[1]; + domain->boundary_condition.k = boundary_conditions[2]; + domain->numLevels = numLevels; + domain->numGrids = numGrids; + domain->ghosts = ghosts; + for(level=0;levelnumLevels;level++){ + domain->h[level] = -1; + domain->dominant_eigenvalue_of_DinvA[level] = -1; + } + + // calculate the neighbors of this process + for(dk=-1;dk<=1;dk++){ + for(dj=-1;dj<=1;dj++){ + for(di=-1;di<=1;di++){ + int n = 13+di+3*dj+9*dk; + int neighbor_rank_in_i = (my_i_rank+di+ranks_in_i)%ranks_in_i; + int neighbor_rank_in_j = (my_j_rank+dj+ranks_in_j)%ranks_in_j; + int neighbor_rank_in_k = (my_k_rank+dk+ranks_in_k)%ranks_in_k; + domain->rank_of_neighbor[n] = neighbor_rank_in_i + ranks_in_i*neighbor_rank_in_j + ranks_in_i*ranks_in_j*neighbor_rank_in_k; + if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){ + if( (my_i_rank+di) < 0 )domain->rank_of_neighbor[n] = -1; + if( (my_i_rank+di) >= ranks_in_i )domain->rank_of_neighbor[n] = -1; + } + if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){ + if( (my_j_rank+dj) < 0 )domain->rank_of_neighbor[n] = -1; + if( (my_j_rank+dj) >= ranks_in_j )domain->rank_of_neighbor[n] = -1; + } + if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){ + if( (my_k_rank+dk) < 0 )domain->rank_of_neighbor[n] = -1; + if( (my_k_rank+dk) >= ranks_in_k )domain->rank_of_neighbor[n] = -1; + } + }}} + + // subdomains within a process are laid out in i, then j, then k + for(k=0;ksubdomains[box],low_i,low_j,low_k,subdomain_dim_i,subdomain_dim_j,subdomain_dim_k,numGrids,ghosts,numLevels); + for(dk=-1;dk<=1;dk++){ + for(dj=-1;dj<=1;dj++){ + for(di=-1;di<=1;di++){ + int n = 13+di+3*dj+9*dk; + 
+          domain->subdomains[box].neighbors[n].rank        = calculate_neighboring_subdomain_rank(domain,i,j,k,di,dj,dk,my_i_rank,my_j_rank,my_k_rank);
+          domain->subdomains[box].neighbors[n].local_index = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk);
+          // FIX boundary conditions??
+          //printf("rank=%2d, box[%2d].neighbors[%3d]=(%3d,%3d)\n",domain->rank,box,n,domain->subdomains[box].neighbors[n].rank,domain->subdomains[box].neighbors[n].local_index);
+  }}}
+  }}}
+
+
+  int FacesEdgesCorners[27] = {4,10,12,14,16,22,  1,3,5,7,9,11,15,17,19,21,23,25,  0,2,6,8,18,20,24,26,  13};
+  int   faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0};
+  int   edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0};
+  int corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1};
+
+  #ifdef __MPI
+  // allocate MPI send/recv buffers for the 26 neighbors....
+  for(dk=-1;dk<=1;dk++){
+  for(dj=-1;dj<=1;dj++){
+  for(di=-1;di<=1;di++){
+    int n = 13+di+3*dj+9*dk;if(n!=13){
+      int bufSize = 1;
+      if(di==0)bufSize*=domain->subdomains_per_rank_in.i*subdomain_dim_i;else bufSize*=ghosts;
+      if(dj==0)bufSize*=domain->subdomains_per_rank_in.j*subdomain_dim_j;else bufSize*=ghosts;
+      if(dk==0)bufSize*=domain->subdomains_per_rank_in.k*subdomain_dim_k;else bufSize*=ghosts;
+      posix_memalign((void**)&(domain->send_buffer[n]),64,bufSize*sizeof(double));
+      posix_memalign((void**)&(domain->recv_buffer[n]),64,bufSize*sizeof(double));
+      memset(domain->send_buffer[n],0,bufSize*sizeof(double));
+      memset(domain->recv_buffer[n],0,bufSize*sizeof(double));
+      memory_allocated+=bufSize*sizeof(double);
+      memory_allocated+=bufSize*sizeof(double);
+  }}}}
+  #endif
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // now enumerate a list of data transfers to effect a ghost zone exchange (boundary conditions are skipped)
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  for(level=0;level<domain->numLevels;level++){
+  int initialize;for(initialize=0;initialize<2;initialize++){ // initialize=0=count/malloc.  initialize=1=initialize
+  int buffer=0;
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // Traverse list of boxes, and create MPI pack for faces, edges, and corners
+  domain->bufferCopy_Pack_Start=buffer;
+  int nn;
+  #ifdef __MPI
+  for(nn=0;nn<26;nn++){
+    int n = FacesEdgesCorners[nn];
+    int di = ((n/1)%3) - 1;
+    int dj = ((n/3)%3) - 1;
+    int dk = ((n/9)%3) - 1;
+
+    // traverse all boxes in direction di/dj/dk... (e.g. 
all subdomains on the corresponding face) + int sd_i_lo,sd_i_hi; + int sd_j_lo,sd_j_hi; + int sd_k_lo,sd_k_hi; + switch(di){ + case -1:sd_i_lo= 0;sd_i_hi= 1;break; + case 0:sd_i_lo= 0;sd_i_hi=subdomains_per_rank_in_i;break; + case 1:sd_i_lo=subdomains_per_rank_in_i-1;sd_i_hi=subdomains_per_rank_in_i;break; + }; + switch(dj){ + case -1:sd_j_lo= 0;sd_j_hi= 1;break; + case 0:sd_j_lo= 0;sd_j_hi=subdomains_per_rank_in_j;break; + case 1:sd_j_lo=subdomains_per_rank_in_j-1;sd_j_hi=subdomains_per_rank_in_j;break; + }; + switch(dk){ + case -1:sd_k_lo= 0;sd_k_hi= 1;break; + case 0:sd_k_lo= 0;sd_k_hi=subdomains_per_rank_in_k;break; + case 1:sd_k_lo=subdomains_per_rank_in_k-1;sd_k_hi=subdomains_per_rank_in_k;break; + }; + for(k=sd_k_lo;krank)off_node_exchange=0; // same process + if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == -1)off_node_exchange=0; // domain boundary + if(off_node_exchange){ + int sendBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j; + int grid_i,grid_j,grid_k; // ijk in the source grid (relative to first ghost zone element) + int dim_i, dim_j, dim_k; // dimensions of face/edge/corner + switch(di){ + case -1:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;break; + case 0:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; break; + case 1:grid_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;break; + }; + switch(dj){ + case -1:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;break; + case 0:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; break; + case 1:grid_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;break; + }; + switch(dk){ + case -1:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;break; + case 0:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; break; + case 1:grid_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;break; + }; + int buf_i = dim_i*(i-sd_i_lo); // ijk in the target buffer + int buf_j = dim_j*(j-sd_j_lo); + int buf_k = dim_k*(k-sd_k_lo); + int buf_pencil = dim_i*(sd_i_hi-sd_i_lo); + int buf_plane = dim_j*(sd_j_hi-sd_j_lo)*buf_pencil; + + // FIX, could be broken into sub buffers for more parallelism (i.e. 
pencils) + if(initialize==1){ + domain->bufferCopies[level][buffer].dim.i = dim_i; + domain->bufferCopies[level][buffer].dim.j = dim_j; + domain->bufferCopies[level][buffer].dim.k = dim_k; + domain->bufferCopies[level][buffer].read.box = sendBox; + domain->bufferCopies[level][buffer].read.ptr = NULL; + domain->bufferCopies[level][buffer].read.i = grid_i; + domain->bufferCopies[level][buffer].read.j = grid_j; + domain->bufferCopies[level][buffer].read.k = grid_k; + domain->bufferCopies[level][buffer].read.pencil = domain->subdomains[sendBox].levels[level].pencil; + domain->bufferCopies[level][buffer].read.plane = domain->subdomains[sendBox].levels[level].plane; + domain->bufferCopies[level][buffer].write.box = -1; + domain->bufferCopies[level][buffer].write.ptr = domain->send_buffer[n]; + domain->bufferCopies[level][buffer].write.i = buf_i; + domain->bufferCopies[level][buffer].write.j = buf_j; + domain->bufferCopies[level][buffer].write.k = buf_k; + domain->bufferCopies[level][buffer].write.pencil = buf_pencil; + domain->bufferCopies[level][buffer].write.plane = buf_plane; + domain->bufferCopies[level][buffer].isFace = faces[n]; + domain->bufferCopies[level][buffer].isEdge = edges[n]; + domain->bufferCopies[level][buffer].isCorner = corners[n]; + } + //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in MPI[%2d,%2d,%2d] (%3d,%4d)\n",level,buffer,dim_i,dim_j,dim_k,grid_i,grid_j,grid_k,sendBox,buf_i,buf_j,buf_k,di,dj,dk,buf_pencil,buf_plane); + buffer++; + } // off-process neighbor + }}} // subdomains + } // directions + #endif + domain->bufferCopy_Pack_End=buffer; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Traverse list of boxes, and create local exchange/domain copies for faces, edges, and corners + domain->bufferCopy_Local_Start=buffer; + for(nn=0;nn<26;nn++){ + int n = FacesEdgesCorners[nn]; + int di = ((n/1)%3) - 1; + int dj = ((n/3)%3) - 1; + int dk = ((n/9)%3) - 1; + for(k=0;krank)on_node_exchange=0; // if == -1, then domain boundary and thus not on_node + if(on_node_exchange){ + int recvBox = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk); + int send_i,send_j,send_k; // ijk in the source grid (relative to first ghost zone element) + int recv_i,recv_j,recv_k; // ijk in the destination grid (relative to first ghost zone element) + int dim_i, dim_j, dim_k; // dimensions of face/edge/corner + switch(di){ + case -1:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break; + case 0:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; recv_i=domain->subdomains[recvBox].levels[level].ghosts;break; + case 1:send_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=0;break; + }; + switch(dj){ + case -1:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break; + case 0:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; recv_j=domain->subdomains[recvBox].levels[level].ghosts;break; + case 
1:send_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=0;break; + }; + switch(dk){ + case -1:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break; + case 0:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; recv_k=domain->subdomains[recvBox].levels[level].ghosts;break; + case 1:send_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=0;break; + }; + // FIX, could be broken into sub buffers for more parallelism (i.e. pencils) + if(initialize==1){ + domain->bufferCopies[level][buffer].dim.i = dim_i; + domain->bufferCopies[level][buffer].dim.j = dim_j; + domain->bufferCopies[level][buffer].dim.k = dim_k; + domain->bufferCopies[level][buffer].read.box = sendBox; + domain->bufferCopies[level][buffer].read.ptr = NULL; + domain->bufferCopies[level][buffer].read.i = send_i; + domain->bufferCopies[level][buffer].read.j = send_j; + domain->bufferCopies[level][buffer].read.k = send_k; + domain->bufferCopies[level][buffer].read.pencil = domain->subdomains[sendBox].levels[level].pencil; + domain->bufferCopies[level][buffer].read.plane = domain->subdomains[sendBox].levels[level].plane; + domain->bufferCopies[level][buffer].write.box = recvBox; + domain->bufferCopies[level][buffer].write.ptr = NULL; + domain->bufferCopies[level][buffer].write.i = recv_i; + domain->bufferCopies[level][buffer].write.j = recv_j; + domain->bufferCopies[level][buffer].write.k = recv_k; + domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil; + domain->bufferCopies[level][buffer].write.plane = domain->subdomains[recvBox].levels[level].plane; + domain->bufferCopies[level][buffer].isFace = faces[n]; + domain->bufferCopies[level][buffer].isEdge = edges[n]; + domain->bufferCopies[level][buffer].isCorner = corners[n]; + } + //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in box=%3d\n",level,buffer,dim_i,dim_j,dim_k,send_i,send_j,send_k,sendBox,recv_i,recv_j,recv_k,recvBox); + buffer++; + } + }}} // all local subdomains + } // directions + domain->bufferCopy_Local_End=buffer; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Traverse list of boxes and their neighbors and gather data from buffers... 
+ // faces come from face buffers + // edges can come from either edge buffers or face buffers + // corners can come from either corner buffers, edge buffers or face buffers + domain->bufferCopy_Unpack_Start=buffer; + #ifdef __MPI + for(nn=0;nn<26;nn++){ + int n = FacesEdgesCorners[nn]; + int di = ((n/1)%3) - 1; + int dj = ((n/3)%3) - 1; + int dk = ((n/9)%3) - 1; + for(k=0;krank)off_node_exchange=0; // same node + if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == -1)off_node_exchange=0; // domain boundary + if(off_node_exchange){ + int _di=0; + int _dj=0; + int _dk=0; + if((i+di)<0){_di=-1;}else if((i+di)>=subdomains_per_rank_in_i){_di=1;} + if((j+dj)<0){_dj=-1;}else if((j+dj)>=subdomains_per_rank_in_j){_dj=1;} + if((k+dk)<0){_dk=-1;}else if((k+dk)>=subdomains_per_rank_in_k){_dk=1;} + int _n = 13 + _di + 3*_dj + 9*_dk; + int dim_i, dim_j, dim_k; // dimensions of face/edge/corner + int grid_i,grid_j,grid_k; // ijk in the destination grid (relative to first ghost zone element) + int buf_i, buf_j, buf_k; // ijk in the source buffer + int buf_plane =1; + int buf_pencil=1; + switch(di){ + case -1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=0; break; + case 0:dim_i=domain->subdomains[recvBox].levels[level].dim.i; grid_i=domain->subdomains[recvBox].levels[level].ghosts; break; + case 1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break; + }; + switch(dj){ + case -1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=0; break; + case 0:dim_j=domain->subdomains[recvBox].levels[level].dim.j; grid_j=domain->subdomains[recvBox].levels[level].ghosts; break; + case 1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break; + }; + switch(dk){ + case -1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=0; break; + case 0:dim_k=domain->subdomains[recvBox].levels[level].dim.k; grid_k=domain->subdomains[recvBox].levels[level].ghosts; break; + case 1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break; + }; + switch(_di){ + case -1:buf_i=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;buf_pencil= domain->subdomains[recvBox].levels[level].ghosts;break; + case 0:buf_i=i*domain->subdomains[recvBox].levels[level].dim.i;buf_plane*=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; buf_pencil=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; break; + case 1:buf_i=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;buf_pencil= domain->subdomains[recvBox].levels[level].ghosts;break; + }; + switch(_dj){ + case -1:buf_j=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;break; + case 0:buf_j=j*domain->subdomains[recvBox].levels[level].dim.j;buf_plane*=subdomains_per_rank_in_j*domain->subdomains[recvBox].levels[level].dim.j; break; + case 1:buf_j=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;break; + }; + switch(_dk){ + case -1:buf_k=0; break; + case 0:buf_k=k*domain->subdomains[recvBox].levels[level].dim.k;break; + case 1:buf_k=0; break; + }; + if( (i+di>=0) && (i+disubdomains[recvBox].levels[level].ghosts;else if(di>0)buf_i+=domain->subdomains[recvBox].levels[level].dim.i;}// forward by dim.i or 
back by ghosts + if( (j+dj>=0) && (j+djsubdomains[recvBox].levels[level].ghosts;else if(dj>0)buf_j+=domain->subdomains[recvBox].levels[level].dim.j;}// forward by dim.j or back by ghosts + if( (k+dk>=0) && (k+dksubdomains[recvBox].levels[level].ghosts;else if(dk>0)buf_k+=domain->subdomains[recvBox].levels[level].dim.k;}// forward by dim.k or back by ghosts + // FIX, could be broken into sub buffers for more parallelism (i.e. pencils) + if(initialize==1){ + domain->bufferCopies[level][buffer].dim.i = dim_i; + domain->bufferCopies[level][buffer].dim.j = dim_j; + domain->bufferCopies[level][buffer].dim.k = dim_k; + domain->bufferCopies[level][buffer].read.box = -1; + domain->bufferCopies[level][buffer].read.ptr = domain->recv_buffer[_n]; + domain->bufferCopies[level][buffer].read.i = buf_i; + domain->bufferCopies[level][buffer].read.j = buf_j; + domain->bufferCopies[level][buffer].read.k = buf_k; + domain->bufferCopies[level][buffer].read.pencil = buf_pencil; + domain->bufferCopies[level][buffer].read.plane = buf_plane; + domain->bufferCopies[level][buffer].write.box = recvBox; + domain->bufferCopies[level][buffer].write.ptr = NULL; + domain->bufferCopies[level][buffer].write.i = grid_i; + domain->bufferCopies[level][buffer].write.j = grid_j; + domain->bufferCopies[level][buffer].write.k = grid_k; + domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil; + domain->bufferCopies[level][buffer].write.plane = domain->subdomains[recvBox].levels[level].plane; + domain->bufferCopies[level][buffer].isFace = faces[n]; + domain->bufferCopies[level][buffer].isEdge = edges[n]; + domain->bufferCopies[level][buffer].isCorner = corners[n]; + } + //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from MPI[%2d,%2d,%2d] (%3d,%4d) to box=%3d at (%3d,%3d,%3d)\n",level,buffer,dim_i,dim_j,dim_k,buf_i,buf_j,buf_k,_di,_dj,_dk,buf_pencil,buf_plane,recvBox,grid_i,grid_j,grid_k); + buffer++; + } + }}} + } + #endif + domain->bufferCopy_Unpack_End=buffer; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // At the end of initialize 0, malloc the buffers + if(initialize==0){ + domain->bufferCopies[level] = (bufferCopy_type*)malloc(buffer*sizeof(bufferCopy_type)); + memory_allocated+=buffer*sizeof(bufferCopy_type); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + }} + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + if(domain->rank==0){ + printf("done\n");fflush(stdout); + printf(" %d x %d x %d (per subdomain)\n",subdomain_dim_i,subdomain_dim_j,subdomain_dim_k); + printf(" %d x %d x %d (per process)\n",subdomains_per_rank_in_i*subdomain_dim_i,subdomains_per_rank_in_j*subdomain_dim_j,subdomains_per_rank_in_k*subdomain_dim_k); + printf(" %d x %d x %d (overall)\n",ranks_in_i*subdomains_per_rank_in_i*subdomain_dim_i,ranks_in_j*subdomains_per_rank_in_j*subdomain_dim_j,ranks_in_k*subdomains_per_rank_in_k*subdomain_dim_k); + printf(" %d-deep ghost zones\n",ghosts); + printf(" allocated %llu MB\n",memory_allocated>>20); + fflush(stdout); + } + return(memory_allocated); +} + + +void destroy_domain(domain_type * domain){ + if(domain->rank==0){printf("deallocating domain... 
");fflush(stdout);} + int box;for(box=0;boxsubdomains_per_rank;box++){ + destroy_subdomain(&domain->subdomains[box]); + } + free(domain->subdomains); + // FIX, free buffer_copies, etc... + if(domain->rank==0){printf("done\n");fflush(stdout);} +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void print_timing(domain_type *domain){ + #ifdef TIMING + int level,numLevels = domain->numLevels; + uint64_t _timeStart=CycleTime();sleep(1);uint64_t _timeEnd=CycleTime(); + double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart); + SecondsPerCycle = SecondsPerCycle/(double)domain->MGSolves_performed; // prints average performance per MGSolve + + if(domain->rank!=0)return; + + uint64_t total; + printf(" ");for(level=0;level<(numLevels );level++){printf("%12d ",level);}printf("\n"); + printf(" ");for(level=0;level<(numLevels );level++){printf("%10d^3 ",domain->subdomains[0].levels[0].dim.i>>level);}printf(" total\n"); + total=0;printf("smooth ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.smooth[level]);total+= domain->cycles.smooth[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("residual ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.residual[level]);total+= domain->cycles.residual[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("restriction ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.restriction[level]);total+= domain->cycles.restriction[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("interpolation ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.interpolation[level]);total+=domain->cycles.interpolation[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("applyOp ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.apply_op[level]);total+= domain->cycles.apply_op[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("BLAS1 ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.blas1[level]);total+= domain->cycles.blas1[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("BLAS3 ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.blas3[level]);total+= domain->cycles.blas3[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("communication ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.communication[level]);total+=domain->cycles.communication[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" local exchange ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.grid2grid[level]);total+= domain->cycles.grid2grid[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + #ifdef __MPI + total=0;printf(" pack MPI buffers ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.pack[level]);total+= domain->cycles.pack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" unpack MPI buffers ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.unpack[level]);total+= 
domain->cycles.unpack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_Isend ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.send[level]);total+= domain->cycles.send[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_Irecv ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.recv[level]);total+= domain->cycles.recv[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_Waitall ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.wait[level]);total+= domain->cycles.wait[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_collectives ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.collectives[level]);total+= domain->cycles.collectives[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + #endif + total=0;printf("-------------- ");for(level=0;level<(numLevels+1);level++){printf("------------ ");}printf("\n"); + total=0;printf("Total by level ");for(level=0;level<(numLevels );level++){printf("%12.6f " ,SecondsPerCycle*(double)domain->cycles.Total[level]);total+=domain->cycles.Total[level];} + printf("%12.6f\n",SecondsPerCycle*(double)total); + printf("\n"); + printf( " Total time in MGBuild %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGBuild); + printf( " Total time in MGSolve %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGSolve); + printf(" \" v-cycles %12.6f\n",SecondsPerCycle*(double)domain->cycles.vcycles); + printf( " number of v-cycles %12d\n" ,domain->vcycles_performed/domain->MGSolves_performed); + printf( "Bottom solver iterations %12d\n" ,domain->Krylov_iterations/domain->MGSolves_performed); + #if defined(__USE_CABICGSTAB) || defined(__USE_CACG) + printf( " formations of G[][] %12d\n" ,domain->CAKrylov_formations_of_G/domain->MGSolves_performed); + #endif + printf("\n\n");fflush(stdout); + #endif +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGResetTimers(domain_type * domain){ + int level; + for(level=0;level<10;level++){ + domain->cycles.smooth[level] = 0; + domain->cycles.apply_op[level] = 0; + domain->cycles.residual[level] = 0; + domain->cycles.communication[level] = 0; + domain->cycles.restriction[level] = 0; + domain->cycles.interpolation[level] = 0; + domain->cycles.blas1[level] = 0; + domain->cycles.blas3[level] = 0; + domain->cycles.pack[level] = 0; + domain->cycles.unpack[level] = 0; + domain->cycles.grid2grid[level] = 0; + domain->cycles.recv[level] = 0; + domain->cycles.send[level] = 0; + domain->cycles.wait[level] = 0; + domain->cycles.collectives[level] = 0; + domain->cycles.Total[level] = 0; + } + domain->cycles.vcycles = 0; + domain->cycles.MGSolve = 0; + domain->vcycles_performed = 0; + domain->MGSolves_performed = 0; + domain->Krylov_iterations = 0; + domain->CAKrylov_formations_of_G = 0; +} +//------------------------------------------------------------------------------------------------------------------------------ +void MGBuild(domain_type * domain, double a, double b, double h0){ + int box,level,numLevels = domain->numLevels; + + if(domain->rank==0){printf("MGBuild... ");fflush(stdout);} + + // initialize timers... 
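+  // (MGResetTimers, defined above, clears the per-level cycle counters and the
+  //  solve/iteration tallies so MGBuild and the subsequent MGSolve calls are
+  //  timed from a clean slate)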
+ MGResetTimers(domain); + domain->cycles.MGBuild = 0; + + uint64_t _timeStartMGBuild = CycleTime(); + + + // calculate hLevel for all levels/boxes + for(level=0;levelh[level] = hLevel; + for(box=0;boxsubdomains_per_rank;box++){ + domain->subdomains[box].levels[level].h = hLevel; + }} + + // alias all red/black masks to box0's - performance tweak to avoid having a single global list of RedBlack masks as a function of box size + //for(level=0;levelsubdomains_per_rank;box++){ + domain->subdomains[box].levels[level].RedBlack_64bMask = domain->subdomains[0].levels[level].RedBlack_64bMask; + domain->subdomains[box].levels[level].RedBlack_FP[0] = domain->subdomains[0].levels[level].RedBlack_FP[0]; + domain->subdomains[box].levels[level].RedBlack_FP[1] = domain->subdomains[0].levels[level].RedBlack_FP[1]; + }} + + + // form all restrictions of alpha[] for all boxes... + for(level=0;levelrank==0){printf("\n");fflush(stdout);} + for(level=0;levelrank==0){printf(" level=%2d, eigenvalue_max ~= %13.3f\n",level,eigenvalue_max);fflush(stdout);} + domain->dominant_eigenvalue_of_DinvA[level] = eigenvalue_max; + } + // find eigenvalue_min using the power method... + if(domain->rank==0){printf("\n");fflush(stdout);} + for(level=0;leveldominant_eigenvalue_of_DinvA[level],bk); // Abk = (A-ev_max I)bk = Abk - ev_max*bk + eigenvalue_min = dot(domain,level,bk,Abk)/dot(domain,level,bk,bk); + double den = sqrt(dot(domain,level,Abk,Abk)); // normalize bk+1 by the magnitude of Abk + scale_grid(domain,level,bk,1.0/den,Abk); // i.e. bk+1 = Abk / ||Abk|| + } + eigenvalue_min += domain->dominant_eigenvalue_of_DinvA[level]; + if(domain->rank==0){printf(" level=%2d, eigenvalue_min ~= %13.3f\n",level,eigenvalue_min);fflush(stdout);} + } +*/ + + + uint64_t delta = (CycleTime()-_timeStartMGBuild); + domain->cycles.MGBuild += delta; + + if(domain->rank==0){printf("done\n");fflush(stdout);} +} + + + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm){ + domain->MGSolves_performed++; + int level; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int e_id = __u; + int R_id = __f_minus_Av; + int box,v; + int numLevels = domain->numLevels; + #ifdef __TEST_MG_CONVERGENCE + int maxVCycles = 20; + #else + int maxVCycles = 10; + #endif + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(domain->rank==0){printf("MGSolve... ");fflush(stdout);} + uint64_t _timeStartMGSolve = CycleTime(); + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // make initial guess for e (=0) and setup the RHS + #if 1 + zero_grid(domain,0,e_id); // ee = 0 + scale_grid(domain,0,R_id,1.0,F_id); // R_id = F_id + #else + mul_grids(domain,0,e_id,1.0,F_id,__lambda); // e_id = F_id*lambda + scale_grid(domain,0,R_id,1.0,F_id); // R_id = F_id + #endif + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // now do v-cycles to calculate the correction... + uint64_t _timeStartVCycle = CycleTime(); + for(v=0;vvcycles_performed++; + + // down the v-cycle... 
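+    // (each down-sweep step below smooths the current level, forms the residual,
+    //  restricts it to level+1, and zeroes the coarse-level correction before
+    //  descending; with __FUSION_RESIDUAL_RESTRICTION the residual and restriction
+    //  are fused into a single pass)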
+ for(level=0;level<(domain->numLevels-1);level++){ + uint64_t _LevelStart = CycleTime(); + #ifdef __FUSION_RESIDUAL_RESTRICTION + smooth(domain,level,e_id,R_id,a,b); + residual_and_restriction(domain,level,e_id,R_id,level+1,R_id,a,b); + zero_grid(domain,level+1,e_id); + #else + smooth(domain,level,e_id,R_id,a,b); + residual(domain,level,__temp,e_id,R_id,a,b); + restriction(domain,level,R_id,__temp); + zero_grid(domain,level+1,e_id); + #endif + domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart); + } // down + + + // bottom solve... + uint64_t _timeBottomStart = CycleTime(); + level = domain->numLevels-1; + IterativeSolver(domain,level,e_id,R_id,a,b,__DEFAULT_BOTTOM_NORM); + domain->cycles.Total[level] += (uint64_t)(CycleTime()-_timeBottomStart); + + + // back up the v-cycle... + for(level=(domain->numLevels-2);level>=0;level--){ + uint64_t _LevelStart = CycleTime(); + #ifdef __LINEAR_INTERPOLATION + interpolation_linear(domain,level,1.0,e_id,e_id); + #else + interpolation_constant(domain,level,1.0,e_id,e_id); + #endif + smooth(domain,level,e_id,R_id,a,b); + domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart); + } // up + + + // now calculate the norm of the residual... + #if defined(__PRINT_NORM) || defined(__TEST_MG_CONVERGENCE) + uint64_t _timeStart = CycleTime(); + level = 0; + residual(domain,level,__temp,e_id,F_id,a,b); + #if 1 + #warning Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... + mul_grids(domain,level,__temp,1.0,__temp,__lambda); /// <<< precondition by D^{-1} ??? + #endif + double norm_of_residual = norm(domain,level,__temp); + uint64_t _timeNorm = CycleTime(); + domain->cycles.Total[0] += (uint64_t)(_timeNorm-_timeStart); + #if defined(__PRINT_NORM) + if(domain->rank==0){ + if(v==0)printf("\n"); + printf("v-cycle=%2d, norm=%22.20f (%12.6e)\n",v+1,norm_of_residual,norm_of_residual); + } + #endif + #if defined(__TEST_MG_CONVERGENCE) + if(norm_of_residualcycles.vcycles += (uint64_t)(CycleTime()-_timeStartVCycle); + domain->cycles.MGSolve += (uint64_t)(CycleTime()-_timeStartMGSolve); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(domain->rank==0){printf("done\n");fflush(stdout);} +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.c @@ -0,0 +1,181 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef OMP +#include +#endif +#ifdef __MPI +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "defines.h" +#include "box.h" +#include "mg.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +int main(int argc, char **argv){ + int MPI_Rank=0; + int MPI_Tasks=1; + int OMP_Threads = 1; + 
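+  // (when built with OMP, the block below queries the actual OpenMP team size
+  //  from the master thread; otherwise OMP_Threads stays at 1)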
+#ifdef OMP + #pragma omp parallel + { + #pragma omp master + { + OMP_Threads = omp_get_num_threads(); + } + } +#endif + + #ifdef __MPI + #warning Compiling for MPI... + int MPI_threadingModel = -1; +//int MPI_threadingModelRequested = MPI_THREAD_SINGLE; +//int MPI_threadingModelRequested = MPI_THREAD_SERIALIZED; + int MPI_threadingModelRequested = MPI_THREAD_FUNNELED; +//int MPI_threadingModelRequested = MPI_THREAD_MULTIPLE; + #ifdef __MPI_THREAD_MULTIPLE + MPI_threadingModelRequested = MPI_THREAD_MULTIPLE; + #endif + MPI_Init_thread(&argc, &argv, MPI_threadingModelRequested, &MPI_threadingModel); + MPI_Comm_size(MPI_COMM_WORLD, &MPI_Tasks); + MPI_Comm_rank(MPI_COMM_WORLD, &MPI_Rank); + + if(MPI_threadingModel>MPI_threadingModelRequested)MPI_threadingModel=MPI_threadingModelRequested; + if(MPI_Rank==0){ + if(MPI_threadingModelRequested == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); + else if(MPI_threadingModelRequested == MPI_THREAD_SINGLE )printf("Requested MPI_THREAD_SINGLE, "); + else if(MPI_threadingModelRequested == MPI_THREAD_FUNNELED )printf("Requested MPI_THREAD_FUNNELED, "); + else if(MPI_threadingModelRequested == MPI_THREAD_SERIALIZED)printf("Requested MPI_THREAD_SERIALIZED, "); + else if(MPI_threadingModelRequested == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); + else printf("Requested Unknown MPI Threading Model (%d), ",MPI_threadingModelRequested); + if(MPI_threadingModel == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); + else if(MPI_threadingModel == MPI_THREAD_SINGLE )printf("got MPI_THREAD_SINGLE\n"); + else if(MPI_threadingModel == MPI_THREAD_FUNNELED )printf("got MPI_THREAD_FUNNELED\n"); + else if(MPI_threadingModel == MPI_THREAD_SERIALIZED)printf("got MPI_THREAD_SERIALIZED\n"); + else if(MPI_threadingModel == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); + else printf("got Unknown MPI Threading Model (%d)\n",MPI_threadingModel); + fflush(stdout); } + #ifdef __MPI_THREAD_MULTIPLE + if( (MPI_threadingModelRequested == MPI_THREAD_MULTIPLE) && (MPI_threadingModel != MPI_THREAD_MULTIPLE) ){MPI_Finalize();exit(0);} + #endif + #endif + + + int log2_subdomain_dim = 6; + int subdomains_per_rank_in_i=256 / (1<7){ + if(MPI_Rank==0){printf("error, log2_subdomain_dim(%d)>7\n",log2_subdomain_dim);} + #ifdef __MPI + MPI_Finalize(); + #endif + exit(0); + } +*/ + + if(ranks_in_i*ranks_in_j*ranks_in_k != MPI_Tasks){ + if(MPI_Rank==0){printf("error, ranks_in_i*ranks_in_j*ranks_in_k(%d*%d*%d=%d) != MPI_Tasks(%d)\n",ranks_in_i,ranks_in_j,ranks_in_k,ranks_in_i*ranks_in_j*ranks_in_k,MPI_Tasks);} + #ifdef __MPI + MPI_Finalize(); + #endif + exit(0); + } + + if(MPI_Rank==0)printf("%d MPI Tasks of %d threads\n",MPI_Tasks,OMP_Threads); + + int subdomain_dim_i=1< +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void zero_grid(domain_type *domain, int level, int grid_id){ + // zero's the entire grid INCLUDING ghost zones... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; + +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=-ghosts;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + + +//------------------------------------------------------------------------------------------------------------------------------ +void initialize_grid_to_scalar(domain_type *domain, int level, int grid_id, double scalar){ + // initializes the grid to a scalar while zero'ing the ghost zones... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=-ghosts;k=dim_i) || (j>=dim_j) || (k>=dim_k); + grid[ijk] = ghostZone ? 0.0 : scalar; + }}} + } +#endif + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void add_grids(domain_type *domain, int level, int id_c, double scale_a, int id_a, double scale_b, int id_b){ // c=scale_a*id_a + scale_b*id_b + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void mul_grids(domain_type *domain, int level, int id_c, double scale, int id_a, int id_b){ // id_c=scale*id_a*id_b + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + + +//------------------------------------------------------------------------------------------------------------------------------ +void scale_grid(domain_type *domain, int level, int id_c, double scale_a, int id_a){ // c[]=scale_a*a[] + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +double dot(domain_type *domain, int level, int id_a, int id_b){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; + double a_dot_b_domain = 0.0; +#ifdef OMP + // FIX, schedule(static) is a stand in to guarantee reproducibility... + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:a_dot_b_domain) schedule(static) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. 
[0] = first non ghost zone point + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane); + double a_dot_b_box = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:a_dot_b_box) schedule(static) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = a_dot_b_domain; + MPI_Allreduce(&send,&a_dot_b_domain,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + return(a_dot_b_domain); +} + +//------------------------------------------------------------------------------------------------------------------------------ +double norm(domain_type *domain, int level, int grid_id){ // implements the max norm + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; + double max_norm = 0.0; +#ifdef OMP + // FIX, schedule(static) is a stand in to guarantee reproducibility... + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(max:max_norm) schedule(static) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid = domain->subdomains[box].levels[level].grids[ grid_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double box_norm = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(max:box_norm) schedule(static) + for(k=0;kbox_norm){box_norm=fabs_grid_ijk;} // max norm + }}} + if(box_norm>max_norm){max_norm = box_norm;} + } // box list +#endif + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = max_norm; + MPI_Allreduce(&send,&max_norm,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + return(max_norm); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +double mean(domain_type *domain, int level, int id_a){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; + double sum_domain = 0.0; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:sum_domain) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double sum_box = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:sum_box) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + double ncells_domain = (double)domain->dim.i*(double)domain->dim.j*(double)domain->dim.k; + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = sum_domain; + MPI_Allreduce(&send,&sum_domain,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + double mean_domain = sum_domain / ncells_domain; + return(mean_domain); +} + + +void shift_grid(domain_type *domain, int level, int id_c, int id_a, double shift_a){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. 
[0] = first non ghost zone point + + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +void project_cell_to_face(domain_type *domain, int level, int id_cell, int id_face, int dir){ + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_cell = domain->subdomains[box].levels[level].grids[id_cell] + ghosts*(1+pencil+plane); + double * __restrict__ grid_face = domain->subdomains[box].levels[level].grids[id_face] + ghosts*(1+pencil+plane); + int stride; + switch(dir){ + case 0: stride = 1;break;//i-direction + case 1: stride = pencil;break;//j-direction + case 2: stride = plane;break;//k-direction + } + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;k<=dim_k;k++){ // <= to ensure you do low and high faces + for(j=0;j<=dim_j;j++){ + for(i=0;i<=dim_i;i++){ + int ijk = i + j*pencil + k*plane; + grid_face[ijk] = 0.5*(grid_cell[ijk-stride] + grid_cell[ijk]); // simple linear interpolation + }}} + } +#endif + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.h @@ -0,0 +1,36 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ + void apply_op(domain_type *domain, int level, int Ax_id, int x_id, double a, double b); + void residual(domain_type *domain, int level, int res_id, int phi_id, int rhs_id, double a, double b); + void smooth(domain_type *domain, int level, int phi_id, int rhs_id, double a, double b); + void rebuild_lambda(domain_type *domain, int level, double a, double b); +//------------------------------------------------------------------------------------------------------------------------------ + void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b); + void restriction(domain_type *domain, int level_f, int id_c, int id_f); + void restriction_betas(domain_type *domain, int 
level_f, int level_c); + void interpolation_constant(domain_type *domain, int level_f, double prescale_f, int id_f, int id_c); + void interpolation_linear(domain_type *domain, int level_f, double prescale_f, int id_f, int id_c); +//------------------------------------------------------------------------------------------------------------------------------ + void exchange_boundary(domain_type *domain, int level, int grid_id, int exchange_faces, int exchange_edges, int exchange_corners); +//------------------------------------------------------------------------------------------------------------------------------ +double dot(domain_type *domain, int level, int id_a, int id_b); +double norm(domain_type *domain, int level, int grid_id); +double mean(domain_type *domain, int level, int id_a); + void add_grids(domain_type *domain, int level, int id_c, double scale_a, int id_a, double scale_b, int id_b); + void scale_grid(domain_type *domain, int level, int id_c, double scale_a, int id_a); + void zero_grid(domain_type *domain, int level, int grid_id); +// void shift_grid(domain_type *domain, int level, int id_c, int id_a, double shift_a); + void mul_grids(domain_type *domain, int level, int id_c, double scale, int id_a, int id_b); +// void initialize_grid_to_scalar(domain_type *domain, int level, int grid_id, double scalar); +//------------------------------------------------------------------------------------------------------------------------------ + void project_cell_to_face(domain_type *domain, int level, int id_cell, int id_face, int dir); +//------------------------------------------------------------------------------------------------------------------------------ + void matmul_grids(domain_type *domain, int level, double *C, int *id_A, int *id_B, int rows, int cols, int A_equals_B_transpose); +//------------------------------------------------------------------------------------------------------------------------------ + void initialize_problem(domain_type *domain, int level, double hLevel, double a, double b); +//------------------------------------------------------------------------------------------------------------------------------ +// void __box_smooth_GSRB_multiple(box_type *box, int phi_id, int rhs_id, double a, double b, int sweep); +// void __box_smooth_GSRB_multiple_threaded(box_type *box, int phi_id, int rhs_id, double a, double b, int sweep); Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.ompif.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.ompif.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.ompif.c @@ -0,0 +1,31 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include + +//------------------------------------------------------------------------------------------------------------------------------ +#include "timer.h" +#include "defines.h" +#include "box.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +#include "exchange_boundary.inc" +#include "lambda.inc" +#include 
"jacobi.inc" +//#include "operators.ompif/gsrb.inc" +//#include "operators.ompif/chebyshev.inc" +#include "apply_op.inc" +#include "residual.inc" +#include "restriction.inc" +#include "interpolation.inc" +#include "misc.inc" +#include "matmul.inc" +#include "problem1.inc" +//#include "operators.ompif/problem2.inc" +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/problem1.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/problem1.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/problem1.inc @@ -0,0 +1,84 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include "defines.h" +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void initialize_problem(domain_type *domain, int level, double hLevel, double a, double b){ + double NPi = 2.0*M_PI; + double Bmin = 1.0; + double Bmax = 10.0; + double c2 = (Bmax-Bmin)/2; + double c1 = (Bmax+Bmin)/2; + double c3=10.0; // how sharply (B)eta transitions + double c4 = -5.0/0.25; + + int box; + for(box=0;boxsubdomains_per_rank;box++){ + memset(domain->subdomains[box].levels[level].grids[__u_exact],0,domain->subdomains[box].levels[level].volume*sizeof(double)); + memset(domain->subdomains[box].levels[level].grids[__f ],0,domain->subdomains[box].levels[level].volume*sizeof(double)); + int i,j,k; + #pragma omp parallel for private(k,j,i) collapse(2) + for(k=0;ksubdomains[box].levels[level].dim.k;k++){ + for(j=0;jsubdomains[box].levels[level].dim.j;j++){ + for(i=0;isubdomains[box].levels[level].dim.i;i++){ + double x = hLevel*((double)(i+domain->subdomains[box].levels[level].low.i)+0.5); + double y = hLevel*((double)(j+domain->subdomains[box].levels[level].low.j)+0.5); + double z = hLevel*((double)(k+domain->subdomains[box].levels[level].low.k)+0.5); + int ijk = (i+domain->subdomains[box].levels[level].ghosts)+ + domain->subdomains[box].levels[level].pencil*(j+domain->subdomains[box].levels[level].ghosts)+ + domain->subdomains[box].levels[level].plane *(k+domain->subdomains[box].levels[level].ghosts); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double r2 = pow((x-0.50),2) + pow((y-0.50),2) + pow((z-0.50),2); // distance from center squared + double r2x = 2.0*(x-0.50); + double r2y = 2.0*(y-0.50); + double r2z = 2.0*(z-0.50); + double r2xx = 2.0; + double r2yy = 2.0; + double r2zz = 2.0; + double r = pow(r2,0.5); + double rx = 0.5*r2x*pow(r2,-0.5); + double ry = 0.5*r2y*pow(r2,-0.5); + double rz = 0.5*r2z*pow(r2,-0.5); + double rxx = 0.5*r2xx*pow(r2,-0.5) - 0.25*r2x*r2x*pow(r2,-1.5); + double ryy = 0.5*r2yy*pow(r2,-0.5) - 0.25*r2y*r2y*pow(r2,-1.5); + double rzz = 0.5*r2zz*pow(r2,-0.5) - 0.25*r2z*r2z*pow(r2,-1.5); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // beta (B) is a bubble... 
+ double A = 1.0; + double B = c1+c2*tanh( c3*(r-0.25) ); + double Bx = c2*c3*rx*(1-pow(tanh( c3*(r-0.25) ),2)); + double By = c2*c3*ry*(1-pow(tanh( c3*(r-0.25) ),2)); + double Bz = c2*c3*rz*(1-pow(tanh( c3*(r-0.25) ),2)); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double u = exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double ux = c4*r2x*u + NPi*exp(c4*r2)*cos(NPi*x)*sin(NPi*y)*sin(NPi*z); + double uy = c4*r2y*u + NPi*exp(c4*r2)*sin(NPi*x)*cos(NPi*y)*sin(NPi*z); + double uz = c4*r2z*u + NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*cos(NPi*z); + double uxx = c4*r2xx*u + c4*r2x*ux + c4*r2x*NPi*exp(c4*r2)*cos(NPi*x)*sin(NPi*y)*sin(NPi*z) - NPi*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double uyy = c4*r2yy*u + c4*r2y*uy + c4*r2y*NPi*exp(c4*r2)*sin(NPi*x)*cos(NPi*y)*sin(NPi*z) - NPi*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double uzz = c4*r2zz*u + c4*r2z*uz + c4*r2z*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*cos(NPi*z) - NPi*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double f = a*A*u - b*( (Bx*ux + By*uy + Bz*uz) + B*(uxx + uyy + uzz) ); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + domain->subdomains[box].levels[level].grids[__alpha ][ijk] = A; + domain->subdomains[box].levels[level].grids[__beta ][ijk] = B; + domain->subdomains[box].levels[level].grids[__u_exact][ijk] = u; + domain->subdomains[box].levels[level].grids[__f ][ijk] = f; + }}} + } + + double average_value_of_f = mean(domain,level,__f); + if(domain->rank==0){printf("\n average value of f = %20.12e\n",average_value_of_f);fflush(stdout);} + if(a!=0){ + shift_grid(domain,level,__f ,__f ,-average_value_of_f); + shift_grid(domain,level,__u_exact,__u_exact,-average_value_of_f/a); + } + // what if a==0 and average_value_of_f != 0 ??? +} +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc @@ -0,0 +1,218 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void residual(domain_type * domain, int level, int res_id, int phi_id, int rhs_id, double a, double b){ + // exchange the boundary for x in prep for Ax... + // for 7-point stencil, only needs to be a 1-deep ghost zone & faces only + exchange_boundary(domain,level,phi_id,1,0,0); + + // now do residual/restriction proper... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ phi = domain->subdomains[box].levels[level].grids[ phi_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ rhs = domain->subdomains[box].levels[level].grids[ rhs_id] + ghosts*(1+pencil+plane); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ res = domain->subdomains[box].levels[level].grids[ res_id] + ghosts*(1+pencil+plane); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.residual[level] += (uint64_t)(CycleTime()-_timeStart); +} + +#if 1 +//------------------------------------------------------------------------------------------------------------------------------ +// This version maximizes parallelism by parallelizing over the resultant coarse grid. +// Thus, +// one parallelizes over the list of 2x2 fine-grid bars, +// initializes a coarse grid pencil to zero, +// additively restricts each pencil in the 2x2 fine-grid bar to the coarse grid pencil +//------------------------------------------------------------------------------------------------------------------------------ +void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b){ + // exchange the boundary for x in prep for Ax... + // for 7-point stencil, only needs to be a 1-deep ghost zone & faces only + exchange_boundary(domain,level_f,phi_id,1,0,0); + + // now do residual/restriction proper... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int kk,jj; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + + double h2inv = 1.0/(domain->h[level_f]*domain->h[level_f]); + double * __restrict__ phi = domain->subdomains[box].levels[level_f].grids[ phi_id] + ghosts_f*(1+pencil_f+plane_f); // i.e. [0] = first non ghost zone point + double * __restrict__ rhs = domain->subdomains[box].levels[level_f].grids[ rhs_id] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ alpha = domain->subdomains[box].levels[level_f].grids[__alpha ] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_i = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_j = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ res = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c); + + #pragma omp parallel for private(kk,jj) if(omp_within_a_box) collapse(2) + for(kk=0;kk>1)*pencil_c + (kk>>1)*plane_c; + res[ijk_c] = 0.0; + } + for(k=kk;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + double helmholtz = a*alpha[ijk_f]*phi[ijk_f] + -b*h2inv*( + beta_i[ijk_f+1 ]*( phi[ijk_f+1 ]-phi[ijk_f ] ) + -beta_i[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-1 ] ) + +beta_j[ijk_f+pencil_f]*( phi[ijk_f+pencil_f]-phi[ijk_f ] ) + -beta_j[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-pencil_f] ) + +beta_k[ijk_f+plane_f ]*( phi[ijk_f+plane_f ]-phi[ijk_f ] ) + -beta_k[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-plane_f ] ) + ); + res[ijk_c] += (rhs[ijk_f]-helmholtz)*0.125; + } + }}}} + } +#endif + domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart); +} +#else +//------------------------------------------------------------------------------------------------------------------------------ +// This version performs a 1D parallelization over the coarse-grid k-dimension (every two fine-grid planes) +// It first zeros the coarse grid plane, then increments with restrictions from the fine grid +//------------------------------------------------------------------------------------------------------------------------------ +void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b){ + // exchange the boundary for x 
in prep for Ax... + // for 7-point stencil, only needs to be a 1-deep ghost zone & faces only + exchange_boundary(domain,level_f,phi_id,1,0,0); + + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + + double h2inv = 1.0/(domain->h[level_f]*domain->h[level_f]); + double * __restrict__ phi = domain->subdomains[box].levels[level_f].grids[ phi_id] + ghosts_f*(1+pencil_f+plane_f); // i.e. [0] = first non ghost zone point + double * __restrict__ rhs = domain->subdomains[box].levels[level_f].grids[ rhs_id] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ alpha = domain->subdomains[box].levels[level_f].grids[__alpha ] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_i = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_j = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ res = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c); + + int kk; + #pragma omp parallel for private(kk) if(omp_within_a_box) + for(kk=0;kk>1)*plane_c; + res[ijk_c] = 0.0; + }} + // restrict two fine grid planes into one coarse grid plane + for(k=kk;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + double helmholtz = a*alpha[ijk_f]*phi[ijk_f] + -b*h2inv*( + beta_i[ijk_f+1 ]*( phi[ijk_f+1 ]-phi[ijk_f ] ) + -beta_i[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-1 ] ) + +beta_j[ijk_f+pencil_f]*( phi[ijk_f+pencil_f]-phi[ijk_f ] ) + -beta_j[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-pencil_f] ) + +beta_k[ijk_f+plane_f ]*( phi[ijk_f+plane_f ]-phi[ijk_f ] ) + -beta_k[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-plane_f ] ) + ); + res[ijk_c] += (rhs[ijk_f]-helmholtz)*0.125; + }}} + } + } + domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart); +} +#endif +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc +++ 
test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc @@ -0,0 +1,131 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +// id_c^2h = R id_f^h +// restriction of cell-centered data... +//------------------------------------------------------------------------------------------------------------------------------ +void restriction(domain_type *domain, int level_f, int id_c, int id_f){ + int level_c = level_f+1; + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_c].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_c].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + + double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[id_f] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[id_c] + ghosts_c*(1+pencil_c+plane_c); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart); +} + + + +//------------------------------------------------------------------------------------------------------------------------------ +// restriction of face-centered data... +//------------------------------------------------------------------------------------------------------------------------------ +void restriction_betas(domain_type * domain, int level_f, int level_c){ + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_c].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_c].dim.i >= CollaborativeThreadingBoxSize); + int box; + +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + + double * __restrict__ beta_f; + double * __restrict__ beta_c; + + // restrict beta_i (== face in jk) + beta_f = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f; + beta_c = domain->subdomains[box].levels[level_c].grids[__beta_i] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;ksubdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f; + beta_c = domain->subdomains[box].levels[level_c].grids[__beta_j] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;ksubdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f; + beta_c = domain->subdomains[box].levels[level_c].grids[__beta_k] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.h @@ -0,0 +1,11 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef SOLVER_H +#define SOLVER_H +#include "mg.h" +void IterativeSolver(domain_type *domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm); +int IterativeSolver_NumGrids(); +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.c @@ -0,0 +1,783 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab 
+//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef __MPI +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "defines.h" +#include "box.h" +#include "mg.h" +#include "operators.h" +#include "timer.h" +//------------------------------------------------------------------------------------------------------------------------------ +#warning Using Diagonally Preconditioned Iterative Solvers +#define DiagonallyPrecondition +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef __CA_KRYLOV_S +#define __CA_KRYLOV_S 4 +#endif + +//#define __VERBOSE +//#define __DEBUG +//------------------------------------------------------------------------------------------------------------------------------ +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +#define __gemv(z,alpha,A,x,beta,y,rows,cols) {int r,c;double sum;for(r=0;r<(rows);r++){sum=0.0;for(c=0;c<(cols);c++){sum+=(A)[r][c]*(x)[c];}(z)[r]=(alpha)*sum+(beta)*(y)[r];}} +inline void __axpy(double * z, double alpha, double * x, double beta, double * y, int n){ // z[n] = alpha*x[n]+beta*y[n] + int nn; + for(nn=0;nnrank==0)printf("m=%8d, norm =%0.20f\n",m,norm_of_rt); + #endif + if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + delta = dot(domain,level,__r,__rt); // delta = dot(r,rt) + if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of __r) + L2_norm_of_rt = sqrt(delta); + + int __ca_krylov_s = 1; + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + while( (mCAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] + matmul_grids(domain,level,Gg,PRrt,PRrt,4*__ca_krylov_s+1,4*__ca_krylov_s+2,1); + for(i=0,k=0;i<4*__ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] + for(j=0 ;j<4*__ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*__ca_krylov_s+1 elements in each row go to G[][]. + g[i] = Gg[k++]; // last element in row goes to g[]. + } + + for(i=0;i<4*__ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)cj[i]=0.0;cj[2*__ca_krylov_s+1]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) + + for(n=0;n<__ca_krylov_s;n++){ // for(n=0;n<__ca_krylov_s;n++){ + domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison + __gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'aj + __gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'cj + __gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T''aj + g_dot_Tpaj = __dot(g,Tpaj,4*__ca_krylov_s+1); // (g,T'aj) + if(g_dot_Tpaj == 0.0){ // pivot breakdown ??? 
+ #ifdef __VERBOSE // + if(domain->rank==0){printf("g_dot_Tpaj == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + #ifdef __VERBOSE // + if(domain->rank==0){printf("alpha == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + #if 0 // seems to have accuracy problems in finite precision... + __gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = - alpha*GT'aj + __gemv(temp1, 1.0, G, cj, 1.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj + __gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = − alpha*GT′′aj + __gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj + __axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj + omega_numerator = __dot(temp3,temp1,4*__ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) + omega_denominator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) + #else // better to change the order of operations Gx-Gy -> G(x-y) ... (note, G is symmetric) + __axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) + __axpy(temp3, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp3[] = cj - alpha*T'aj + omega_numerator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) + omega_denominator = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) + #endif // + // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 + // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj + // If omega_numerator==0, and ||s||!=0, then stabilization breakdown + + // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! + __axpy( ej,1.0,ej, alpha, aj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + + // calculate the norm of Saad's vector 's' to check intra s-step convergence... + __axpy(temp1, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp1[] = cj - alpha*T'aj + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) + L2_norm_of_s = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic + if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE // + if(domain->rank==0){printf("m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // + #endif // + if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + if(omega_denominator == 0.0){ // ??? breakdown + #ifdef __VERBOSE // + if(domain->rank==0){if(omega_denominator == 0.0)printf("omega_denominator == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + omega = omega_numerator / omega_denominator; // + if(isinf(omega)){ // omega = big/tiny(oveflow) = inf + #ifdef __VERBOSE // + if(domain->rank==0){if(isinf(omega))printf("omega == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + // !!! 
COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // + __axpy( ej,1.0,ej, omega, cj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] + __axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] + __axpy( cj,1.0,cj, -omega, Tpcj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] + __axpy( cj,1.0,cj, -alpha, Tpaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + __axpy( cj,1.0,cj, omega*alpha,Tppaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] + + + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... + __gemv(temp1, 1.0, G, cj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj + cj_dot_Gcj = __dot(cj,temp1,4*__ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic + L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE + if(domain->rank==0){printf("m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} + #endif + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + delta_next = __dot( g,cj,4*__ca_krylov_s+1); // (g,cj) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(delta_next) ){printf("delta == inf\n");} // delta = big/tiny(overflow) = inf + if(delta_next == 0.0){printf("delta == 0.0\n");} // Lanczos breakdown + if(omega_numerator == 0.0){printf("omega_numerator == 0.0\n");} // stabilization breakdown + if(omega == 0.0){printf("omega == 0.0\n");} // stabilization breakdown + } // + #endif // + if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? + if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... + if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown + beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(beta) ){printf("beta == inf\n");} // beta = inf? + if(beta == 0.0){printf("beta == 0.0\n");} // beta = 0? can't make further progress(?) + } // + #endif // + if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? + if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) + __axpy( aj,1.0,cj, beta, aj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] + __axpy( aj,1.0,aj, -omega*beta, Tpaj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj + delta = delta_next; // delta = delta_next + + } // inner n (j) loop + + // update iterates... + for(i=0;i<4*__ca_krylov_s+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] + if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these + add_grids(domain,level, __p,0.0, __p,aj[0],PRrt[0]); // p[] = [P,R]aj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PRrt[i]);} // ... + add_grids(domain,level, __r,0.0, __r,cj[0],PRrt[0]); // r[] = [P,R]cj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PRrt[i]);} // ... 
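+      // Telescoping note: this routine enters the outer loop with a one-step basis (__ca_krylov_s = 1)
+      // and, in the update just after this block, doubles __ca_krylov_s on every outer pass until it is
+      // capped at __CA_KRYLOV_S.  The likely intent is that early passes (which often already converge)
+      // use short, better-conditioned bases, while later passes amortize the single
+      // matmul_grids()/reduction over a longer s-step basis.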
+ } // + m+=__ca_krylov_s; // m+=ca_krylov_s; + __ca_krylov_s*=2;if(__ca_krylov_s>__CA_KRYLOV_S)__ca_krylov_s=__CA_KRYLOV_S; + } // } // outer m loop + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef DiagonallyPrecondition + mul_grids(domain,level,e_id,1.0,__lambda,e_id); // e_id[] = lambda[]*e_id[] // i.e. e = D^{-1}e' + #endif + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void CABiCGStab(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // based on Erin Carson/Jim Demmel/Nick Knight's s-Step BiCGStab Algorithm 3.4 + + // note: __CA_KRYLOV_S should be tiny (2-8?). As such, 4*__CA_KRYLOV_S+1 is also tiny (9-33). Just allocate on the stack... + double temp1[4*__CA_KRYLOV_S+1]; // + double temp2[4*__CA_KRYLOV_S+1]; // + double temp3[4*__CA_KRYLOV_S+1]; // + double Tp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T' indexed as [row][col] + double Tpp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T'' indexed as [row][col] + double aj[4*__CA_KRYLOV_S+1]; // + double cj[4*__CA_KRYLOV_S+1]; // + double ej[4*__CA_KRYLOV_S+1]; // + double Tpaj[4*__CA_KRYLOV_S+1]; // + double Tpcj[4*__CA_KRYLOV_S+1]; // + double Tppaj[4*__CA_KRYLOV_S+1]; // + double G[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // extracted from first 4*__CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] + double g[4*__CA_KRYLOV_S+1]; // extracted from last [4*__CA_KRYLOV_S+1] column of Gg[][]. + double Gg[(4*__CA_KRYLOV_S+1)*(4*__CA_KRYLOV_S+2)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(4*__CA_KRYLOV_S+2) + col] + int PRrt[4*__CA_KRYLOV_S+2]; // grid_id's of the concatenation of the 2S+1 matrix powers of P, 2S matrix powers of R, and rt + int *P = PRrt+ 0; // grid_id's of the 2S+1 Matrix Powers of P. P[i] is the grid_id of A^i(p) + int *R = PRrt+2*__CA_KRYLOV_S+1; // grid_id's of the 2S Matrix Powers of R. R[i] is the grid_id of A^i(r) + + int mMax=200; + int m=0,n; + int i,j,k; + int BiCGStabFailed = 0; + int BiCGStabConverged = 0; + double g_dot_Tpaj,alpha,omega_numerator,omega_denominator,omega,delta,delta_next,beta; + double L2_norm_of_rt,L2_norm_of_residual,cj_dot_Gcj,L2_norm_of_s; + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + residual(domain,level,__rt,e_id,R_id,a,b); // rt[] = R_id[] - A(e_id)... note, if DPC, then rt = R-AD^-1De + scale_grid(domain,level,__r,1.0,__rt); // r[] = rt[] + scale_grid(domain,level,__p,1.0,__rt); // p[] = rt[] + double norm_of_rt = norm(domain,level,__rt); // the norm of the initial residual... 
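+  // Unlike TelescopingCABiCGStab above, this routine keeps the basis length fixed at __CA_KRYLOV_S for
+  // every outer iteration.  norm_of_rt (and L2_norm_of_rt = sqrt(delta) below) sets the reference
+  // scale: convergence is declared once the monitored norm falls below
+  // desired_reduction_in_norm * L2_norm_of_rt.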
+ #ifdef __VERBOSE + if(domain->rank==0)printf("m=%8d, norm =%0.20f\n",m,norm_of_rt); + #endif + if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + delta = dot(domain,level,__r,__rt); // delta = dot(r,rt) + if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of __r) + L2_norm_of_rt = sqrt(delta); + + int __ca_krylov_s = __CA_KRYLOV_S; // by making this a variable, I prevent the compiler from optimizing more than the telescoping version, thus preserving a bit-identcal result + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++) Tp[i][j]=0; // initialize Tp[][] and Tpp[][] ... + for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++)Tpp[i][j]=0; // + for(i= 0;i<2*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // monomial basis... Fixed (typo in SIAM paper) + for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // + for(i= 0;i<2*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // + for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + for(i=0;i<4*__ca_krylov_s+1;i++){PRrt[ i] = __PRrt+i;} // columns of PRrt map to the consecutive spare grid indices starting at __PRrt + PRrt[4*__ca_krylov_s+1] = __rt; // last column or PRrt (r tilde) maps to rt + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + while( (mCAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] + matmul_grids(domain,level,Gg,PRrt,PRrt,4*__ca_krylov_s+1,4*__ca_krylov_s+2,1); + for(i=0,k=0;i<4*__ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] + for(j=0 ;j<4*__ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*__ca_krylov_s+1 elements in each row go to G[][]. + g[i] = Gg[k++]; // last element in row goes to g[]. + } + + for(i=0;i<4*__ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)cj[i]=0.0;cj[2*__ca_krylov_s+1]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) + + for(n=0;n<__ca_krylov_s;n++){ // for(n=0;n<__ca_krylov_s;n++){ + domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison + __gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'aj + __gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'cj + __gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T''aj + g_dot_Tpaj = __dot(g,Tpaj,4*__ca_krylov_s+1); // (g,T'aj) + if(g_dot_Tpaj == 0.0){ // pivot breakdown ??? + #ifdef __VERBOSE // + if(domain->rank==0){printf("g_dot_Tpaj == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + #ifdef __VERBOSE // + if(domain->rank==0){printf("alpha == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + #if 0 // seems to have accuracy problems in finite precision... 
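+      // Both branches of this #if compute the same two scalars.  Since G is the Gram matrix
+      // [P,R]^T[P,R] of the Krylov basis grids, a coefficient-space product such as (c,Gd) equals (in
+      // exact arithmetic) the grid-space dot product of [P,R]c and [P,R]d; that is what lets omega and
+      // the norms of 's' and 'r' below be formed from small (4*__ca_krylov_s+1)-element vectors without
+      // touching the grids again.  The #else branch regroups Gx-Gy as G(x-y), which the author found
+      // better behaved in finite precision.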
+ __gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = - alpha*GT'aj + __gemv(temp1, 1.0, G, cj, 1.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj + __gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = − alpha*GT′′aj + __gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj + __axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj + omega_numerator = __dot(temp3,temp1,4*__ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) + omega_denominator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) + #else // better to change the order of operations Gx-Gy -> G(x-y) ... (note, G is symmetric) + __axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) + __axpy(temp3, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp3[] = cj - alpha*T'aj + omega_numerator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) + omega_denominator = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) + #endif // + // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 + // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj + // If omega_numerator==0, and ||s||!=0, then stabilization breakdown + + // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! + __axpy( ej,1.0,ej, alpha, aj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + + // calculate the norm of Saad's vector 's' to check intra s-step convergence... + __axpy(temp1, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp1[] = cj - alpha*T'aj + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) + L2_norm_of_s = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic + if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE // + if(domain->rank==0){printf("m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // + #endif // + if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + if(omega_denominator == 0.0){ // ??? breakdown + #ifdef __VERBOSE // + if(domain->rank==0){if(omega_denominator == 0.0)printf("omega_denominator == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + omega = omega_numerator / omega_denominator; // + if(isinf(omega)){ // omega = big/tiny(oveflow) = inf + #ifdef __VERBOSE // + if(domain->rank==0){if(isinf(omega))printf("omega == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + // !!! 
COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // + __axpy( ej,1.0,ej, omega, cj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] + __axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] + __axpy( cj,1.0,cj, -omega, Tpcj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] + __axpy( cj,1.0,cj, -alpha, Tpaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + __axpy( cj,1.0,cj, omega*alpha,Tppaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] + + + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... + __gemv(temp1, 1.0, G, cj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj + cj_dot_Gcj = __dot(cj,temp1,4*__ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic + L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE + if(domain->rank==0){printf("m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} + #endif + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + delta_next = __dot( g,cj,4*__ca_krylov_s+1); // (g,cj) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(delta_next) ){printf("delta == inf\n");} // delta = big/tiny(overflow) = inf + if(delta_next == 0.0){printf("delta == 0.0\n");} // Lanczos breakdown + if(omega_numerator == 0.0){printf("omega_numerator == 0.0\n");} // stabilization breakdown + if(omega == 0.0){printf("omega == 0.0\n");} // stabilization breakdown + } // + #endif // + if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? + if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... + if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown + beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(beta) ){printf("beta == inf\n");} // beta = inf? + if(beta == 0.0){printf("beta == 0.0\n");} // beta = 0? can't make further progress(?) + } // + #endif // + if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? + if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) + __axpy( aj,1.0,cj, beta, aj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] + __axpy( aj,1.0,aj, -omega*beta, Tpaj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj + delta = delta_next; // delta = delta_next + + } // inner n (j) loop + + // update iterates... + for(i=0;i<4*__ca_krylov_s+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] + if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these + add_grids(domain,level, __p,0.0, __p,aj[0],PRrt[0]); // p[] = [P,R]aj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PRrt[i]);} // ... + add_grids(domain,level, __r,0.0, __r,cj[0],PRrt[0]); // r[] = [P,R]cj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PRrt[i]);} // ... 
+ } // + m+=__ca_krylov_s; // m+=__ca_krylov_s; + } // } // outer m loop + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef DiagonallyPrecondition + mul_grids(domain,level,e_id,1.0,__lambda,e_id); // e_id[] = lambda[]*e_id[] // i.e. e = D^{-1}e' + #endif +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void BiCGStab(domain_type * domain, int level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // Algorithm 7.7 in Iterative Methods for Sparse Linear Systems(Yousef Saad) + // uses "right" preconditioning... AD^{-1}(Dx) = b ... AD^{-1}y = b ... solve for y, then solve for x = D^{-1}y + int jMax=200; + int j=0; + int BiCGStabFailed = 0; + int BiCGStabConverged = 0; + residual(domain,level,__r0,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) + scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] + scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] + double r_dot_r0 = dot(domain,level,__r,__r0); // r_dot_r0 = dot(r,r0) + double norm_of_r0 = norm(domain,level,__r); // the norm of the initial residual... + if(r_dot_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + if(norm_of_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + while( (jKrylov_iterations++; // + #ifdef DiagonallyPrecondition // + mul_grids(domain,level,__temp,1.0,__lambda,__p); // temp[] = lambda[]*p[] + apply_op(domain,level,__Ap,__temp,a,b); // Ap = AD^{-1}(p) + #else // + apply_op(domain,level,__Ap,__p,a,b); // Ap = A(p) + #endif // + double Ap_dot_r0 = dot(domain,level,__Ap,__r0); // Ap_dot_r0 = dot(Ap,r0) + if(Ap_dot_r0 == 0.0){BiCGStabFailed=1;break;} // pivot breakdown ??? + double alpha = r_dot_r0 / Ap_dot_r0; // alpha = r_dot_r0 / Ap_dot_r0 + if(isinf(alpha)){BiCGStabFailed=1;break;} // pivot breakdown ??? + add_grids(domain,level,x_id,1.0,x_id, alpha,__p ); // x_id[] = x_id[] + alpha*p[] + add_grids(domain,level,__s ,1.0,__r ,-alpha,__Ap); // s[] = r[] - alpha*Ap[] (intermediate residual?) + double norm_of_s = norm(domain,level,__s); // FIX - redundant?? norm of intermediate residual + if(norm_of_s == 0.0){BiCGStabConverged=1;break;} // FIX - redundant?? if As_dot_As==0, then As must be 0 which implies s==0 + if(norm_of_s < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} + #ifdef DiagonallyPrecondition // + mul_grids(domain,level,__temp,1.0,__lambda,__s); // temp[] = lambda[]*s[] + apply_op(domain,level,__As,__temp,a,b); // As = AD^{-1}(s) + #else // + apply_op(domain,level,__As,__s,a,b); // As = A(s) + #endif // + double As_dot_As = dot(domain,level,__As,__As); // As_dot_As = dot(As,As) + double As_dot_s = dot(domain,level,__As, __s); // As_dot_s = dot(As, s) + if(As_dot_As == 0.0){BiCGStabConverged=1;break;} // converged ? + double omega = As_dot_s / As_dot_As; // omega = As_dot_s / As_dot_As + if(omega == 0.0){BiCGStabFailed=1;break;} // stabilization breakdown ??? + if(isinf(omega)){BiCGStabFailed=1;break;} // stabilization breakdown ??? + add_grids(domain,level, x_id, 1.0,x_id, omega,__s ); // x_id[] = x_id[] + omega*s[] + add_grids(domain,level,__r , 1.0,__s ,-omega,__As ); // r[] = s[] - omega*As[] (recursively computed / updated residual) + double norm_of_r = norm(domain,level,__r); // norm of recursively computed residual (good enough??) 
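+    // __r is the recursively updated residual (r[] = s[] - omega*As[] above), not R_id - A(x_id)
+    // recomputed from scratch; the two can drift apart in finite precision, which is why the __DEBUG
+    // block below re-evaluates the true residual for comparison.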
+ if(norm_of_r == 0.0){BiCGStabConverged=1;break;} // + if(norm_of_r < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} + #ifdef __DEBUG // + residual(domain,level,__temp,x_id,R_id,a,b); // + double norm_of_residual = norm(domain,level,__temp); // + if(domain->rank==0)printf("j=%8d, norm=%12.6e, norm_inital=%12.6e, reduction=%e\n",j,norm_of_residual,norm_of_r0,norm_of_residual/norm_of_r0); // + #endif // + double r_dot_r0_new = dot(domain,level,__r,__r0); // r_dot_r0_new = dot(r,r0) + if(r_dot_r0_new == 0.0){BiCGStabFailed=1;break;} // Lanczos breakdown ??? + double beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega); // beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega) + if(isinf(beta)){BiCGStabFailed=1;break;} // ??? + add_grids(domain,level,__temp,1.0,__p,-omega,__Ap ); // __temp = (p[]-omega*Ap[]) + add_grids(domain,level,__p ,1.0,__r, beta,__temp); // p[] = r[] + beta*(p[]-omega*Ap[]) + r_dot_r0 = r_dot_r0_new; // r_dot_r0 = r_dot_r0_new (save old r_dot_r0) + } // } + #ifdef DiagonallyPrecondition // + mul_grids(domain,level,x_id,1.0,__lambda,x_id); // x_id[] = lambda[]*x_id[] // i.e. x = D^{-1}x' + #endif // +} + +//------------------------------------------------------------------------------------------------------------------------------ +void CACG(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // based on Lauren Goodfriend, Yinghui Huang, and David Thorman's derivation in their Spring 2013 CS267 Report + + double temp1[2*__CA_KRYLOV_S+1]; // + double temp2[2*__CA_KRYLOV_S+1]; // + double temp3[2*__CA_KRYLOV_S+1]; // + double aj[2*__CA_KRYLOV_S+1]; // + double cj[2*__CA_KRYLOV_S+1]; // + double ej[2*__CA_KRYLOV_S+1]; // + double Tpaj[2*__CA_KRYLOV_S+1]; // + double Tp[2*__CA_KRYLOV_S+1][2*__CA_KRYLOV_S+1]; // T' indexed as [row][col] + double G[2*__CA_KRYLOV_S+1][2*__CA_KRYLOV_S+1]; // extracted from first 2*__CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] + double Gbuf[(2*__CA_KRYLOV_S+1)*(2*__CA_KRYLOV_S+1)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(2*__CA_KRYLOV_S+1) + col] + int PR[2*__CA_KRYLOV_S+1]; // grid_id's of the concatenation of the S+1 matrix powers of P, and the S matrix powers of R + int *P = PR+ 0; // grid_id's of the S+1 Matrix Powers of P. P[i] is the grid_id of A^i(p) + int *R = PR+__CA_KRYLOV_S+1; // grid_id's of the S Matrix Powers of R. R[i] is the grid_id of A^i(r) + + int mMax=200; + int m=0,n; + int i,j,k; + int CGFailed = 0; + int CGConverged = 0; + + double aj_dot_GTpaj,cj_dot_Gcj,alpha,cj_dot_Gcj_new,beta,L2_norm_of_r0,L2_norm_of_residual,delta; + + residual(domain,level,__r0,e_id,R_id,a,b); // r0[] = R_id[] - A(e_id) + scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] + scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] + double norm_of_r0 = norm(domain,level,__r0); // the norm of the initial residual... + if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution + + delta = dot(domain,level,__r,__r0); // delta = dot(r,r0) + if(delta==0.0){CGConverged=1;} // entered CG with exact solution (square of L2 norm of __r) + L2_norm_of_r0 = sqrt(delta); // + + + + // initialize Tp[][] ... 
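+  // Tp ("T prime") is the small shift matrix of the monomial basis: its sub-diagonal 1s advance the
+  // coefficients of A^i(p) to A^{i+1}(p) (and similarly within the R block), so that, in exact
+  // arithmetic and for the powers kept in the basis, applying the operator to a represented vector is
+  // just the (2*__CA_KRYLOV_S+1)-element product T'aj used in the inner loop; no additional grid-level
+  // stencil applications or communication are needed inside the s-step body.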
+ for(i=0;i<2*__CA_KRYLOV_S+1;i++)for(j=0;j<2*__CA_KRYLOV_S+1;j++) Tp[i][j]=0; // zero Tp + for(i= 0;i< __CA_KRYLOV_S ;i++){ Tp[i+1][i]=1;} // monomial basis + for(i=__CA_KRYLOV_S+1;i<2*__CA_KRYLOV_S ;i++){ Tp[i+1][i]=1;} // + + for(i=0;i<2*__CA_KRYLOV_S+1;i++){PR[i] = __PRrt+i;} // columns of PR map to the consecutive spare grids allocated for the bottom solver starting at __PRrt + + + while( (mCAKrylov_formations_of_G++; // Record the number of times CACG formed G[][] + matmul_grids(domain,level,Gbuf,PR,PR,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1,1); // Compute Gbuf[][] = [P,R]^T * [P,R] (Matmul with grids but only one MPI_AllReduce) + for(i=0,k=0;i<2*__CA_KRYLOV_S+1;i++){ // extract G[][] from Gbuf[] + for(j=0 ;j<2*__CA_KRYLOV_S+1;j++){G[i][j] = Gbuf[k++];} // first 2*__CA_KRYLOV_S+1 elements in each row go to G[][]. + } + + + for(i=0;i<2*__CA_KRYLOV_S+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (???) + for(i=0;i<2*__CA_KRYLOV_S+1;i++)cj[i]=0.0;cj[__CA_KRYLOV_S+1]=1.0; // initialized based on (???) + for(i=0;i<2*__CA_KRYLOV_S+1;i++)ej[i]=0.0; // initialized based on (???) + + for(n=0;n<__CA_KRYLOV_S;n++){ // for(n=0;n<__CA_KRYLOV_S;n++){ + domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison + __gemv( Tpaj,1.0,Tp, aj,0.0, Tpaj,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // T'aj + __gemv(temp1,1.0, G,Tpaj,0.0,temp1,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp1[] = GT'aj + __gemv(temp2,1.0, G, cj,0.0,temp2,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp2[] = Gcj + aj_dot_GTpaj = __dot(aj,temp1,2*__CA_KRYLOV_S+1); // (aj,GT'aj) + cj_dot_Gcj = __dot(cj,temp2,2*__CA_KRYLOV_S+1); // (cj, Gcj) + // FIX, can cj_dot_Gcj ever be zero ? + if(aj_dot_GTpaj == 0.0){ // pivot breakdown ??? + CGFailed=1;break; // + } // + alpha = cj_dot_Gcj / aj_dot_GTpaj; // alpha = (cj,Gcj) / (aj,GT'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + CGFailed=1;break; // + } // + __axpy( ej,1.0,ej, alpha, aj,2*__CA_KRYLOV_S+1); // ej[] = ej[] + alpha*aj[] + __axpy( cj,1.0,cj, -alpha, Tpaj,2*__CA_KRYLOV_S+1); // cj[] = cj[] - alpha*T'*aj[] + __gemv(temp2,1.0, G, cj,0.0,temp2,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp2[] = Gcj + cj_dot_Gcj_new = __dot(cj,temp2,2*__CA_KRYLOV_S+1); // (cj, Gcj) + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... == cj_dot_Gcj_new?? + L2_norm_of_residual = 0.0;if(cj_dot_Gcj_new>0)L2_norm_of_residual=sqrt(cj_dot_Gcj_new); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_r0){CGConverged=1;break;} // terminate the inner n-loop + if(cj_dot_Gcj_new == 0.0){ // Lanczos breakdown ??? + CGFailed=1;break; // + } // + beta = cj_dot_Gcj_new / cj_dot_Gcj; // + if(isinf(beta)){CGFailed=1;break;} // beta = inf? + if(beta == 0.0){CGFailed=1;break;} // beta = 0? can't make further progress(?) + __axpy( aj,1.0,cj, beta, aj,2*__CA_KRYLOV_S+1); // cj[] = cj[] + beta*aj[] + + } // inner n (j) loop + + // update iterates... + for(i=0;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PR[i]);} // e_id[] = [P,R]ej + e_id[] + if(!CGFailed && !CGConverged){ // if we're done, then there is no point in updating these + add_grids(domain,level, __p,0.0, __p,aj[0],PR[0]); // p[] = [P,R]aj + for(i=1;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PR[i]);} // ... 
+ add_grids(domain,level, __r,0.0, __r,cj[0],PR[0]); // r[] = [P,R]cj + for(i=1;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PR[i]);} // ... + } + m+=__CA_KRYLOV_S; // m+=__CA_KRYLOV_S; + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + } // } // outer m loop + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void CG(domain_type * domain, int level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // Algorithm 6.18 in Iterative Methods for Sparse Linear Systems(Yousef Saad) + int jMax=200; + int j=0; + int CGFailed = 0; + int CGConverged = 0; + residual(domain,level,__r0,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) + scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] + scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] + double norm_of_r0 = norm(domain,level,__r); // the norm of the initial residual... + if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution + double r_dot_r = dot(domain,level,__r,__r); // r_dot_r = dot(r,r) + while( (jKrylov_iterations++; // + apply_op(domain,level,__Ap,__p,a,b); // Ap = A(p) + double Ap_dot_p = dot(domain,level,__Ap,__p); // Ap_dot_p = dot(Ap,p) + if(Ap_dot_p == 0.0){CGFailed=1;break;} // pivot breakdown ??? + double alpha = r_dot_r / Ap_dot_p; // alpha = r_dot_r / Ap_dot_p + if(isinf(alpha)){CGFailed=1;break;} // ??? + add_grids(domain,level,x_id,1.0,x_id, alpha,__p ); // x_id[] = x_id[] + alpha*p[] + add_grids(domain,level,__r ,1.0,__r ,-alpha,__Ap); // r[] = r[] - alpha*Ap[] (intermediate residual?) + double norm_of_r = norm(domain,level,__r); // norm of intermediate residual + if(norm_of_r == 0.0){CGConverged=1;break;} // + if(norm_of_r < desired_reduction_in_norm*norm_of_r0){CGConverged=1;break;} // + double r_dot_r_new = dot(domain,level,__r,__r); // r_dot_r_new = dot(r_{j+1},r_{j+1}) + if(r_dot_r_new == 0.0){CGFailed=1;break;} // Lanczos breakdown ??? + double beta = (r_dot_r_new/r_dot_r); // beta = (r_dot_r_new/r_dot_r) + if(isinf(beta)){CGFailed=1;break;} // ??? + add_grids(domain,level,__p,1.0,__r,beta,__p ); // p[] = r[] + beta*p[] + r_dot_r = r_dot_r_new; // r_dot_r = r_dot_r_new (save old r_dot_r) + } // } +} + +//------------------------------------------------------------------------------------------------------------------------------ +void IterativeSolver(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // code presumes e_id was an externally initialized with the initial guess + #ifdef __USE_BICGSTAB + #warning Using BiCGStab bottom solver... + BiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #elif __USE_CG + #warning Using CG bottom solver... + CG(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #elif __USE_CABICGSTAB + #warning Using Communication-Avoiding BiCGStab bottom solver... + // CABiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + TelescopingCABiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #elif __USE_CACG + #warning Using Communication-Avoiding CG bottom solver... + CACG(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #else // just multiple smooth()'s + #warning Defaulting to simple bottom solver with fixed number of smooths... 
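+  // Bottom-solver selection is purely compile time: defining __USE_BICGSTAB, __USE_CG, __USE_CABICGSTAB
+  // (which invokes the telescoping variant above), or __USE_CACG (for example -D__USE_CABICGSTAB on the
+  // compile line, optionally with -D__CA_KRYLOV_S=<s>) picks one of the Krylov methods; with none of
+  // them defined, this branch simply applies numSmoothsBottom smooths at the coarsest level.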
+  int s,numSmoothsBottom=48;
+  for(s=0;s<numSmoothsBottom;s++)smooth(domain,level,e_id,R_id,a,b);
+  #endif
+}
+uint64_t CycleTime();
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/timer.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/timer.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/timer.c
@@ -0,0 +1,21 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+uint64_t CycleTime(){
+  #ifndef __has_builtin
+  #define __has_builtin(x) 0
+  #endif
+
+  #if !defined(USE_CYCLE_COUNTER)
+  return 0;
+  #elif __has_builtin(__builtin_readcyclecounter)
+  return __builtin_readcyclecounter();
+  #else
+  uint64_t lo, hi;
+  __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+  return( (((uint64_t)hi) << 32) | ((uint64_t)lo) );
+  #endif
+}
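For reference, the sketch below (not part of the patch) evaluates the same 7-point variable-coefficient stencil that apply_op(), residual(), and residual_and_restriction() use above, a*alpha*phi - b*h2inv*div(beta*grad(phi)), on a flattened grid indexed as ijk = i + j*pencil + k*plane with a one-cell ghost zone. The grid size, the constant test data, and the main() driver are invented for illustration; only the stencil expression is taken from the code above.

#include <stdio.h>
#include <stdlib.h>

// Toy evaluation of the miniGMG-style 7-point stencil on an N^3 box with a 1-deep ghost zone.
int main(void){
  const int N = 4, ghosts = 1;
  const int pencil = N + 2*ghosts;      // stride between j-rows
  const int plane  = pencil*pencil;     // stride between k-planes
  const int volume = plane*pencil;
  const double a = 1.0, b = 1.0, h = 1.0/N, h2inv = 1.0/(h*h);

  double *phi    = calloc(volume,sizeof(double));
  double *rhs    = calloc(volume,sizeof(double));
  double *alpha  = calloc(volume,sizeof(double));
  double *beta_i = calloc(volume,sizeof(double));
  double *beta_j = calloc(volume,sizeof(double));
  double *beta_k = calloc(volume,sizeof(double));
  double *res    = calloc(volume,sizeof(double));

  // made-up smooth data: constant coefficients, phi varying linearly with position
  for(int k=0;k<pencil;k++)for(int j=0;j<pencil;j++)for(int i=0;i<pencil;i++){
    int ijk = i + j*pencil + k*plane;
    alpha[ijk]=1.0; beta_i[ijk]=beta_j[ijk]=beta_k[ijk]=1.0;
    phi[ijk] = 0.01*(i+2*j+3*k); rhs[ijk] = 1.0;
  }

  // residual rhs - A(phi) at every non-ghost point, mirroring the helmholtz expression in residual_and_restriction()
  for(int k=ghosts;k<N+ghosts;k++)for(int j=ghosts;j<N+ghosts;j++)for(int i=ghosts;i<N+ghosts;i++){
    int ijk = i + j*pencil + k*plane;
    double helmholtz = a*alpha[ijk]*phi[ijk]
      -b*h2inv*(
        beta_i[ijk+1     ]*( phi[ijk+1     ]-phi[ijk       ] )
       -beta_i[ijk       ]*( phi[ijk       ]-phi[ijk-1     ] )
       +beta_j[ijk+pencil]*( phi[ijk+pencil]-phi[ijk       ] )
       -beta_j[ijk       ]*( phi[ijk       ]-phi[ijk-pencil] )
       +beta_k[ijk+plane ]*( phi[ijk+plane ]-phi[ijk       ] )
       -beta_k[ijk       ]*( phi[ijk       ]-phi[ijk-plane ] )
      );
    res[ijk] = rhs[ijk]-helmholtz;
  }
  printf("res at first interior point = %f\n", res[ghosts*(1+pencil+plane)]);

  free(phi);free(rhs);free(alpha);free(beta_i);free(beta_j);free(beta_k);free(res);
  return 0;
}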