Index: LICENSE.TXT
===================================================================
--- LICENSE.TXT
+++ LICENSE.TXT
@@ -84,6 +84,7 @@
                     llvm-test/MultiSource/Benchmarks/ASC_Sequoia/sphot
 smg2000:            llvm-test/MultiSource/Benchmarks/ASCI_Purple/SMG2000
 XSBench:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench
+miniFE:              llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE
 Fhourstones:        llvm-test/MultiSource/Benchmarks/Fhourstones
 Fhourstones-3.1:    llvm-test/MultiSource/Benchmarks/Fhourstones-3.1
 McCat:              llvm-test/MultiSource/Benchmarks/McCat
Index: MultiSource/Benchmarks/CMakeLists.txt
===================================================================
--- MultiSource/Benchmarks/CMakeLists.txt
+++ MultiSource/Benchmarks/CMakeLists.txt
@@ -27,6 +27,7 @@
   add_subdirectory(Prolangs-C++)
   add_subdirectory(Bullet)
   add_subdirectory(tramp3d-v4)
+  add_subdirectory(DOE-ProxyApps-C++)
   if(NOT "${ARCH}" STREQUAL "XCore")
     add_subdirectory(7zip)
     add_subdirectory(PAQ8p)
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(miniFE)
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile
@@ -0,0 +1,6 @@
+# MultiSource/DOE-ProxyApps-C++ Makefile: Build all subdirectories automatically
+
+LEVEL = ../../..
+PARALLEL_DIRS = miniFE
+
+include $(LEVEL)/Makefile.programs
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Box.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Box.hpp
@@ -0,0 +1,55 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _Box_hpp_
+#define _Box_hpp_
+
+/**
+  * a 'Box' is 3 pairs of ints, where each pair specifies a lower
+  * and upper bound for one of the 3 spatial dimensions.
+  *
+  * This struct stores the 3 pairs as a simple array of 6 ints,
+  * but defines the bracket operator so that it can be referenced
+  * using 2-dimensional array notation like this:
+  * int xmin = box[0][0]; int xmax = box[0][1];
+  * int ymin = box[1][0]; int ymax = box[1][1];
+  * int zmin = box[2][0]; int zmax = box[2][1];
+ */
+struct Box {
+  int ranges[6];
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+  int* operator[](int xyz) { return &ranges[xyz*2]; }
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+  const int* operator[](int xyz) const { return &ranges[xyz*2]; }
+};
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/BoxIterator.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/BoxIterator.hpp
@@ -0,0 +1,142 @@
+#ifndef _BoxTraverser_hpp_
+#define _BoxTraverser_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+namespace miniFE {
+
+/** Class for traversing a 3-dimensional 'box' of indices.
+
+  //One way to traverse a 'box[3][2]' is to use a triply-nested for-loop:
+  for(int z=box[2][0]; z<box[2][1]; ++z) {
+    for(int y=box[1][0]; y<box[1][1]; ++y) {
+      for(int x=box[0][0]; x<box[0][1]; ++x) {
+        ...
+      }
+    }
+  }
+
+  //Another way is to use this BoxIterator class, like so:
+  //BoxIterator iter = BoxIterator::begin(box);
+  //BoxIterator end = BoxIterator::end(box);
+  for(; iter != end; ++iter) {
+    int x = iter.x;
+    int y = iter.y;
+    int z = iter.z;
+    ...
+  }
+*/
+class BoxIterator {
+public:
+  ~BoxIterator(){}
+
+  static BoxIterator begin(const Box& box)
+  {
+    return BoxIterator(box);
+  }
+
+  static BoxIterator end(const Box& box)
+  {
+    return BoxIterator(box, true/*at_end==true*/);
+  }
+
+  BoxIterator& operator=(const BoxIterator& src)
+  {
+    box_[0][0] = src.box_[0][0]; box_[0][1] = src.box_[0][1];
+    box_[1][0] = src.box_[1][0]; box_[1][1] = src.box_[1][1];
+    box_[2][0] = src.box_[2][0]; box_[2][1] = src.box_[2][1];
+    x = src.x;
+    y = src.y;
+    z = src.z;
+    return *this;
+  }
+
+  BoxIterator& operator++()
+  {
+    ++x;
+    if (x >= box_[0][1]) {
+      x = box_[0][0];
+      ++y;
+      if (y >= box_[1][1]) {
+        y = box_[1][0];
+        ++z;
+        if (z >= box_[2][1]) {
+          z = box_[2][1];
+          y = box_[1][1];
+          x = box_[0][1];
+        }
+      }
+    }
+    return *this;
+  }
+
+  BoxIterator operator++(int)
+  {
+    BoxIterator temp = *this;
+    ++(*this);
+    return temp;
+  }
+
+  bool operator==(const BoxIterator& rhs) const
+  {
+    return x == rhs.x && y == rhs.y && z == rhs.z;
+  }
+
+  bool operator!=(const BoxIterator& rhs) const
+  {
+    return !(this->operator==(rhs));
+  }
+
+  int x;
+  int y;
+  int z;
+
+private:
+  BoxIterator(const Box& box, bool at_end = false)
+   : x(box[0][0]),
+     y(box[1][0]),
+     z(box[2][0]),
+     box_()
+  {
+    box_[0][0] = box[0][0]; box_[0][1] = box[0][1];
+    box_[1][0] = box[1][0]; box_[1][1] = box[1][1];
+    box_[2][0] = box[2][0]; box_[2][1] = box[2][1];
+    if (at_end) {
+      x = box[0][1];
+      y = box[1][1];
+      z = box[2][1];
+    }
+  }
+
+  Box box_;
+};//class BoxTraverser
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/BoxPartition.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/BoxPartition.hpp
@@ -0,0 +1,103 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _BoxPartition_hpp_
+#define _BoxPartition_hpp_
+
+#include <Box.hpp>
+
+/** \brief Recursively split a box into (up-ip) sub-boxes
+ */
+void box_partition( int ip , int up , int axis ,
+                    const Box& box ,
+                    Box* p_box );
+
+/** \brief  Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box.
+ *
+ *  Use recursive coordinate bisection to partition a box 
+ *  into np disjoint sub-boxes.  Allocate (via malloc) and
+ *  populate the sub-boxes, mapping the local (x,y,z) to
+ *  a local ordinal, and mappings for the send-recv messages
+ *  to update the ghost cells.
+ *
+ *  usage:
+ *
+ *  my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ;
+ *  my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ;
+ *  my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ;
+ *
+ *  for ( x = -ghost ; x < my_nx + ghost ; ++x ) {
+ *  for ( y = -ghost ; y < my_ny + ghost ; ++y ) {
+ *  for ( z = -ghost ; z < my_nz + ghost ; ++z ) {
+ *    const int x_global = x + pbox[my_p][0][0] ;
+ *    const int y_global = y + pbox[my_p][1][0] ;
+ *    const int z_global = z + pbox[my_p][2][0] ;
+ *
+ *    const int local_ordinal =
+ *      box_map_local( pbox[my_p], ghost, map_local_id, x, y, z );
+ *
+ *    if ( 0 <= local_ordinal ) {
+ *    }
+ *  }
+ *  
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int recv_processor = ( my_p + i ) % np ;
+ *    const int recv_ordinal_begin = map_recv_pc[i];
+ *    const int recv_ordinal_end   = map_recv_pc[i+1];
+ *  }
+ *
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int send_processor = ( my_p + i ) % np ;
+ *    const int send_map_begin = map_send_pc[i];
+ *    const int send_map_end   = map_send_pc[i+1];
+ *    for ( j = send_map_begin ; j < send_map_end ; ++j ) {
+ *      send_ordinal = map_send_id[j] ;
+ *    }
+ *  }
+ */
+void box_partition_rcb( 
+  const int np            /**< [in]  Number of partitions */ ,
+  const int my_p          /**< [in]  My partition rank    */ ,
+  const Box& root_box     /**< [in]  3D Box to partition  */ ,
+  const int ghost         /**< [in]  Ghost cell boundary  */ ,
+  Box* pbox               /**< [out] Partition's 3D boxes */ ,
+  int ** map_local_id     /**< [out] Map local cells */ ,
+  int ** map_recv_pc      /**< [out] Receive spans per processor */ ,
+  int ** map_send_pc      /**< [out] Send prefix counts per processor */ ,
+  int ** map_send_id      /**< [out] Send message ordinals */ );
+
+/* \brief  Map a local (x,y,z) to a local ordinal.
+ */
+int box_map_local( const Box& box_local ,
+                   const int ghost ,
+                   const int map_local_id[] ,
+                   const int local_x ,
+                   const int local_y ,
+                   const int local_z );
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/BoxPartition.cpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/BoxPartition.cpp
@@ -0,0 +1,503 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <Box.hpp>
+#include <BoxPartition.hpp>
+
+/*--------------------------------------------------------------------*/
+
+static int box_map_local_entry( const Box& box ,
+                                const int ghost ,
+                                int local_x ,
+                                int local_y ,
+                                int local_z )
+{
+  const int nx = 2 * ghost + box[0][1] - box[0][0] ;
+  const int ny = 2 * ghost + box[1][1] - box[1][0] ;
+  const int nz = 2 * ghost + box[2][1] - box[2][0] ;
+  int result = -1 ;
+
+  local_x += ghost ;
+  local_y += ghost ;
+  local_z += ghost ;
+
+  if ( 0 <= local_x && local_x < nx &&
+       0 <= local_y && local_y < ny &&
+       0 <= local_z && local_z < nz ) {
+
+    result = local_z * ny * nx + local_y * nx + local_x ;
+  }
+  return result ;
+}
+
+int box_map_local( const Box& box_local,
+                   const int ghost ,
+                   const int box_local_map[] ,
+                   const int local_x ,
+                   const int local_y ,
+                   const int local_z )
+{
+  int result = box_map_local_entry(box_local,ghost,local_x,local_y,local_z);
+
+  if ( 0 <= result ) {
+    result = box_local_map[ result ];
+  }
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+/* Recursively split a box into into (up-ip) sub-boxes */
+
+void box_partition( int ip , int up , int axis ,
+                    const Box& box,
+                    Box* p_box )
+{
+  const int np = up - ip ;
+  if ( 1 == np ) {
+    p_box[ip][0][0] = box[0][0] ; p_box[ip][0][1] = box[0][1] ;
+    p_box[ip][1][0] = box[1][0] ; p_box[ip][1][1] = box[1][1] ;
+    p_box[ip][2][0] = box[2][0] ; p_box[ip][2][1] = box[2][1] ;
+  }
+  else {
+    const int n = box[ axis ][1] - box[ axis ][0] ;
+    const int np_low = np / 2 ;  /* Rounded down */
+    const int np_upp = np - np_low ;
+
+    const int n_upp = (int) (((double) n) * ( ((double)np_upp) / ((double)np)));
+    const int n_low = n - n_upp ;
+    const int next_axis = ( axis + 2 ) % 3 ;
+
+    if ( np_low ) { /* P = [ip,ip+np_low) */
+      Box dbox ;
+      dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ;
+      dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ;
+      dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ;
+
+      dbox[ axis ][1] = dbox[ axis ][0] + n_low ;
+
+      box_partition( ip, ip + np_low, next_axis, dbox, p_box );
+    }
+
+    if ( np_upp ) { /* P = [ip+np_low,ip+np_low+np_upp) */
+      Box dbox;
+      dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ;
+      dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ;
+      dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ;
+
+      ip += np_low ;
+      dbox[ axis ][0] += n_low ;
+      dbox[ axis ][1]  = dbox[ axis ][0] + n_upp ;
+
+      box_partition( ip, ip + np_upp, next_axis, dbox, p_box );
+    }
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+static int box_disjoint( const Box& a , const Box& b)
+{
+  return a[0][1] <= b[0][0] || b[0][1] <= a[0][0] ||
+         a[1][1] <= b[1][0] || b[1][1] <= a[1][0] ||
+         a[2][1] <= b[2][0] || b[2][1] <= a[2][0] ;
+}
+
+static void resize_int( int ** a , int * allocLen , int newLen )
+{
+  int k = 32;
+  while ( k < newLen ) { k <<= 1 ; }
+  if ( NULL == *a )
+    { *a = (int*)malloc( sizeof(int)*(*allocLen = k) ); }
+  else if ( *allocLen < k ) 
+    { *a = (int*)realloc(*a , sizeof(int)*(*allocLen = k)); }
+}
+
+static void box_partition_maps( 
+  const int np ,
+  const int my_p ,
+  const Box* pbox,
+  const int ghost ,
+  int ** map_local_id ,
+  int ** map_recv_pc ,
+  int ** map_send_pc ,
+  int ** map_send_id )
+{
+  const Box& my_box = pbox[my_p] ;
+
+  const int my_ix = my_box[0][0] ;
+  const int my_iy = my_box[1][0] ;
+  const int my_iz = my_box[2][0] ;
+  const int my_nx = my_box[0][1] - my_box[0][0] ;
+  const int my_ny = my_box[1][1] - my_box[1][0] ;
+  const int my_nz = my_box[2][1] - my_box[2][0] ;
+
+  const int my_use_nx = 2 * ghost + my_nx ;
+  const int my_use_ny = 2 * ghost + my_ny ;
+  const int my_use_nz = 2 * ghost + my_nz ;
+
+  const int id_length = my_use_nx * my_use_ny * my_use_nz ;
+
+  int * local_id  = (int *) malloc( id_length * sizeof(int) );
+  int * recv_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) );
+  int * send_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) );
+
+  int * send_id  = NULL ;
+  int   send_id_size = 0 ;
+
+  int iLocal , iSend ;
+  int i ;
+
+  Box my_use_box;
+
+  my_use_box[0][0] = my_box[0][0] - ghost ;
+  my_use_box[0][1] = my_box[0][1] + ghost ;
+  my_use_box[1][0] = my_box[1][0] - ghost ;
+  my_use_box[1][1] = my_box[1][1] + ghost ;
+  my_use_box[2][0] = my_box[2][0] - ghost ;
+  my_use_box[2][1] = my_box[2][1] + ghost ;
+
+  for ( i = 0 ; i < id_length ; ++i ) { local_id[i] = -1 ; }
+
+  iSend = 0 ;
+  iLocal = 0 ;
+
+  /* The vector space is partitioned by processors */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ip = ( i + my_p ) % np ;
+    recv_pc[i] = iLocal ;
+    send_pc[i] = iSend ;
+
+    if ( ! box_disjoint( my_use_box , pbox[ip] ) ) {
+      const int p_ix = pbox[ip][0][0] ;
+      const int p_iy = pbox[ip][1][0] ;
+      const int p_iz = pbox[ip][2][0] ;
+      const int p_ex = pbox[ip][0][1] ;
+      const int p_ey = pbox[ip][1][1] ;
+      const int p_ez = pbox[ip][2][1] ;
+
+      int local_x , local_y , local_z ;
+
+      /* Run the span of global cells that my processor uses */
+
+      for ( local_z = -ghost ; local_z < my_nz + ghost ; ++local_z ) {
+      for ( local_y = -ghost ; local_y < my_ny + ghost ; ++local_y ) {
+      for ( local_x = -ghost ; local_x < my_nx + ghost ; ++local_x ) {
+
+        const int global_z = local_z + my_iz ;
+        const int global_y = local_y + my_iy ;
+        const int global_x = local_x + my_ix ;
+
+        const int entry = 
+          box_map_local_entry(my_box,ghost,local_x,local_y,local_z);
+
+        if ( entry < 0 ) { abort(); }
+
+        if ( p_iz <= global_z && global_z < p_ez &&
+             p_iy <= global_y && global_y < p_ey &&
+             p_ix <= global_x && global_x < p_ex ) {
+
+          /* This ordinal is owned by processor 'ip' */
+
+          local_id[ entry ] = iLocal++ ;
+
+#if defined(DEBUG_PRINT)
+if ( my_p != ip ) {
+  fprintf(stdout,"  (%d,%d,%d) : P%d recv at local %d from P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+
+        /* If in my ownership and used by the other processor */
+        if ( my_p != ip &&
+             /* In my ownership: */
+             ( 0 <= local_z && local_z < my_nz &&
+               0 <= local_y && local_y < my_ny &&
+               0 <= local_x && local_x < my_nx ) &&
+             /* In other processors usage: */
+             ( p_iz - ghost <= global_z && global_z < p_ez + ghost &&
+               p_iy - ghost <= global_y && global_y < p_ey + ghost &&
+               p_ix - ghost <= global_x && global_x < p_ex + ghost ) ) {
+
+          resize_int( & send_id , & send_id_size , (iSend + 1) );
+          send_id[ iSend ] = local_id[ entry ] ;
+          ++iSend ;
+
+#if defined(DEBUG_PRINT)
+{
+  fprintf(stdout,"  (%d,%d,%d) : P%d send at local %d to P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+      }
+    }
+    }
+    }
+  }
+  recv_pc[np] = iLocal ;
+  send_pc[np] = iSend ;
+
+  *map_local_id  = local_id ;
+  *map_recv_pc   = recv_pc ;
+  *map_send_pc   = send_pc ;
+  *map_send_id   = send_id ;
+}
+
+void box_partition_rcb( const int np , 
+                        const int my_p ,
+                        const Box& root_box,
+                        const int ghost ,
+                        Box** pbox,
+                        int ** map_local_id ,
+                        int ** map_recv_pc ,
+                        int ** map_send_pc ,
+                        int ** map_send_id )
+{
+  *pbox = new Box[ np ];
+
+  box_partition( 0 , np , 2 , root_box , *pbox );
+
+  box_partition_maps( np , my_p , *pbox , ghost ,
+                      map_local_id , map_recv_pc , 
+                      map_send_pc , map_send_id );
+}
+
+/*--------------------------------------------------------------------*/
+
+#ifdef UNIT_TEST
+
+static int box_contain( const Box& a , const Box& b )
+{
+  return a[0][0] <= b[0][0] && b[0][1] <= a[0][1] &&
+         a[1][0] <= b[1][0] && b[1][1] <= a[1][1] &&
+         a[2][0] <= b[2][0] && b[2][1] <= a[2][1] ;
+}
+
+static void box_print( FILE * fp , const Box& a )
+{
+  fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }",
+                a[0][0] , a[0][1] ,  
+                a[1][0] , a[1][1] ,  
+                a[2][0] , a[2][1] );
+}
+
+static void test_box( const Box& box , const int np )
+{
+  const int ncell_box = box[0][1] * box[1][1] * box[2][1] ;
+  int ncell_total = 0 ;
+  int ncell_min = ncell_box ;
+  int ncell_max = 0 ;
+  std::vector<Box> pbox(np);
+  int i , j ;
+
+  box_partition( 0 , np , 2 , box , &pbox[0] );
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) *
+                      ( pbox[i][1][1] - pbox[i][1][0] ) *
+                      ( pbox[i][2][1] - pbox[i][2][0] );
+
+    if ( ! box_contain( box , pbox[i] ) ) {
+      fprintf(stdout,"  OUT OF BOUNDS pbox[%d/%d] = ",i,np);
+      box_print(stdout,pbox[i]);
+      fprintf(stdout,"\n");
+      abort();
+    }
+
+    for ( j = i + 1 ; j < np ; ++j ) {
+      if ( ! box_disjoint( pbox[i] , pbox[j] ) ) {
+        fprintf(stdout,"  NOT DISJOINT pbox[%d/%d] = ",i,np);
+        box_print(stdout, pbox[i]);
+        fprintf(stdout,"\n");
+        fprintf(stdout,"               pbox[%d/%d] = ",j,np);
+        box_print(stdout, pbox[j]);
+        fprintf(stdout,"\n");
+        abort();
+      }
+    }
+    ncell_total += ncell ;
+
+    if ( ncell_max < ncell ) { ncell_max = ncell ; }
+    if ( ncell < ncell_min ) { ncell_min = ncell ; }
+  }
+
+  if ( ncell_total != ncell_box ) {
+    fprintf(stdout,"  WRONG CELL COUNT NP = %d\n",np);
+    abort();
+  }
+  fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n",
+          np,ncell_box,ncell_box/np,ncell_min,ncell_max);
+}
+
+/*--------------------------------------------------------------------*/
+
+static void test_maps( const Box& root_box , const int np )
+{
+  const int ghost = 1 ;
+  const int nx_global = root_box[0][1] - root_box[0][0] ;
+  const int ny_global = root_box[1][1] - root_box[1][0] ;
+  int ieq , i , j ;
+  std::vector<Box> pbox(np);
+  int **local_values ;
+  int **map_local_id ;
+  int **map_recv_pc ;
+  int **map_send_pc ;
+  int **map_send_id ;
+
+  box_partition( 0 , np , 2 , root_box , &pbox[0] );
+
+  local_values = (int **) malloc( sizeof(int*) * np );
+  map_local_id = (int **) malloc( sizeof(int*) * np );
+  map_recv_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_id  = (int **) malloc( sizeof(int*) * np );
+
+  /* Set each local value to the global equation number */
+
+  for ( ieq = i = 0 ; i < np ; ++i ) {
+    const Box& mybox = pbox[i] ;
+    const int nx = mybox[0][1] - mybox[0][0] ;
+    const int ny = mybox[1][1] - mybox[1][0] ;
+    const int nz = mybox[2][1] - mybox[2][0] ;
+    int ix , iy , iz ;
+
+    /* Generate the partition maps for this rank */
+    box_partition_maps( np , i , &pbox[0] , ghost ,
+                        & map_local_id[i] , & map_recv_pc[i] , 
+                        & map_send_pc[i] , & map_send_id[i] );
+
+    local_values[i] = (int *) malloc( sizeof(int) * map_recv_pc[i][np] );
+
+    for ( iz = -ghost ; iz < nz + ghost ; ++iz ) {
+    for ( iy = -ghost ; iy < ny + ghost ; ++iy ) {
+    for ( ix = -ghost ; ix < nx + ghost ; ++ix ) {
+      const int ieq = box_map_local(mybox,ghost,map_local_id[i],ix,iy,iz);
+
+      if ( 0 <= ieq ) {
+        const int ix_global = ix + mybox[0][0] ;
+        const int iy_global = iy + mybox[1][0] ;
+        const int iz_global = iz + mybox[2][0] ;
+
+        if ( root_box[0][0] <= ix_global && ix_global < root_box[0][1] &&
+             root_box[1][0] <= iy_global && iy_global < root_box[1][1] &&
+             root_box[2][0] <= iz_global && iz_global < root_box[2][1] ) {
+
+          local_values[i][ ieq ] = ix_global +
+                                   iy_global * nx_global +
+                                   iz_global * nx_global * ny_global ;
+        }
+        else {
+          local_values[i][ ieq ] = -1 ;
+        }
+      }
+    }
+    }
+    }
+  }
+
+  /* Pair-wise compare the local values */
+  /* i  == receiving processor rank */
+  /* ip == sending   processor rank */
+  /* j  == receiving processor data entry for message from 'ip' */
+  /* jp == sending   processor data entry for message to   'i' */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    for ( j = 1 ; j < np ; ++j ) {
+      const int ip = ( i + j ) % np ;
+      const int jp = ( i + np - ip ) % np ;
+      const int nrecv = map_recv_pc[i] [j+1]  - map_recv_pc[i] [j] ;
+      const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ;
+      int k ;
+      if ( nrecv != nsend ) {
+        fprintf(stderr,"P%d recv %d from P%d\n",i,nrecv,ip);
+        fprintf(stderr,"P%d send %d to   P%d\n",ip,nsend,i);
+        abort();
+      }
+      for ( k = 0 ; k < nrecv ; ++k ) {
+        const int irecv = map_recv_pc[i][j] + k ;
+        const int isend = map_send_pc[ip][jp] + k ;
+        const int val_irecv = local_values[i][irecv] ;
+        const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ;
+        if ( val_irecv != val_isend ) {
+          fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip);
+          fprintf(stderr,"P%d send[%d] = %d , to   P%d\n",ip,k,val_isend,i);
+          abort();
+        }
+      }
+    }
+  }
+
+  for ( i = 0 ; i < np ; ++i ) {
+    free( map_local_id[i] );
+    free( map_recv_pc[i] );
+    free( map_send_pc[i] );
+    free( map_send_id[i] );
+    free( local_values[i] );
+  }
+  free( map_send_id );
+  free( map_send_pc );
+  free( map_recv_pc );
+  free( map_local_id );
+  free( local_values );
+}
+
+/*--------------------------------------------------------------------*/
+
+int main( int argc , char * argv[] )
+{
+  int np_max = 256 ;
+  Box box = { 0 , 64 , 0 , 64 , 0 , 64 };
+  int np = 0 ;
+
+  switch( argc ) {
+  case 3:
+    sscanf(argv[1],"%d",&np);
+    sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] );
+    if ( 0 < np ) { test_box( box , np ); }
+    if ( 0 < np ) { test_maps( box , np ); }
+    break ;
+  default:
+    for ( np = 1 ; np <= np_max ; ++np ) {
+      test_box( box , np );
+      test_maps( box , np );
+    }
+    break ;
+  }
+  return 0 ;
+}
+
+#endif
+
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CMakeLists.txt
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(PROG miniFE)
+list(APPEND CXXFLAGS "-I." "-DMINIFE_SCALAR=double"  "-DMINIFE_LOCAL_ORDINAL=int" "-DMINIFE_GLOBAL_ORDINAL=int" "-DMINIFE_CSR_MATRIX")
+list(APPEND LDFLAGS -lm)
+set(RUN_OPTIONS -nx 64 -ny 64 -nz 64)
+llvm_multisource()
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CSRMatrix.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CSRMatrix.hpp
@@ -0,0 +1,134 @@
+#ifndef _CSRMatrix_hpp_
+#define _CSRMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal>
+struct
+CSRMatrix {
+  CSRMatrix()
+   : has_local_indices(false),
+     rows(), row_offsets(), row_offsets_external(),
+     packed_cols(), packed_coefs(),
+     num_cols(0)
+#ifdef HAVE_MPI
+     ,external_index(), external_local_index(), elements_to_send(),
+      neighbors(), recv_length(), send_length(), send_buffer(), request()
+#endif
+  {
+  }
+
+  ~CSRMatrix()
+  {}
+
+  typedef Scalar        ScalarType;
+  typedef LocalOrdinal  LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+
+  bool                       has_local_indices;
+  std::vector<GlobalOrdinal> rows;
+  std::vector<LocalOrdinal>  row_offsets;
+  std::vector<LocalOrdinal>  row_offsets_external;
+  std::vector<GlobalOrdinal> packed_cols;
+  std::vector<Scalar>        packed_coefs;
+  LocalOrdinal               num_cols;
+
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> external_index;
+  std::vector<GlobalOrdinal>  external_local_index;
+  std::vector<GlobalOrdinal> elements_to_send;
+  std::vector<int>           neighbors;
+  std::vector<LocalOrdinal>  recv_length;
+  std::vector<LocalOrdinal>  send_length;
+  std::vector<Scalar>        send_buffer;
+  std::vector<MPI_Request>   request;
+#endif
+
+  size_t num_nonzeros() const
+  {
+    return row_offsets[row_offsets.size()-1];
+  }
+
+  void reserve_space(unsigned nrows, unsigned ncols_per_row)
+  {
+    rows.resize(nrows);
+    row_offsets.resize(nrows+1);
+    packed_cols.reserve(nrows * ncols_per_row);
+    packed_coefs.reserve(nrows * ncols_per_row);
+  }
+
+  void get_row_pointers(GlobalOrdinalType row, size_t& row_length,
+                        GlobalOrdinalType*& cols,
+                        ScalarType*& coefs)
+  {
+    ptrdiff_t local_row = -1;
+    //first see if we can get the local-row index using fast direct lookup:
+    if (rows.size() >= 1) {
+      ptrdiff_t idx = row - rows[0];
+      if (idx < rows.size() && rows[idx] == row) {
+        local_row = idx;
+      }
+    }
+ 
+    //if we didn't get the local-row index using direct lookup, try a
+    //more expensive binary-search:
+    if (local_row == -1) {
+      typename std::vector<GlobalOrdinal>::iterator row_iter =
+          std::lower_bound(rows.begin(), rows.end(), row);
+  
+      //if we still haven't found row, it's not local so jump out:
+      if (row_iter == rows.end() || *row_iter != row) {
+        row_length = 0;
+        return;
+      }
+  
+      local_row = row_iter - rows.begin();
+    }
+
+    LocalOrdinalType offset = row_offsets[local_row];
+    row_length = row_offsets[local_row+1] - offset;
+    cols = &packed_cols[offset];
+    coefs = &packed_coefs[offset];
+  }
+};
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/ELLMatrix.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/ELLMatrix.hpp
@@ -0,0 +1,139 @@
+#ifndef _ELLMatrix_hpp_
+#define _ELLMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal>
+struct
+ELLMatrix {
+  ELLMatrix()
+   : has_local_indices(false),
+     rows(),
+     cols(), coefs(),
+     num_cols(0),
+     num_cols_per_row(0)
+#ifdef HAVE_MPI
+     ,external_index(), external_local_index(), elements_to_send(),
+      neighbors(), recv_length(), send_length(), send_buffer(), request()
+#endif
+  {
+  }
+
+  ~ELLMatrix()
+  {}
+
+  typedef Scalar        ScalarType;
+  typedef LocalOrdinal  LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+
+  bool                       has_local_indices;
+  std::vector<GlobalOrdinal> rows;
+  std::vector<GlobalOrdinal> cols;
+  std::vector<Scalar>        coefs;
+  LocalOrdinal               num_cols;
+  LocalOrdinal               num_cols_per_row;
+
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> external_index;
+  std::vector<GlobalOrdinal>  external_local_index;
+  std::vector<GlobalOrdinal> elements_to_send;
+  std::vector<int>           neighbors;
+  std::vector<LocalOrdinal>  recv_length;
+  std::vector<LocalOrdinal>  send_length;
+  std::vector<Scalar>        send_buffer;
+  std::vector<MPI_Request>   request;
+#endif
+
+  size_t num_nonzeros() const
+  {
+    return rows.size()*num_cols_per_row;
+  }
+
+  void reserve_space(unsigned nrows, unsigned ncols_per_row)
+  {
+    rows.resize(nrows);
+    cols.resize(nrows * ncols_per_row);
+    coefs.resize(nrows * ncols_per_row);
+    num_cols_per_row = ncols_per_row;
+  }
+
+  void get_row_pointers(GlobalOrdinalType row, size_t& row_length,
+                        GlobalOrdinalType*& cols_ptr,
+                        ScalarType*& coefs_ptr)
+  {
+    ptrdiff_t local_row = -1;
+    //first see if we can get the local-row index using fast direct lookup:
+    if (rows.size() >= 1) {
+      ptrdiff_t idx = row - rows[0];
+      if (idx < rows.size() && rows[idx] == row) {
+        local_row = idx;
+      }
+    }
+ 
+    //if we didn't get the local-row index using direct lookup, try a
+    //more expensive binary-search:
+    if (local_row == -1) {
+      typename std::vector<GlobalOrdinal>::iterator row_iter =
+          std::lower_bound(rows.begin(), rows.end(), row);
+  
+      //if we still haven't found row, it's not local so jump out:
+      if (row_iter == rows.end() || *row_iter != row) {
+        row_length = 0;
+        return;
+      }
+  
+      local_row = row_iter - rows.begin();
+    }
+
+    cols_ptr = &cols[local_row*num_cols_per_row];
+    coefs_ptr = &coefs[local_row*num_cols_per_row];
+    
+    int idx = num_cols_per_row-1;
+    while(idx>=0) {
+      if (cols_ptr[idx] != 0) break;
+      --idx;
+    }
+    row_length = idx+1;
+  }
+};
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/ElemData.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/ElemData.hpp
@@ -0,0 +1,64 @@
+#ifndef _ElemData_hpp_
+#define _ElemData_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <Hex8_enums.hpp>
+
+namespace miniFE {
+
+template<typename GlobalOrdinal, typename Scalar>
+struct ElemData {
+  ElemData() : nodes_per_elem(Hex8::numNodesPerElem) {}
+  ~ElemData(){}
+
+  const size_t nodes_per_elem;
+  GlobalOrdinal elem_node_ids[Hex8::numNodesPerElem];
+  Scalar grad_vals[Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numNodesPerElem * Hex8::spatialDim];
+  Scalar elem_node_coords[Hex8::numNodesPerElem*Hex8::spatialDim];
+  Scalar elem_diffusion_matrix[(Hex8::numNodesPerElem*(Hex8::numNodesPerElem+1))/2];
+  Scalar elem_source_vector[Hex8::numNodesPerElem];
+};
+
+template<typename GlobalOrdinal, typename Scalar>
+struct ElemDataPtr {
+  ElemDataPtr() : nodes_per_elem(Hex8::numNodesPerElem) {}
+  ~ElemDataPtr(){}
+
+  const size_t nodes_per_elem;
+  GlobalOrdinal elem_node_ids[Hex8::numNodesPerElem];
+  Scalar grad_vals[Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numNodesPerElem * Hex8::spatialDim];
+  Scalar elem_node_coords[(Hex8::numNodesPerElem*(Hex8::spatialDim+1))/2];
+  Scalar* elem_diffusion_matrix;
+  Scalar* elem_source_vector;
+};
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/GetNodesCoords.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/GetNodesCoords.hpp
@@ -0,0 +1,51 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _GETNODESCOORDS_HPP_
+#define _GETNODESCOORDS_HPP_
+
+#include <Hex8_enums.hpp>
+#include <simple_mesh_description.hpp>
+
+template<typename GlobalOrdinal,typename Scalar>
+struct GetNodesCoords {
+  const miniFE::simple_mesh_description<GlobalOrdinal>* mesh;
+  GlobalOrdinal* elemIDs;
+  GlobalOrdinal* node_ordinals;
+  Scalar* elem_node_coords;
+
+inline void operator()(int i)
+{
+  unsigned nnodes = miniFE::Hex8::numNodesPerElem;
+  GlobalOrdinal elemID = elemIDs[i];
+  GlobalOrdinal* node_ords = node_ordinals+i*nnodes;
+  Scalar* node_coords = elem_node_coords+i*nnodes*miniFE::Hex8::spatialDim;
+  get_elem_nodes_and_coords(*mesh, elemID, node_ords, node_coords);
+}
+};
+
+#endif
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8.hpp
@@ -0,0 +1,417 @@
+#ifndef _Hex8_hpp_
+#define _Hex8_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KERNEL_PREFIX 
+#define KERNEL_PREFIX
+#endif
+
+#include <gauss_pts.hpp>
+#include <matrix_algebra_3x3.hpp>
+#include <Hex8_enums.hpp>
+
+namespace miniFE {
+
+namespace Hex8 {
+
+template<typename Scalar>
+KERNEL_PREFIX void shape_fns(const Scalar* x, Scalar* values_at_nodes)
+{
+  //assumptions: values_at_nodes has length numNodesPerElem
+  //             x has length 3 (hard-coded spatialDim)
+
+  const Scalar u = 1.0 - x[0];
+  const Scalar v = 1.0 - x[1];
+  const Scalar w = 1.0 - x[2];
+
+  const Scalar up1 = 1.0 + x[0];
+  const Scalar vp1 = 1.0 + x[1];
+  const Scalar wp1 = 1.0 + x[2];
+
+  values_at_nodes[0] = 0.125 *   u *   v *   w;//(1-x)*(1-y)*(1-z)
+  values_at_nodes[1] = 0.125 * up1 *   v *   w;//(1+x)*(1-y)*(1-z)
+  values_at_nodes[2] = 0.125 * up1 * vp1 *   w;//(1+x)*(1+y)*(1-z)
+  values_at_nodes[3] = 0.125 *   u * vp1 *   w;//(1-x)*(1+y)*(1-z)
+  values_at_nodes[4] = 0.125 *   u *   v * wp1;//(1-x)*(1-y)*(1+z)
+  values_at_nodes[5] = 0.125 * up1 *   v * wp1;//(1+x)*(1-y)*(1+z)
+  values_at_nodes[6] = 0.125 * up1 * vp1 * wp1;//(1+x)*(1+y)*(1+z)
+  values_at_nodes[7] = 0.125 *   u * vp1 * wp1;//(1-x)*(1+y)*(1+z)
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void gradients(const Scalar* x, Scalar* values_per_fn)
+{
+  //assumptions values_per_fn has length 24 (numNodesPerElem*spatialDim)
+  //        spatialDim == 3
+
+  const Scalar u = 1.0 - x[0];
+  const Scalar v = 1.0 - x[1];
+  const Scalar w = 1.0 - x[2];
+
+  const Scalar up1 = 1.0 + x[0];
+  const Scalar vp1 = 1.0 + x[1];
+  const Scalar wp1 = 1.0 + x[2];
+
+//fn 0
+  values_per_fn[0] = -0.125 *  v *  w;
+  values_per_fn[1] = -0.125 *  u *  w;
+  values_per_fn[2] = -0.125 *  u *  v;
+//fn 1
+  values_per_fn[3] =  0.125 *  v   *  w;
+  values_per_fn[4] = -0.125 *  up1 *  w;
+  values_per_fn[5] = -0.125 *  up1 *  v;
+//fn 2
+  values_per_fn[6] =  0.125 *  vp1 *  w;
+  values_per_fn[7] =  0.125 *  up1 *  w;
+  values_per_fn[8] = -0.125 *  up1 *  vp1;
+//fn 3
+  values_per_fn[9]  = -0.125 *  vp1 *  w;
+  values_per_fn[10] =  0.125 *  u   *  w;
+  values_per_fn[11] = -0.125 *  u   *  vp1;
+//fn 4
+  values_per_fn[12] = -0.125 *  v   * wp1;
+  values_per_fn[13] = -0.125 *  u   * wp1;
+  values_per_fn[14] =  0.125 *  u   * v;
+//fn 5
+  values_per_fn[15] =  0.125 *  v * wp1;
+  values_per_fn[16] = -0.125 *  up1 * wp1;
+  values_per_fn[17] =  0.125 *  up1 * v;
+//fn 6
+  values_per_fn[18] =  0.125 *  vp1 * wp1;
+  values_per_fn[19] =  0.125 *  up1 * wp1;
+  values_per_fn[20] =  0.125 *  up1 * vp1;
+//fn 7
+  values_per_fn[21] = -0.125 *  vp1 * wp1;
+  values_per_fn[22] =  0.125 *  u   * wp1;
+  values_per_fn[23] =  0.125 *  u   * vp1;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void gradients_and_detJ(const Scalar* elemNodeCoords,
+                                          const Scalar* grad_vals,
+                                          Scalar& detJ)
+{
+/**
+  pt is the point at which the jacobian is to be computed.
+*/
+
+  //assumptions on the lengths of input arguments:
+  //elemNodeCoords has length numNodesPerElem*spatialDim,
+  //grad_vals has length numNodesPerElem*spatialDim
+
+  const Scalar zero = 0;
+
+  Scalar J00 = zero;
+  Scalar J01 = zero;
+  Scalar J02 = zero;
+
+  Scalar J10 = zero;
+  Scalar J11 = zero;
+  Scalar J12 = zero;
+
+  Scalar J20 = zero;
+  Scalar J21 = zero;
+  Scalar J22 = zero;
+
+  size_t i_X_spatialDim = 0;
+  for(size_t i=0; i<numNodesPerElem; ++i) {
+//    size_t offset = 0;
+//    for(size_t gd=0; gd<spatialDim; ++gd) {
+//
+//      Scalar gval = grad_vals[i_X_spatialDim+gd];
+//
+//      for(size_t jd=0; jd<spatialDim; ++jd) {
+//        J[offset++] += gval*elemNodeCoords[i_X_spatialDim+jd];
+//      }
+//    }
+    //for optimization, unroll the above double-loop over spatialDim:
+    //(hard-coded assumption that spatialDim == 3)
+    J00 += grad_vals[i_X_spatialDim+0]*elemNodeCoords[i_X_spatialDim+0];
+    J01 += grad_vals[i_X_spatialDim+0]*elemNodeCoords[i_X_spatialDim+1];
+    J02 += grad_vals[i_X_spatialDim+0]*elemNodeCoords[i_X_spatialDim+2];
+
+    J10 += grad_vals[i_X_spatialDim+1]*elemNodeCoords[i_X_spatialDim+0];
+    J11 += grad_vals[i_X_spatialDim+1]*elemNodeCoords[i_X_spatialDim+1];
+    J12 += grad_vals[i_X_spatialDim+1]*elemNodeCoords[i_X_spatialDim+2];
+
+    J20 += grad_vals[i_X_spatialDim+2]*elemNodeCoords[i_X_spatialDim+0];
+    J21 += grad_vals[i_X_spatialDim+2]*elemNodeCoords[i_X_spatialDim+1];
+    J22 += grad_vals[i_X_spatialDim+2]*elemNodeCoords[i_X_spatialDim+2];
+
+    i_X_spatialDim += spatialDim;
+  }
+
+  Scalar term0 = J22*J11 - J21*J12;
+  Scalar term1 = J22*J01 - J21*J02;
+  Scalar term2 = J12*J01 - J11*J02;
+
+  detJ = J00*term0 - J10*term1 + J20*term2;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void gradients_and_invJ_and_detJ(const Scalar* elemNodeCoords,
+                                               const Scalar* grad_vals,
+                                               Scalar* invJ,
+                                               Scalar& detJ)
+{
+/**
+  pt is the point at which the jacobian is to be computed.
+*/
+
+  //assumptions on the lengths of input arguments:
+  //pt has length spatialDim,
+  //elemNodeCoords has length numNodesPerElem*spatialDim,
+  //grad_vals has length numNodesPerElem*spatialDim, and
+  //J has length spatialDim*spatialDim
+
+  const Scalar zero = 0;
+
+  //
+  //First we compute the jacobian J:
+  //
+  Scalar J00 = zero;
+  Scalar J01 = zero;
+  Scalar J02 = zero;
+
+  Scalar J10 = zero;
+  Scalar J11 = zero;
+  Scalar J12 = zero;
+
+  Scalar J20 = zero;
+  Scalar J21 = zero;
+  Scalar J22 = zero;
+
+  size_t i_X_spatialDim = 0;
+  for(size_t i=0; i<numNodesPerElem; ++i) {
+//    size_t offset = 0;
+//    for(size_t gd=0; gd<spatialDim; ++gd) {
+//
+//      Scalar gval = grad_vals[i_X_spatialDim+gd];
+//
+//      for(size_t jd=0; jd<spatialDim; ++jd) {
+//        J[offset++] += gval*elemNodeCoords[i_X_spatialDim+jd];
+//      }
+//    }
+    //for optimization, unroll the above double-loop over spatialDim:
+    //(a hard-coded assumption that spatialDim == 3)
+    J00 += grad_vals[i_X_spatialDim+0]*elemNodeCoords[i_X_spatialDim+0];
+    J01 += grad_vals[i_X_spatialDim+0]*elemNodeCoords[i_X_spatialDim+1];
+    J02 += grad_vals[i_X_spatialDim+0]*elemNodeCoords[i_X_spatialDim+2];
+
+    J10 += grad_vals[i_X_spatialDim+1]*elemNodeCoords[i_X_spatialDim+0];
+    J11 += grad_vals[i_X_spatialDim+1]*elemNodeCoords[i_X_spatialDim+1];
+    J12 += grad_vals[i_X_spatialDim+1]*elemNodeCoords[i_X_spatialDim+2];
+
+    J20 += grad_vals[i_X_spatialDim+2]*elemNodeCoords[i_X_spatialDim+0];
+    J21 += grad_vals[i_X_spatialDim+2]*elemNodeCoords[i_X_spatialDim+1];
+    J22 += grad_vals[i_X_spatialDim+2]*elemNodeCoords[i_X_spatialDim+2];
+
+    i_X_spatialDim += spatialDim;
+  }
+
+  Scalar term0 = J22*J11 - J21*J12;
+  Scalar term1 = J22*J01 - J21*J02;
+  Scalar term2 = J12*J01 - J11*J02;
+
+  detJ = J00*term0 - J10*term1 + J20*term2;
+
+  Scalar inv_detJ = 1.0/detJ;
+
+  invJ[0] =  term0*inv_detJ;
+  invJ[1] = -term1*inv_detJ;
+  invJ[2] =  term2*inv_detJ;
+
+  invJ[3] = -(J22*J10 - J20*J12)*inv_detJ;
+  invJ[4] =  (J22*J00 - J20*J02)*inv_detJ;
+  invJ[5] = -(J12*J00 - J10*J02)*inv_detJ;
+
+  invJ[6] =  (J21*J10 - J20*J11)*inv_detJ;
+  invJ[7] = -(J21*J00 - J20*J01)*inv_detJ;
+  invJ[8] =  (J11*J00 - J10*J01)*inv_detJ;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void diffusionMatrix_symm(const Scalar* elemNodeCoords,
+                        const Scalar* grad_vals,
+                        Scalar* elem_mat)
+{
+  int len = (numNodesPerElem * (numNodesPerElem+1))/2;
+  const Scalar zero = 0;
+  miniFE::fill(elem_mat, elem_mat+len, zero);
+
+  Scalar gpts[numGaussPointsPerDim];
+  Scalar gwts[numGaussPointsPerDim];
+
+  gauss_pts(numGaussPointsPerDim, gpts, gwts);
+
+  const Scalar k = 1.0;
+  Scalar detJ = 0.0;
+
+  Scalar dpsidx[numNodesPerElem], dpsidy[numNodesPerElem], dpsidz[numNodesPerElem];
+
+  Scalar invJ[spatialDim*spatialDim];
+
+  //The following nested loop implements equations 3.4.5 and 3.4.7 on page 88
+  //of Reddy & Gartling, "The Finite Element Method in Heat Transfer and Fluid
+  //Dynamics", 2nd edition,
+  //to compute the element diffusion matrix for the steady conduction equation.
+
+  Scalar pt[spatialDim];
+
+#ifdef MINIFE_DEBUG
+  Scalar volume = zero;
+#endif
+
+  size_t gv_offset = 0;
+  for(size_t ig=0; ig<numGaussPointsPerDim; ++ig) {
+    Scalar wi = gwts[ig];
+
+    for(size_t jg=0; jg<numGaussPointsPerDim; ++jg) {
+      Scalar wi_wj = wi*gwts[jg];
+
+      for(size_t kg=0; kg<numGaussPointsPerDim; ++kg) {
+        Scalar wi_wj_wk = wi_wj*gwts[kg];
+        const Scalar* grad_vals_ptr = &grad_vals[gv_offset];
+        gv_offset += numNodesPerElem*spatialDim;
+        gradients_and_invJ_and_detJ(elemNodeCoords, grad_vals_ptr, invJ, detJ);
+
+#ifdef MINIFE_DEBUG
+        volume += detJ;
+#endif
+        Scalar k_detJ_wi_wj_wk = k*detJ*wi_wj_wk;
+
+        const Scalar* gv = grad_vals_ptr;
+        for(int i=0; i<numNodesPerElem; ++i) {
+          Scalar gv0 = gv[0], gv1 = gv[1], gv2 = gv[2];
+          dpsidx[i] = gv0 * invJ[0] +
+                      gv1 * invJ[1] +
+                      gv2 * invJ[2];
+          dpsidy[i] = gv0 * invJ[3] +
+                      gv1 * invJ[4] +
+                      gv2 * invJ[5];
+          dpsidz[i] = gv0 * invJ[6] +
+                      gv1 * invJ[7] +
+                      gv2 * invJ[8];
+          gv += spatialDim;
+        }
+
+        int offset = 0;
+        for(int m=0; m<numNodesPerElem; ++m) {
+          const Scalar dpsidx_m = dpsidx[m];
+          const Scalar dpsidy_m = dpsidy[m];
+          const Scalar dpsidz_m = dpsidz[m];
+
+          elem_mat[offset++] += k_detJ_wi_wj_wk *
+                              ((dpsidx_m*dpsidx_m) +
+                               (dpsidy_m*dpsidy_m) +
+                               (dpsidz_m*dpsidz_m));
+
+          for(int n=m+1; n<numNodesPerElem; ++n) {
+            elem_mat[offset++] += k_detJ_wi_wj_wk *
+                                  ((dpsidx_m * dpsidx[n]) +
+                                   (dpsidy_m * dpsidy[n]) +
+                                   (dpsidz_m * dpsidz[n]));
+          }
+        }
+
+      }//for kg
+    }//for jg
+  }//for ig
+
+//int offset = 0;
+//std::cout.precision(16);
+//for(int m=0; m<numNodesPerElem; ++m) {
+//  for(int n=m; n<numNodesPerElem; ++n) {
+//std::cout<<"elem_mat["<<offset<<"] = "<<elem_mat[offset]<<";"<<std::endl;
+//   ++offset;
+//  }
+//}
+#ifdef MINIFE_DEBUG
+//  std::cout << "element volume: " << volume << std::endl;
+//  if (std::abs(volume - 1) > 1.e-7) {
+//    std::cout << "element volume is "<<volume<<", expected 1.0."<<std::endl;
+//  }
+#endif
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void sourceVector(const Scalar* elemNodeCoords,
+                                const Scalar* grad_vals,
+                                Scalar* elem_vec)
+{
+  int len = numNodesPerElem;
+  const Scalar zero = 0;
+  miniFE::fill(elem_vec, elem_vec+len, zero);
+
+  Scalar gpts[numGaussPointsPerDim];
+  Scalar gwts[numGaussPointsPerDim];
+
+  Scalar psi[numNodesPerElem];
+
+  gauss_pts(numGaussPointsPerDim, gpts, gwts);
+
+  Scalar Q = 1.0;
+
+  Scalar pt[spatialDim];
+
+  size_t gv_offset = 0;
+  for(size_t ig=0; ig<numGaussPointsPerDim; ++ig) {
+    pt[0] = gpts[ig];
+    Scalar wi = gwts[ig];
+
+    for(size_t jg=0; jg<numGaussPointsPerDim; ++jg) {
+      pt[1] = gpts[jg];
+      Scalar wj = gwts[jg];
+
+      for(size_t kg=0; kg<numGaussPointsPerDim; ++kg) {
+        pt[2] = gpts[kg];
+        Scalar wk = gwts[kg];
+    
+        shape_fns(pt, psi);
+        const Scalar* grad_vals_ptr = &grad_vals[gv_offset];
+        gv_offset += numNodesPerElem*spatialDim;
+        Scalar detJ;
+        gradients_and_detJ(elemNodeCoords, grad_vals_ptr, detJ);
+    
+        Scalar term = Q*detJ*wi*wj*wk;
+
+        for(int i=0; i<numNodesPerElem; ++i) {
+          elem_vec[i] += psi[i]*term;
+        }
+      }
+    }
+  }
+}
+
+}//namespace Hex8
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8_ElemData.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8_ElemData.hpp
@@ -0,0 +1,86 @@
+#ifndef _Hex8_ElemData_hpp_
+#define _Hex8_ElemData_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <Hex8_enums.hpp>
+#include <Hex8.hpp>
+#include <ElemData.hpp>
+
+namespace miniFE {
+
+template<typename Scalar>
+void compute_gradient_values(Scalar* grad_vals)
+{
+  Scalar gpts[Hex8::numGaussPointsPerDim];
+  Scalar gwts[Hex8::numGaussPointsPerDim];
+
+  gauss_pts(Hex8::numGaussPointsPerDim, gpts, gwts);
+
+  Scalar pt[Hex8::spatialDim];
+
+  Scalar* grad_vals_ptr = grad_vals;
+  for(size_t ig=0; ig<Hex8::numGaussPointsPerDim; ++ig) {
+    pt[0] = gpts[ig];
+    for(size_t jg=0; jg<Hex8::numGaussPointsPerDim; ++jg) {
+      pt[1] = gpts[jg];
+      for(size_t kg=0; kg<Hex8::numGaussPointsPerDim; ++kg) {
+        pt[2] = gpts[kg];
+
+        Hex8::gradients(pt, grad_vals_ptr);
+
+        grad_vals_ptr += Hex8::numNodesPerElem*Hex8::spatialDim;
+      }
+    }
+  }
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+void
+compute_element_matrix_and_vector(ElemData<GlobalOrdinal,Scalar>& elem_data)
+{
+  Hex8::diffusionMatrix_symm(elem_data.elem_node_coords, elem_data.grad_vals,
+                             elem_data.elem_diffusion_matrix);
+  Hex8::sourceVector(elem_data.elem_node_coords, elem_data.grad_vals,
+                     elem_data.elem_source_vector);
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+void
+compute_element_matrix_and_vector(ElemDataPtr<GlobalOrdinal,Scalar>& elem_data)
+{
+  Hex8::diffusionMatrix_symm(elem_data.elem_node_coords, elem_data.grad_vals,
+                             elem_data.elem_diffusion_matrix);
+  Hex8::sourceVector(elem_data.elem_node_coords, elem_data.grad_vals,
+                     elem_data.elem_source_vector);
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8_box_utils.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8_box_utils.hpp
@@ -0,0 +1,173 @@
+#ifndef _Hex8_box_utils_hpp_
+#define _Hex8_box_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <stdexcept>
+
+#include <box_utils.hpp>
+#include <ElemData.hpp>
+#include <simple_mesh_description.hpp>
+#include <Hex8.hpp>
+
+namespace miniFE {
+
+
+template<typename GlobalOrdinal>
+void get_hex8_node_ids(int nx, int ny,
+                       GlobalOrdinal node0,
+                       GlobalOrdinal* elem_node_ids)
+{
+//Given box dimensions nx and ny, and a starting node
+//(local-node-0 for a hex8), compute the other nodes
+//of the hex8 using the exodus ordering convention.
+  elem_node_ids[0] = node0;
+  elem_node_ids[1] = node0 + 1;
+  elem_node_ids[2] = node0 + nx + 1;
+  elem_node_ids[3] = node0 + nx;
+  elem_node_ids[4] = node0 +     nx*ny;
+  elem_node_ids[5] = node0 + 1 + nx*ny;
+  elem_node_ids[6] = node0 + nx + nx*ny + 1;
+  elem_node_ids[7] = node0 + nx + nx*ny;
+}
+
+template<typename Scalar>
+void get_hex8_node_coords_3d(Scalar x, Scalar y, Scalar z,
+                             Scalar hx, Scalar hy, Scalar hz,
+                             Scalar* elem_node_coords)
+{
+  //Input: x,y,z are the coordinates of local-node 0 for a Hex8.
+  //'hx', 'hy', 'hz' are the lengths of the sides of the element
+  //in each direction.
+
+  elem_node_coords[0] = x;
+  elem_node_coords[1] = y;
+  elem_node_coords[2] = z;
+
+  elem_node_coords[3] = x + hx;
+  elem_node_coords[4] = y;
+  elem_node_coords[5] = z;
+
+  elem_node_coords[6] = x + hx;
+  elem_node_coords[7] = y + hy;
+  elem_node_coords[8] = z;
+
+  elem_node_coords[9]  = x;
+  elem_node_coords[10] = y + hy;
+  elem_node_coords[11] = z;
+
+  elem_node_coords[12] = x;
+  elem_node_coords[13] = y;
+  elem_node_coords[14] = z + hz;
+
+  elem_node_coords[15] = x + hx;
+  elem_node_coords[16] = y;
+  elem_node_coords[17] = z + hz;
+
+  elem_node_coords[18] = x + hx;
+  elem_node_coords[19] = y + hy;
+  elem_node_coords[20] = z + hz;
+
+  elem_node_coords[21] = x;
+  elem_node_coords[22] = y + hy;
+  elem_node_coords[23] = z + hz;
+}
+
+template<typename GlobalOrdinal, typename Scalar>
+void
+get_elem_nodes_and_coords(const simple_mesh_description<GlobalOrdinal>& mesh,
+                          GlobalOrdinal elemID,
+                          GlobalOrdinal* node_ords, Scalar* node_coords)
+{
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+ 
+  if (elemID < 0) {
+    //I don't think this can happen, but check for the sake of paranoia...
+    throw std::runtime_error("get_elem_nodes_and_coords ERROR, negative elemID");
+  }
+
+  int elem_int_x, elem_int_y, elem_int_z;
+  get_int_coords(elemID, global_nodes_x-1, global_nodes_y-1, global_nodes_z-1,
+             elem_int_x, elem_int_y, elem_int_z);
+  GlobalOrdinal nodeID = get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z, elem_int_x, elem_int_y, elem_int_z);
+
+#ifdef MINIFE_DEBUG_VERBOSE
+  std::cout<<"\nelemID: "<<elemID<<", nodeID: "<<nodeID<<std::endl;
+#endif
+  get_hex8_node_ids(global_nodes_x, global_nodes_y, nodeID, node_ords);
+
+  //Map node-IDs to rows because each processor may have a non-contiguous block of
+  //node-ids, but needs a contiguous block of row-numbers:
+#ifdef MINIFE_DEBUG_VERBOSE
+  std::cout<<"elem "<<elemID<<" nodes: ";
+#endif
+  for(int i=0; i<Hex8::numNodesPerElem; ++i) {
+#ifdef MINIFE_DEBUG_VERBOSE
+    std::cout<<node_ords[i]<<" ";
+#endif
+    node_ords[i] = mesh.map_id_to_row(node_ords[i]);
+  }
+#ifdef MINIFE_DEBUG_VERBOSE
+  std::cout << std::endl;
+#endif
+
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+ 
+  Scalar ix,iy,iz;
+  get_coords<GlobalOrdinal,Scalar>(nodeID, global_nodes_x,global_nodes_y,global_nodes_z,
+                            ix,iy,iz);
+  Scalar hx = 1.0/global_elems_x;
+  Scalar hy = 1.0/global_elems_y;
+  Scalar hz = 1.0/global_elems_z;
+  get_hex8_node_coords_3d(ix, iy, iz, hx, hy, hz, node_coords);
+#ifdef MINIFE_DEBUG_VERBOSE
+  int offset = 0;
+  for(int i=0; i<Hex8::numNodesPerElem; ++i) {
+    std::cout << "("<<node_coords[offset++]<<","<<node_coords[offset++]<<","<<node_coords[offset++]<<")";
+  }
+  std::cout << std::endl;
+#endif
+}
+
+template<typename GlobalOrdinal, typename Scalar>
+void
+get_elem_nodes_and_coords(const simple_mesh_description<GlobalOrdinal>& mesh,
+                          GlobalOrdinal elemID,
+                          ElemData<GlobalOrdinal,Scalar>& elem_data)
+{
+  get_elem_nodes_and_coords(mesh, elemID, elem_data.elem_node_ids, elem_data.elem_node_coords);
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8_enums.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Hex8_enums.hpp
@@ -0,0 +1,52 @@
+#ifndef _Hex8_enums_hpp_
+#define _Hex8_enums_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+namespace miniFE {
+
+namespace Hex8 {
+
+//   !!!!!!!
+//Important note: there are places in miniFE code where
+//loops over spatialDim are unrolled (spatialDim is assumed to be 3).
+//Thus, changing this enum is not enough to make miniFE code
+//work for spatialDim values other than 3.
+//   !!!!!!!
+enum {
+  spatialDim = 3,
+  numNodesPerElem = 8,
+  numGaussPointsPerDim = 2
+};
+
+}//namespace Hex8
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Makefile
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Makefile
@@ -0,0 +1,7 @@
+LEVEL = ../../../..
+
+PROG     = miniFE
+CXXFLAGS = "-I." "-DMINIFE_SCALAR=double"  "-DMINIFE_LOCAL_ORDINAL=int" "-DMINIFE_GLOBAL_ORDINAL=int" "-DMINIFE_CSR_MATRIX"
+LDFLAGS  = -lm
+RUN_OPTIONS = -nx 64 -ny 64 -nz 64
+include $(LEVEL)/MultiSource/Makefile.multisrc
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/MatrixCopyOp.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/MatrixCopyOp.hpp
@@ -0,0 +1,60 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _MatrixCopyOp_hpp_
+#define _MatrixCopyOp_hpp_
+
+template<typename MatrixType>
+struct MatrixCopyOp {
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  typedef typename MatrixType::ScalarType ScalarType;
+
+  const GlobalOrdinalType* src_rows;
+  const LocalOrdinalType*  src_rowoffsets;
+  const GlobalOrdinalType* src_cols;
+  const ScalarType*        src_coefs;
+
+  GlobalOrdinalType* dest_rows;
+  LocalOrdinalType*  dest_rowoffsets;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;
+
+  inline void operator()(int i)
+  {
+    dest_rows[i] = src_rows[i];
+    dest_rowoffsets[i] = src_rowoffsets[i];
+    for(int j=src_rowoffsets[i]; j<src_rowoffsets[i+1]; ++j) {
+      dest_cols[j] = src_cols[j];
+      dest_coefs[j] = src_coefs[j];
+    }
+  }
+};
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/MatrixInitOp.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/MatrixInitOp.hpp
@@ -0,0 +1,231 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _MatrixInitOp_hpp_
+#define _MatrixInitOp_hpp_
+
+#include <simple_mesh_description.hpp>
+#include <box_utils.hpp>
+
+#include <CSRMatrix.hpp>
+#include <ELLMatrix.hpp>
+
+#include <algorithm>
+
+template<typename GlobalOrdinal>
+void sort_if_needed(GlobalOrdinal* list,
+                    GlobalOrdinal list_len)
+{
+  bool need_to_sort = false;
+  for(GlobalOrdinal i=list_len-1; i>=1; --i) {
+    if (list[i] < list[i-1]) {
+      need_to_sort = true;
+      break;
+    }
+  }
+
+  if (need_to_sort) {
+    std::sort(list,list+list_len);
+  }
+}
+
+template<typename MatrixType>
+struct MatrixInitOp {
+};
+
+template<>
+struct MatrixInitOp<miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL> > {
+  MatrixInitOp(const std::vector<MINIFE_GLOBAL_ORDINAL>& rows_vec,
+               const std::vector<MINIFE_LOCAL_ORDINAL>& row_offsets_vec,
+               const std::vector<int>& row_coords_vec,
+               int global_nx, int global_ny, int global_nz,
+               MINIFE_GLOBAL_ORDINAL global_n_rows,
+               const miniFE::simple_mesh_description<MINIFE_GLOBAL_ORDINAL>& input_mesh,
+               miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL>& matrix)
+   : rows(&rows_vec[0]),
+     row_offsets(&row_offsets_vec[0]),
+     row_coords(&row_coords_vec[0]),
+     global_nodes_x(global_nx),
+     global_nodes_y(global_ny),
+     global_nodes_z(global_nz),
+     global_nrows(global_n_rows),
+     mesh(&input_mesh),
+     dest_rows(&matrix.rows[0]),
+     dest_rowoffsets(&matrix.row_offsets[0]),
+     dest_cols(&matrix.packed_cols[0]),
+     dest_coefs(&matrix.packed_coefs[0]),
+     n(matrix.rows.size())
+  {
+    if (matrix.packed_cols.capacity() != matrix.packed_coefs.capacity()) {
+      std::cout<<"Warning, packed_cols.capacity ("<<matrix.packed_cols.capacity()<<") != "
+        << "packed_coefs.capacity ("<<matrix.packed_coefs.capacity()<<")"<<std::endl;
+    }
+
+    size_t nnz = row_offsets_vec[n];
+    if (matrix.packed_cols.capacity() < nnz) {
+      std::cout<<"Warning, packed_cols.capacity ("<<matrix.packed_cols.capacity()<<") < "
+        " nnz ("<<nnz<<")"<<std::endl;
+    }
+
+    matrix.packed_cols.resize(nnz);
+    matrix.packed_coefs.resize(nnz);
+    dest_rowoffsets[n] = nnz;
+#ifdef HAVE_MPI 
+   MPI_Comm_rank(MPI_COMM_WORLD, &proc);
+#else
+   proc = 0;
+#endif
+  }
+
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  const GlobalOrdinalType* rows;
+  const LocalOrdinalType*  row_offsets;
+  const int*               row_coords;
+
+  int global_nodes_x;
+  int global_nodes_y;
+  int global_nodes_z;
+
+  GlobalOrdinalType global_nrows;
+
+  GlobalOrdinalType* dest_rows;
+  LocalOrdinalType*  dest_rowoffsets;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;
+  int proc;
+
+  const miniFE::simple_mesh_description<GlobalOrdinalType>* mesh;
+
+  inline void operator()(int i)
+  {
+    dest_rows[i] = rows[i];
+    int offset = row_offsets[i];
+    dest_rowoffsets[i] = offset;
+    int ix = row_coords[i*3];
+    int iy = row_coords[i*3+1];
+    int iz = row_coords[i*3+2];
+    GlobalOrdinalType nnz = 0;
+    for(int sz=-1; sz<=1; ++sz) {
+      for(int sy=-1; sy<=1; ++sy) {
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinalType col_id =
+              miniFE::get_id<GlobalOrdinalType>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                   ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) {
+            GlobalOrdinalType col = mesh->map_id_to_row(col_id);
+            if (col >= global_nrows) {
+              std::cout << "mesh->map_id_to_row produced col="<<col<<" from col_id="<<col_id<<", but global_nrows="<<global_nrows<<", max_row_in_map="<<mesh->max_row_in_map()<<", proc="<<proc<<std::endl;
+            }
+            dest_cols[offset+nnz] = col;
+            dest_coefs[offset+nnz] = 0;
+            ++nnz;
+          }
+        }
+      }
+    }
+
+    sort_if_needed(&dest_cols[offset], nnz);
+  }
+};
+
+template<>
+struct MatrixInitOp<miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL> > {
+  MatrixInitOp(const std::vector<MINIFE_GLOBAL_ORDINAL>& rows_vec,
+               const std::vector<MINIFE_LOCAL_ORDINAL>& /*row_offsets_vec*/,
+               const std::vector<int>& row_coords_vec,
+               int global_nx, int global_ny, int global_nz,
+               MINIFE_GLOBAL_ORDINAL global_n_rows,
+               const miniFE::simple_mesh_description<MINIFE_GLOBAL_ORDINAL>& input_mesh,
+               miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL>& matrix)
+   : rows(&rows_vec[0]),
+     row_coords(&row_coords_vec[0]),
+     global_nodes_x(global_nx),
+     global_nodes_y(global_ny),
+     global_nodes_z(global_nz),
+     global_nrows(global_n_rows),
+     mesh(&input_mesh),
+     dest_rows(&matrix.rows[0]),
+     dest_cols(&matrix.cols[0]),
+     dest_coefs(&matrix.coefs[0]),
+     n(matrix.rows.size()),
+     ncols_per_row(matrix.num_cols_per_row)
+  {
+  }
+
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  const GlobalOrdinalType* rows;
+  const int*               row_coords;
+
+  int global_nodes_x;
+  int global_nodes_y;
+  int global_nodes_z;
+
+  GlobalOrdinalType global_nrows;
+
+  GlobalOrdinalType* dest_rows;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;
+  int ncols_per_row;
+
+  const miniFE::simple_mesh_description<GlobalOrdinalType>* mesh;
+
+  inline void operator()(int i)
+  {
+    dest_rows[i] = rows[i];
+    int offset = i*ncols_per_row;
+    int ix = row_coords[i*3];
+    int iy = row_coords[i*3+1];
+    int iz = row_coords[i*3+2];
+    GlobalOrdinalType nnz = 0;
+    for(int sz=-1; sz<=1; ++sz)
+      for(int sy=-1; sy<=1; ++sy)
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinalType col_id =
+              miniFE::get_id<GlobalOrdinalType>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                   ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) {
+            GlobalOrdinalType col = mesh->map_id_to_row(col_id);
+            dest_cols[offset+nnz] = col;
+            dest_coefs[offset+nnz] = 0;
+            ++nnz;
+          }
+        }
+
+    sort_if_needed(&dest_cols[offset], nnz);
+  }
+};
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Parameters.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Parameters.hpp
@@ -0,0 +1,64 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _parameters_hpp_
+#define _parameters_hpp_
+
+#include <string>
+
+namespace miniFE {
+
+struct Parameters {
+  Parameters()
+   : nx(5), ny(nx), nz(nx), numthreads(1),
+     mv_overlap_comm_comp(0), use_locking(0),
+     load_imbalance(0), name(), elem_group_size(1),
+     use_elem_mat_fields(1), verify_solution(0),
+     device(0),num_devices(2),skip_device(9999),numa(1)
+  {}
+
+  int nx;
+  int ny;
+  int nz;
+  int numthreads;
+  int mv_overlap_comm_comp;
+  int use_locking;
+  float load_imbalance;
+  std::string name;
+  int elem_group_size;
+  int use_elem_mat_fields;
+  int verify_solution;
+  int device;
+  int num_devices;
+  int skip_device;
+  int numa;
+};//struct Parameters
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/SparseMatrix_functions.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/SparseMatrix_functions.hpp
@@ -0,0 +1,680 @@
+#ifndef _SparseMatrix_functions_hpp_
+#define _SparseMatrix_functions_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <set>
+#include <algorithm>
+#include <sstream>
+#include <fstream>
+
+#include <Vector.hpp>
+#include <Vector_functions.hpp>
+#include <ElemData.hpp>
+#include <MatrixInitOp.hpp>
+#include <MatrixCopyOp.hpp>
+#include <exchange_externals.hpp>
+#include <mytimer.hpp>
+
+#ifdef MINIFE_HAVE_TBB
+#include <LockingMatrix.hpp>
+#endif
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename MatrixType>
+void init_matrix(MatrixType& M,
+                 const std::vector<typename MatrixType::GlobalOrdinalType>& rows,
+                 const std::vector<typename MatrixType::LocalOrdinalType>& row_offsets,
+                 const std::vector<int>& row_coords,
+                 int global_nodes_x,
+                 int global_nodes_y,
+                 int global_nodes_z,
+                 typename MatrixType::GlobalOrdinalType global_nrows,
+                 const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh)
+{
+  MatrixInitOp<MatrixType> mat_init(rows, row_offsets, row_coords,
+                                 global_nodes_x, global_nodes_y, global_nodes_z,
+                                 global_nrows, mesh, M);
+
+  for(int i=0; i<mat_init.n; ++i) {
+    mat_init(i);
+  }
+}
+
+template<typename T,
+         typename U>
+void sort_with_companions(ptrdiff_t len, T* array, U* companions)
+{
+  ptrdiff_t i, j, index;
+  U companion;
+
+  for (i=1; i < len; i++) {
+    index = array[i];
+    companion = companions[i];
+    j = i;
+    while ((j > 0) && (array[j-1] > index))
+    {
+      array[j] = array[j-1];
+      companions[j] = companions[j-1];
+      j = j - 1;
+    }
+    array[j] = index;
+    companions[j] = companion;
+  }
+}
+
+template<typename MatrixType>
+void write_matrix(const std::string& filename, 
+                  MatrixType& mat)
+{
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::ScalarType ScalarType;
+
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  std::ostringstream osstr;
+  osstr << filename << "." << numprocs << "." << myproc;
+  std::string full_name = osstr.str();
+  std::ofstream ofs(full_name.c_str());
+
+  size_t nrows = mat.rows.size();
+  size_t nnz = mat.num_nonzeros();
+
+  for(int p=0; p<numprocs; ++p) {
+    if (p == myproc) {
+      if (p == 0) {
+        ofs << nrows << " " << nnz << std::endl;
+      }
+      for(size_t i=0; i<nrows; ++i) {
+        size_t row_len = 0;
+        GlobalOrdinalType* cols = NULL;
+        ScalarType* coefs = NULL;
+        mat.get_row_pointers(mat.rows[i], row_len, cols, coefs);
+
+        for(size_t j=0; j<row_len; ++j) {
+          ofs << mat.rows[i] << " " << cols[j] << " " << coefs[j] << std::endl;
+        }
+      }
+    }
+#ifdef HAVE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  }
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+void
+sum_into_row(int row_len,
+             GlobalOrdinal* row_indices,
+             Scalar* row_coefs,
+             int num_inputs,
+             const GlobalOrdinal* input_indices,
+             const Scalar* input_coefs)
+{
+  for(size_t i=0; i<num_inputs; ++i) {
+    GlobalOrdinal* loc = std::lower_bound(row_indices, row_indices+row_len,
+                                          input_indices[i]);
+    if (loc-row_indices < row_len && *loc == input_indices[i]) {
+//if(flag && *loc==6)
+//std::cout<<"  ("<<*loc<<":"<<row_coefs[loc-row_indices]<<" += "<<input_coefs[i]<<")"<<std::endl;
+      row_coefs[loc-row_indices] += input_coefs[i];
+    }
+  }
+}
+
+template<typename MatrixType>
+void
+sum_into_row(typename MatrixType::GlobalOrdinalType row,
+             size_t num_indices,
+             const typename MatrixType::GlobalOrdinalType* col_inds,
+             const typename MatrixType::ScalarType* coefs,
+             MatrixType& mat)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  size_t row_len = 0;
+  GlobalOrdinal* mat_row_cols = NULL;
+  Scalar* mat_row_coefs = NULL;
+
+  mat.get_row_pointers(row, row_len, mat_row_cols, mat_row_coefs);
+  if (row_len == 0) return;
+
+  sum_into_row(row_len, mat_row_cols, mat_row_coefs, num_indices, col_inds, coefs);
+}
+
+template<typename MatrixType>
+void
+sum_in_symm_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   MatrixType& mat)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+//indices is length num (which should be nodes-per-elem)
+//coefs is the upper triangle of the element diffusion matrix
+//which should be length num*(num+1)/2
+//std::cout<<std::endl;
+
+  int row_offset = 0;
+  bool flag = false;
+  for(size_t i=0; i<num; ++i) {
+    GlobalOrdinal row = indices[i];
+ 
+    const Scalar* row_coefs = &coefs[row_offset];
+    const GlobalOrdinal* row_col_inds = &indices[i];
+    size_t row_len = num - i;
+    row_offset += row_len;
+
+    size_t mat_row_len = 0;
+    GlobalOrdinal* mat_row_cols = NULL;
+    Scalar* mat_row_coefs = NULL;
+  
+    mat.get_row_pointers(row, mat_row_len, mat_row_cols, mat_row_coefs);
+    if (mat_row_len == 0) continue;
+
+    sum_into_row(mat_row_len, mat_row_cols, mat_row_coefs,
+                 row_len, row_col_inds, row_coefs);
+
+    int offset = i;
+    for(size_t j=0; j<i; ++j) {
+      Scalar coef = coefs[offset];
+//std::cout<<"i: "<<i<<", j: "<<j<<", offset: "<<offset<<std::endl;
+      sum_into_row(mat_row_len, mat_row_cols, mat_row_coefs,
+                   1, &indices[j], &coef);
+      offset += num - (j+1);
+    }
+  }
+}
+
+template<typename MatrixType>
+void
+sum_in_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   MatrixType& mat)
+{
+  size_t offset = 0;
+
+  for(size_t i=0; i<num; ++i) {
+    sum_into_row(indices[i], num,
+                 &indices[0], &coefs[offset], mat);
+    offset += num;
+  }
+}
+
+template<typename GlobalOrdinal, typename Scalar,
+         typename MatrixType, typename VectorType>
+void
+sum_into_global_linear_system(ElemData<GlobalOrdinal,Scalar>& elem_data,
+                              MatrixType& A, VectorType& b)
+{
+  sum_in_symm_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                     elem_data.elem_diffusion_matrix, A);
+  sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                  elem_data.elem_source_vector, b);
+}
+
+#ifdef MINIFE_HAVE_TBB
+template<typename MatrixType>
+void
+sum_in_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   LockingMatrix<MatrixType>& mat)
+{
+  size_t offset = 0;
+
+  for(size_t i=0; i<num; ++i) {
+    mat.sum_in(indices[i], num, &indices[0], &coefs[offset]);
+    offset += num;
+  }
+}
+
+template<typename GlobalOrdinal, typename Scalar,
+         typename MatrixType, typename VectorType>
+void
+sum_into_global_linear_system(ElemData<GlobalOrdinal,Scalar>& elem_data,
+                              LockingMatrix<MatrixType>& A, LockingVector<VectorType>& b)
+{
+  sum_in_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                     elem_data.elem_diffusion_matrix, A);
+  sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                  elem_data.elem_source_vector, b);
+}
+#endif
+
+template<typename MatrixType>
+void
+add_to_diagonal(typename MatrixType::ScalarType value, MatrixType& mat)
+{
+  for(size_t i=0; i<mat.rows.size(); ++i) {
+    sum_into_row(mat.rows[i], 1, &mat.rows[i], &value, mat);
+  }
+}
+
+template<typename MatrixType>
+double
+parallel_memory_overhead_MB(const MatrixType& A)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  double mem_MB = 0;
+
+#ifdef HAVE_MPI
+  double invMB = 1.0/(1024*1024);
+  mem_MB = invMB*A.external_index.size()*sizeof(GlobalOrdinal);
+  mem_MB += invMB*A.external_local_index.size()*sizeof(GlobalOrdinal);
+  mem_MB += invMB*A.elements_to_send.size()*sizeof(GlobalOrdinal);
+  mem_MB += invMB*A.neighbors.size()*sizeof(int);
+  mem_MB += invMB*A.recv_length.size()*sizeof(LocalOrdinal);
+  mem_MB += invMB*A.send_length.size()*sizeof(LocalOrdinal);
+
+  double tmp = mem_MB;
+  MPI_Allreduce(&tmp, &mem_MB, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#endif
+
+  return mem_MB;
+}
+
+template<typename MatrixType>
+void rearrange_matrix_local_external(MatrixType& A)
+{
+  //This function will rearrange A so that local entries are contiguous at the front
+  //of A's memory, and external entries are contiguous at the back of A's memory.
+  //
+  //A.row_offsets will describe where the local entries occur, and
+  //A.row_offsets_external will describe where the external entries occur.
+
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  size_t nrows = A.rows.size();
+  std::vector<LocalOrdinal> tmp_row_offsets(nrows*2);
+  std::vector<LocalOrdinal> tmp_row_offsets_external(nrows*2);
+
+  LocalOrdinal num_local_nz = 0;
+  LocalOrdinal num_extern_nz = 0;
+
+  //First sort within each row of A, so that local entries come
+  //before external entries within each row.
+  //tmp_row_offsets describe the locations of the local entries, and
+  //tmp_row_offsets_external describe the locations of the external entries.
+  //
+  for(size_t i=0; i<nrows; ++i) {
+    GlobalOrdinal* row_begin = &A.packed_cols[A.row_offsets[i]];
+    GlobalOrdinal* row_end = &A.packed_cols[A.row_offsets[i+1]];
+
+    Scalar* coef_row_begin = &A.packed_coefs[A.row_offsets[i]];
+
+    tmp_row_offsets[i*2] = A.row_offsets[i];
+    tmp_row_offsets[i*2+1] = A.row_offsets[i+1];
+    tmp_row_offsets_external[i*2] = A.row_offsets[i+1];
+    tmp_row_offsets_external[i*2+1] = A.row_offsets[i+1];
+
+    ptrdiff_t row_len = row_end - row_begin;
+
+    sort_with_companions(row_len, row_begin, coef_row_begin);
+
+    GlobalOrdinal* row_iter = std::lower_bound(row_begin, row_end, nrows);
+
+    LocalOrdinal offset = A.row_offsets[i] + row_iter-row_begin;
+    tmp_row_offsets[i*2+1] = offset;
+    tmp_row_offsets_external[i*2] = offset;
+
+    num_local_nz += tmp_row_offsets[i*2+1]-tmp_row_offsets[i*2];
+    num_extern_nz += tmp_row_offsets_external[i*2+1]-tmp_row_offsets_external[i*2];
+  }
+
+  //Next, copy the external entries into separate arrays.
+
+  std::vector<GlobalOrdinal> ext_cols(num_extern_nz);
+  std::vector<Scalar> ext_coefs(num_extern_nz);
+  std::vector<LocalOrdinal> ext_offsets(nrows+1);
+  LocalOrdinal offset = 0;
+  for(size_t i=0; i<nrows; ++i) {
+    ext_offsets[i] = offset;
+    for(LocalOrdinal j=tmp_row_offsets_external[i*2];
+                     j<tmp_row_offsets_external[i*2+1]; ++j) {
+      ext_cols[offset] = A.packed_cols[j];
+      ext_coefs[offset++] = A.packed_coefs[j];
+    }
+  }
+  ext_offsets[nrows] = offset;
+
+  //Now slide all local entries down to the beginning of A's packed arrays
+
+  A.row_offsets.resize(nrows+1);
+  offset = 0;
+  for(size_t i=0; i<nrows; ++i) {
+    A.row_offsets[i] = offset;
+    for(LocalOrdinal j=tmp_row_offsets[i*2]; j<tmp_row_offsets[i*2+1]; ++j) {
+      A.packed_cols[offset] = A.packed_cols[j];
+      A.packed_coefs[offset++] = A.packed_coefs[j];
+    }
+  }
+  A.row_offsets[nrows] = offset;
+
+  //Finally, copy the external entries back into A.packed_cols and
+  //A.packed_coefs, starting at the end of the local entries.
+
+  for(LocalOrdinal i=offset; i<offset+ext_cols.size(); ++i) {
+    A.packed_cols[i] = ext_cols[i-offset];
+    A.packed_coefs[i] = ext_coefs[i-offset];
+  }
+
+  A.row_offsets_external.resize(nrows+1);
+  for(size_t i=0; i<=nrows; ++i) A.row_offsets_external[i] = ext_offsets[i] + offset;
+}
+
+//------------------------------------------------------------------------
+template<typename MatrixType>
+void
+zero_row_and_put_1_on_diagonal(MatrixType& A, typename MatrixType::GlobalOrdinalType row)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  size_t row_len = 0;
+  GlobalOrdinal* cols = NULL;
+  Scalar* coefs = NULL;
+  A.get_row_pointers(row, row_len, cols, coefs);
+  
+  for(size_t i=0; i<row_len; ++i) {
+    if (cols[i] == row) coefs[i] = 1;
+    else coefs[i] = 0;
+  }
+}
+
+//------------------------------------------------------------------------
+template<typename MatrixType,
+         typename VectorType>
+void
+impose_dirichlet(typename MatrixType::ScalarType prescribed_value,
+                    MatrixType& A,
+                    VectorType& b,
+                    int global_nx,
+                    int global_ny,
+                    int global_nz,
+                    const std::set<typename MatrixType::GlobalOrdinalType>& bc_rows)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  GlobalOrdinal first_local_row = A.rows.size()>0 ? A.rows[0] : 0;
+  GlobalOrdinal last_local_row  = A.rows.size()>0 ? A.rows[A.rows.size()-1] : -1;
+
+  typename std::set<GlobalOrdinal>::const_iterator
+    bc_iter = bc_rows.begin(), bc_end = bc_rows.end();
+  for(; bc_iter!=bc_end; ++bc_iter) {
+    GlobalOrdinal row = *bc_iter;
+    if (row >= first_local_row && row <= last_local_row) {
+      size_t local_row = row - first_local_row;
+      b.coefs[local_row] = prescribed_value;
+      zero_row_and_put_1_on_diagonal(A, row);
+    }
+  }
+
+  for(size_t i=0; i<A.rows.size(); ++i) {
+    GlobalOrdinal row = A.rows[i];
+
+    if (bc_rows.find(row) != bc_rows.end()) continue;
+
+    size_t row_length = 0;
+    GlobalOrdinal* cols = NULL;
+    Scalar* coefs = NULL;
+    A.get_row_pointers(row, row_length, cols, coefs);
+
+    Scalar sum = 0;
+    for(size_t j=0; j<row_length; ++j) {
+      if (bc_rows.find(cols[j]) != bc_rows.end()) {
+        sum += coefs[j];
+        coefs[j] = 0;
+      }
+    }
+
+    b.coefs[i] -= sum*prescribed_value;
+  }
+}
+
+static timer_type exchtime = 0;
+
+//------------------------------------------------------------------------
+//Compute matrix vector product y = A*x and return dot(x,y), where:
+//
+// A - input matrix
+// x - input vector
+// y - result vector
+//
+template<typename MatrixType,
+         typename VectorType>
+typename TypeTraits<typename VectorType::ScalarType>::magnitude_type
+matvec_and_dot(MatrixType& A,
+               VectorType& x,
+               VectorType& y)
+{
+  timer_type t0 = mytimer();
+  exchange_externals(A, x);
+  exchtime += mytimer()-t0;
+
+  typedef typename TypeTraits<typename VectorType::ScalarType>::magnitude_type magnitude;
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+
+  int n = A.rows.size();
+  const LocalOrdinalType* Arowoffsets = &A.row_offsets[0];
+  const GlobalOrdinalType* Acols      = &A.packed_cols[0];
+  const ScalarType* Acoefs            = &A.packed_coefs[0];
+  const ScalarType* xcoefs = &x.coefs[0];
+        ScalarType* ycoefs = &y.coefs[0];
+  ScalarType beta = 0;
+
+  magnitude result = 0;
+
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];
+
+    for(LocalOrdinalType i=Arowoffsets[row]; i<Arowoffsets[row+1]; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    ycoefs[row] = sum;
+    result += xcoefs[row]*sum;
+  }
+
+#ifdef HAVE_MPI
+  magnitude local_dot = result, global_dot = 0;
+  MPI_Datatype mpi_dtype = TypeTraits<magnitude>::mpi_type();  
+  MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD);
+  return global_dot;
+#else
+  return result;
+#endif
+}
+
+//------------------------------------------------------------------------
+//Compute matrix vector product y = A*x where:
+//
+// A - input matrix
+// x - input vector
+// y - result vector
+//
+#if defined(MINIFE_CSR_MATRIX)
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_std {
+void operator()(MatrixType& A,
+            VectorType& x,
+            VectorType& y)
+{
+  exchange_externals(A, x);
+
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+
+  int n = A.rows.size();
+  const LocalOrdinalType* Arowoffsets = &A.row_offsets[0];
+  const GlobalOrdinalType* Acols      = &A.packed_cols[0];
+  const ScalarType* Acoefs            = &A.packed_coefs[0];
+  const ScalarType* xcoefs = &x.coefs[0];
+        ScalarType* ycoefs = &y.coefs[0];
+  ScalarType beta = 0;
+
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];
+
+    for(LocalOrdinalType i=Arowoffsets[row]; i<Arowoffsets[row+1]; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    //std::cout << "row[" << row << "] = " << sum << std::endl;
+    ycoefs[row] = sum;
+  }
+}
+};
+#elif defined(MINIFE_ELL_MATRIX)
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_std {
+void operator()(MatrixType& A,
+            VectorType& x,
+            VectorType& y)
+{
+  exchange_externals(A, x);
+
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+
+  int row_len = A.num_cols_per_row;
+  int n = A.rows.size();
+  const GlobalOrdinalType* Acols      = &A.cols[0];
+  const ScalarType* Acoefs            = &A.coefs[0];
+  const ScalarType* xcoefs = &x.coefs[0];
+        ScalarType* ycoefs = &y.coefs[0];
+  ScalarType beta = 0;
+
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];
+
+    int row_start=row*row_len;
+    int row_end=row_start+row_len;
+    for(LocalOrdinalType i=row_start; i<row_end; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    ycoefs[row] = sum;
+  }
+}
+};
+#endif
+
+template<typename MatrixType,
+         typename VectorType>
+void matvec(MatrixType& A, VectorType& x, VectorType& y)
+{
+  matvec_std<MatrixType,VectorType> mv;
+  mv(A, x, y);
+}
+
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_overlap {
+void operator()(MatrixType& A,
+                    VectorType& x,
+                    VectorType& y)
+{
+#ifdef HAVE_MPI
+  begin_exchange_externals(A, x);
+#endif
+
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+
+
+  int n = A.rows.size();
+  const LocalOrdinalType* Arowoffsets = &A.row_offsets[0];
+  const GlobalOrdinalType* Acols      = &A.packed_cols[0];
+  const ScalarType* Acoefs            = &A.packed_coefs[0];
+  const ScalarType* xcoefs = &x.coefs[0];
+        ScalarType* ycoefs = &y.coefs[0];
+  ScalarType beta = 0;
+
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];
+
+    for(LocalOrdinalType i=Arowoffsets[row]; i<Arowoffsets[row+1]; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    ycoefs[row] = sum;
+  }
+
+#ifdef HAVE_MPI
+  finish_exchange_externals(A.neighbors.size());
+
+  Arowoffsets = &A.row_offsets_external[0];
+  beta = 1;
+
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];
+
+    for(LocalOrdinalType i=Arowoffsets[row]; i<Arowoffsets[row+1]; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    ycoefs[row] = sum;
+  }
+#endif
+}
+};
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/TypeTraits.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/TypeTraits.hpp
@@ -0,0 +1,136 @@
+#ifndef _TypeTraits_hpp_
+#define _TypeTraits_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <complex>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename T> struct TypeTraits {};
+
+template<>
+struct TypeTraits<float> {
+  typedef float magnitude_type;
+
+  static const char* name() {return "float";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_FLOAT;}
+#endif
+};
+
+template<>
+struct TypeTraits<double> {
+  typedef double magnitude_type;
+
+  static const char* name() {return "double";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_DOUBLE;}
+#endif
+};
+
+template<>
+struct TypeTraits<int> {
+  typedef int magnitude_type;
+
+  static const char* name() {return "int";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_INT;}
+#endif
+};
+
+template<>
+struct TypeTraits<long int> {
+  typedef long int magnitude_type;
+
+  static const char* name() {return "long int";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_LONG;}
+#endif
+};
+
+#ifndef MINIFE_NO_LONG_LONG
+
+template<>
+struct TypeTraits<long long> {
+  typedef long long magnitude_type;
+
+  static const char* name() {return "long long";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_LONG_LONG;}
+#endif
+};
+
+#endif
+
+template<>
+struct TypeTraits<unsigned> {
+  typedef unsigned magnitude_type;
+
+  static const char* name() {return "unsigned";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_UNSIGNED;}
+#endif
+};
+
+template<>
+struct TypeTraits<std::complex<float> > {
+  typedef float magnitude_type;
+
+  static const char* name() {return "std::complex<float>";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_COMPLEX;}
+#endif
+};
+
+template<>
+struct TypeTraits<std::complex<double> > {
+  typedef double magnitude_type;
+
+  static const char* name() {return "std::complex<double>";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_DOUBLE_COMPLEX;}
+#endif
+};
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Vector.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Vector.hpp
@@ -0,0 +1,67 @@
+#ifndef _Vector_hpp_
+#define _Vector_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+
+namespace miniFE {
+
+
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal>
+struct Vector {
+  typedef Scalar ScalarType;
+  typedef LocalOrdinal LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+
+  Vector(GlobalOrdinal startIdx, LocalOrdinal local_sz)
+   : startIndex(startIdx),
+     local_size(local_sz),
+     coefs(local_size)
+  {
+    for(size_t i=0; i < local_size; ++i) {
+    	coefs[i] = 0;
+    }
+  }
+
+  ~Vector()
+  {
+  }
+
+  GlobalOrdinal startIndex;
+  LocalOrdinal local_size;
+  std::vector<Scalar> coefs;
+};
+
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Vector_functions.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/Vector_functions.hpp
@@ -0,0 +1,231 @@
+#ifndef _Vector_functions_hpp_
+#define _Vector_functions_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+#include <sstream>
+#include <fstream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#ifdef MINIFE_HAVE_TBB
+#include <LockingVector.hpp>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Vector.hpp>
+
+namespace miniFE {
+
+
+template<typename VectorType>
+void write_vector(const std::string& filename,
+                  const VectorType& vec)
+{
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  std::ostringstream osstr;
+  osstr << filename << "." << numprocs << "." << myproc;
+  std::string full_name = osstr.str();
+  std::ofstream ofs(full_name.c_str());
+
+  typedef typename VectorType::ScalarType ScalarType;
+
+  const std::vector<ScalarType>& coefs = vec.coefs;
+  for(int p=0; p<numprocs; ++p) {
+    if (p == myproc) {
+      if (p == 0) {
+        ofs << vec.local_size << std::endl;
+      }
+  
+      typename VectorType::GlobalOrdinalType first = vec.startIndex;
+      for(size_t i=0; i<vec.local_size; ++i) {
+        ofs << first+i << " " << coefs[i] << std::endl;
+      }
+    }
+#ifdef HAVE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  }
+}
+
+template<typename VectorType>
+void sum_into_vector(size_t num_indices,
+                     const typename VectorType::GlobalOrdinalType* indices,
+                     const typename VectorType::ScalarType* coefs,
+                     VectorType& vec)
+{
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename VectorType::ScalarType Scalar;
+
+  GlobalOrdinal first = vec.startIndex;
+  GlobalOrdinal last = first + vec.local_size - 1;
+
+  std::vector<Scalar>& vec_coefs = vec.coefs;
+
+  for(size_t i=0; i<num_indices; ++i) {
+    if (indices[i] < first || indices[i] > last) continue;
+    size_t idx = indices[i] - first;
+    vec_coefs[idx] += coefs[i];
+  }
+}
+
+#ifdef MINIFE_HAVE_TBB
+template<typename VectorType>
+void sum_into_vector(size_t num_indices,
+                     const typename VectorType::GlobalOrdinalType* indices,
+                     const typename VectorType::ScalarType* coefs,
+                     LockingVector<VectorType>& vec)
+{
+  vec.sum_in(num_indices, indices, coefs);
+}
+#endif
+
+//------------------------------------------------------------
+//Compute the update of a vector with the sum of two scaled vectors where:
+//
+// w = alpha*x + beta*y
+//
+// x,y - input vectors
+//
+// alpha,beta - scalars applied to x and y respectively
+//
+// w - output vector
+//
+template<typename VectorType>
+void
+  waxpby(typename VectorType::ScalarType alpha, const VectorType& x,
+         typename VectorType::ScalarType beta, const VectorType& y,
+         VectorType& w)
+{
+  typedef typename VectorType::ScalarType ScalarType;
+
+#ifdef MINIFE_DEBUG
+  if (y.local_size < x.local_size || w.local_size < x.local_size) {
+    std::cerr << "miniFE::waxpby ERROR, y and w must be at least as long as x." << std::endl;
+    return;
+  }
+#endif
+
+  int n = x.coefs.size();
+  const ScalarType* xcoefs = &x.coefs[0];
+  const ScalarType* ycoefs = &y.coefs[0];
+        ScalarType* wcoefs = &w.coefs[0];
+
+  for(int i=0; i<n; ++i) {
+    wcoefs[i] = alpha*xcoefs[i] + beta*ycoefs[i];
+  }
+}
+
+//Like waxpby above, except operates on two sets of arguments.
+//In other words, performs two waxpby operations in one loop.
+template<typename VectorType>
+void
+  fused_waxpby(typename VectorType::ScalarType alpha, const VectorType& x,
+         typename VectorType::ScalarType beta, const VectorType& y,
+         VectorType& w,
+         typename VectorType::ScalarType alpha2, const VectorType& x2,
+         typename VectorType::ScalarType beta2, const VectorType& y2,
+         VectorType& w2)
+{
+  typedef typename VectorType::ScalarType ScalarType;
+
+#ifdef MINIFE_DEBUG
+  if (y.local_size < x.local_size || w.local_size < x.local_size) {
+    std::cerr << "miniFE::waxpby ERROR, y and w must be at least as long as x." << std::endl;
+    return;
+  }
+#endif
+
+  int n = x.coefs.size();
+  const ScalarType* xcoefs = &x.coefs[0];
+  const ScalarType* ycoefs = &y.coefs[0];
+        ScalarType* wcoefs = &w.coefs[0];
+
+  const ScalarType* x2coefs = &x2.coefs[0];
+  const ScalarType* y2coefs = &y2.coefs[0];
+        ScalarType* w2coefs = &w2.coefs[0];
+
+  for(int i=0; i<n; ++i) {
+    wcoefs[i] = alpha*xcoefs[i] + beta*ycoefs[i];
+    w2coefs[i] = alpha2*x2coefs[i] + beta2*y2coefs[i];
+  }
+}
+
+//-----------------------------------------------------------
+//Compute the dot product of two vectors where:
+//
+// x,y - input vectors
+//
+// result - return-value
+//
+template<typename Vector>
+typename TypeTraits<typename Vector::ScalarType>::magnitude_type
+  dot(const Vector& x,
+      const Vector& y)
+{
+  int n = x.coefs.size();
+
+#ifdef MINIFE_DEBUG
+  if (y.local_size < n) {
+    std::cerr << "miniFE::dot ERROR, y must be at least as long as x."<<std::endl;
+    n = y.local_size;
+  }
+#endif
+
+  typedef typename Vector::ScalarType Scalar;
+  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
+
+  const Scalar* xcoefs = &x.coefs[0];
+  const Scalar* ycoefs = &y.coefs[0];
+  magnitude result = 0;
+  for(int i=0; i<n; ++i) {
+    result += xcoefs[i]*ycoefs[i];
+  }
+
+#ifdef HAVE_MPI
+  magnitude local_dot = result, global_dot = 0;
+  MPI_Datatype mpi_dtype = TypeTraits<magnitude>::mpi_type();  
+  MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD);
+  return global_dot;
+#else
+  return result;
+#endif
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Doc.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Doc.hpp
@@ -0,0 +1,122 @@
+//@HEADER
+// ************************************************************************
+// 
+//               Mantevo: A collection of mini-applications for HPC
+//                 Copyright (2008) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+// Changelog
+//
+// Version 0.1
+// - Initial version.
+//
+/////////////////////////////////////////////////////////////////////////
+
+#ifndef YAML_DOC_H
+#define YAML_DOC_H
+#include <string>
+#include <vector>
+#include "YAML_Element.hpp"
+
+//! The Mantevo YAML_Doc class for the uniform collecting and reporting of performance data for mini-applications
+
+/*!
+
+The YAML_Doc class works in conjuction with the YAML_Element class to facilitate easy collecting and reporting of YAML-formatted
+data that can be then registered with the Mantevo results collection website.
+
+\code
+
+//EXAMPLE CODE FOR GENERATING YAML
+
+  YAML_Doc doc("hpccg","1.0");
+  doc.add("final_residual",1.4523e-13);
+  doc.add("time","4.893"); 
+ 
+//note: the following line will remove the data (4.890) associated with "time"
+  doc.get("time")->add("total",4.243);
+
+//note:  the following line will likewise remove the data (1.243) associated with "time"
+  doc.get("time")->get("total")->add("time",2.457);
+  doc.get("time")->get("total")->add("flops",4.88e5);
+  doc.get("time")->add("ddot",1.243);
+  doc.get("time")->add("sparsemv","");
+  doc.get("time")->get("sparsemv")->add("time",0.3445);
+  doc.get("time")->get("sparsemv")->add("overhead","");
+  doc.get("time")->get("sparsemv")->get("overhead")->add("time",0.0123);
+  doc.get("time")->get("sparsemv")->get("overhead")->add("percentage",0.034);
+  cout << doc.generateYAML() << endl; 
+  return 0;
+
+\endcode
+
+Below is the output generated by the above code:
+
+\verbatim
+
+final_residual: 1.4523e-13
+time: 
+  total:
+    time: 2.457
+    flops: 4.88e5
+  ddot: 1.243
+  sparsemv:
+    time: 0.3445
+    overhead:
+      time: 0.0123
+      percentage: 0.034
+
+\endverbatim
+
+\note {No value is allowed to be attached to a key that has children.  If children are added to a key, the value is simply set to "".}
+
+*/
+class YAML_Doc: public YAML_Element {
+  public:
+  //! Constructor: accepts mini-application name and version as strings, optionally accepts directory and file name for printing results.
+  /*!
+    The sole constructor for this class accepts and name and version number for the mini-application as well as optional directory 
+    and file name information for results that are generated by the generateYAML() method.
+    \param miniApp_Name (in) string containing name of the mini-application
+    \param miniApp_Version (in) string containing the version of the mini-application
+    \param destination_Directory (in, optional) path of diretory where results file will be stored, relative to current working directory. 
+           If this value is not supplied, the results file will be stored in the current working directory.  If the directory does not exist
+	   it will be created.
+    \param destination_FileName (in, optional) root name of the results file.  A suffix of ".yaml" will be automatically appended.  If no
+           file name is specified the filename will be constructed by concatenating the miniAppName + miniAppVersion + ".yaml" strings.
+  */
+  YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version, const std::string& destination_Directory = "", const std::string& destination_FileName = "");
+  //! Destructor
+  ~YAML_Doc();
+  //! Generate YAML results to standard out and to a file using specified directory and filename, using current directory and miniAppName + miniAppVersion + ".yaml" by default
+  std::string generateYAML();
+
+protected:
+  std::string miniAppName;
+  std::string miniAppVersion;
+  std::string destinationDirectory;
+  std::string destinationFileName;
+};
+#endif /* YAML_DOC_H */
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Doc.cpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Doc.cpp
@@ -0,0 +1,77 @@
+#include <ctime>
+#include <cstdlib>
+#include <ctime>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#ifdef REDSTORM
+#include <time.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif
+#include "YAML_Doc.hpp"
+using namespace std;
+
+//set the microapp_name and version which will become part of the YAML doc.
+YAML_Doc::YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version, const std::string& destination_Directory, const std::string& destination_FileName){
+  miniAppName = miniApp_Name;
+  miniAppVersion = miniApp_Version;
+  destinationDirectory = destination_Directory;
+  destinationFileName = destination_FileName;
+}
+
+//inherits the destructor from YAML_Element
+YAML_Doc::~YAML_Doc(void){
+}
+
+/*
+* generates YAML from the elements of the document and saves it
+* to a file
+*/
+string YAML_Doc::generateYAML(){
+  string yaml;
+  yaml =  yaml + "Mini-Application Name: " + miniAppName + "\n";
+  yaml =  yaml + "Mini-Application Version: " + miniAppVersion + "\n";
+  for(size_t i=0; i<children.size(); i++){
+    yaml = yaml + children[i]->printYAML("");
+  }
+  
+  time_t rawtime;
+  tm * ptm;
+  time ( &rawtime );
+  ptm = localtime(&rawtime);
+  char sdate[25];
+  //use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
+  sprintf (sdate,"%04d:%02d:%02d-%02d:%02d:%02d",ptm->tm_year + 1900, ptm->tm_mon+1,
+    ptm->tm_mday, ptm->tm_hour, ptm->tm_min,ptm->tm_sec);
+
+  string filename;
+  if (destinationFileName=="") 
+    filename = miniAppName + "-" + miniAppVersion + "_";
+  else 
+    filename = destinationFileName;
+  filename = filename + string(sdate) + ".yaml";
+  if (destinationDirectory!="" && destinationDirectory!=".") {
+    string mkdir_cmd = "mkdir " + destinationDirectory;
+#ifdef REDSTORM
+    mkdir(destinationDirectory.c_str(),0755);
+#else
+    system(mkdir_cmd.c_str());
+#endif
+    filename = destinationDirectory + "/" + destinationFileName;
+  }
+  else 
+    filename = "./" + filename;
+
+#ifdef GENERATE_YAML
+  ofstream myfile;
+  myfile.open(filename.c_str());
+  myfile << yaml;
+  myfile.close();
+#endif
+
+  return yaml;
+}
+
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Element.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Element.hpp
@@ -0,0 +1,86 @@
+//@HEADER
+// ************************************************************************
+// 
+//               Mantevo: A collection of mini-applications for HPC
+//                 Copyright (2008) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+// Changelog
+//
+// Version 0.1
+// - Initial version.
+//
+/////////////////////////////////////////////////////////////////////////
+
+#ifndef YAML_ELEMENT_H
+#define YAML_ELEMENT_H
+#include <string>
+#include <vector>
+//! The Mantevo YAML_Element class for registering key-value pairs of performance data
+
+/*!
+  Mantevo mini-applications generate a collection of performance data for each run of the executable.  YAML_Element, and
+  the related YAML_Doc class, provide a uniform facility for gathering and reporting this data using the YAML text format.
+*/
+class YAML_Element {
+  public:
+
+  //! Default constructor.
+  YAML_Element (){key="";value="";}
+  //! Construct with known key-value pair
+  YAML_Element (const std::string& key_arg, const std::string& value_arg);
+  //! Destructor
+  ~YAML_Element ();
+  //! Key accessor method
+  std::string getKey(){return key;}
+  //! Add a child element to an element list associated with this element, value of type double
+  YAML_Element* add(const std::string& key_arg, double value_arg);
+  //! Add a child element to an element list associated with this element, value of type int
+  YAML_Element* add(const std::string& key_arg, int value_arg);
+#ifndef MINIFE_NO_LONG_LONG
+  //! Add a child element to an element list associated with this element, value of type long long
+  YAML_Element* add(const std::string& key_arg, long long value_arg);
+#endif
+  //! Add a child element to an element list associated with this element, value of type size_t
+  YAML_Element* add(const std::string& key_arg, size_t value_arg);
+  //! Add a child element to an element list associated with this element, value of type string
+  YAML_Element* add(const std::string& key_arg, const std::string& value_arg);
+  //! get the element in the list with the given key
+  YAML_Element* get(const std::string& key_arg);
+  std::string printYAML(std::string space);
+  
+protected:
+  std::string key;
+  std::string value;
+  std::vector<YAML_Element*> children;
+
+private:
+  std::string convert_double_to_string(double value_arg);
+  std::string convert_int_to_string(int value_arg);
+#ifndef MINIFE_NO_LONG_LONG
+  std::string convert_long_long_to_string(long long value_arg);
+#endif
+  std::string convert_size_t_to_string(size_t value_arg);
+};
+#endif /* YAML_ELEMENT_H */
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Element.cpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/YAML_Element.cpp
@@ -0,0 +1,120 @@
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include "YAML_Element.hpp"
+using namespace std;
+YAML_Element::YAML_Element(const std::string& key_arg, const std::string& value_arg){
+  key = key_arg;
+  value = value_arg;
+}
+
+YAML_Element::~YAML_Element(){
+  for (size_t i=0; i<children.size(); i++) {
+    delete children[i];
+  }
+  children.clear();
+}
+
+/*
+* Add an element to the vector
+* QUESTION: if an element is not added because the key already exists,
+* will this lead to memory leakage?
+*/
+YAML_Element* YAML_Element::add(const std::string& key_arg, double value_arg) {
+  this->value = "";
+  string converted_value = convert_double_to_string(value_arg);
+  YAML_Element* element = new YAML_Element(key_arg,converted_value);
+  children.push_back(element);
+  return element;
+}
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, int value_arg) {
+  this->value = "";
+  string converted_value = convert_int_to_string(value_arg);
+  YAML_Element* element = new YAML_Element(key_arg,converted_value);
+  children.push_back(element);
+  return element;
+}
+
+#ifndef MINIFE_NO_LONG_LONG
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, long long value_arg) {
+  this->value = "";
+  string converted_value = convert_long_long_to_string(value_arg);
+  YAML_Element* element = new YAML_Element(key_arg,converted_value);
+  children.push_back(element);
+  return element;
+}
+
+#endif
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, size_t value_arg) {
+  this->value = "";
+  string converted_value = convert_size_t_to_string(value_arg);
+  YAML_Element* element = new YAML_Element(key_arg,converted_value);
+  children.push_back(element);
+  return element;
+}
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, const std::string& value_arg) {
+  this->value = "";
+  YAML_Element* element = new YAML_Element(key_arg, value_arg);
+  children.push_back(element);
+  return element;
+}
+
+/*
+* returns pointer to the YAML_Element for the given key.
+* I, cam, believe an exception should be thrown if there is no
+* element in the vector for the specified key
+*/
+YAML_Element* YAML_Element::get(const std::string& key_arg) {
+  for (size_t i=0; i<children.size(); i++) {
+    if(children[i]->getKey() == key_arg){
+      return children[i];
+    }
+  }
+  return 0;
+}
+
+/*
+* prints a line of a YAML document.  Correct YAML depends on
+* correct spacing; the parameter space should be the proper
+* amount of space for the parent element
+*/
+string YAML_Element::printYAML(std::string space){
+  string yaml_line = space + key + ": " + value + "\n";
+  for(int i=0; i<2; i++) space = space + " ";
+  for(size_t i=0; i<children.size(); i++){
+    yaml_line = yaml_line + children[i]->printYAML(space);
+  }
+  return yaml_line;
+}
+
+string YAML_Element::convert_double_to_string(double value_arg){
+  stringstream strm;
+  strm << value_arg;
+  return strm.str();
+}
+string YAML_Element::convert_int_to_string(int value_arg){
+  stringstream strm;
+  strm << value_arg;
+  return strm.str();
+}
+
+#ifndef MINIFE_NO_LONG_LONG
+
+string YAML_Element::convert_long_long_to_string(long long value_arg){
+  stringstream strm;
+  strm << value_arg;
+  return strm.str();
+}
+
+#endif
+
+string YAML_Element::convert_size_t_to_string(size_t value_arg){
+  stringstream strm;
+  strm << value_arg;
+  return strm.str();
+}
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/analytic_soln.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/analytic_soln.hpp
@@ -0,0 +1,116 @@
+#ifndef _analytic_soln_hpp_
+#define _analytic_soln_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+
+#ifndef MINIFE_SCALAR
+#define MINIFE_SCALAR double;
+#endif
+
+namespace miniFE {
+
+typedef MINIFE_SCALAR Scalar;
+
+// The 'soln' function below computes the analytic solution for
+// steady state temperature in a brick-shaped domain (formally called
+// a rectangular parallelepiped). The inputs to the function are
+// the x,y,z coordinates of the point at which temperature is to be
+// computed, and the number of terms p,q in the series expansion.
+//
+// The equations used for the temperature solution are equations 9 and 10
+// in section 6.2 of Carslaw & Jaeger, "Conduction of Heat in Solids".
+//
+// The paralellepiped being used is defined by this domain:
+// 0 <= x <= 1.0
+// 0 <= y <= 1.0
+// 0 <= z <= 1.0
+//
+// With boundary conditions prescribing the temperature to be 1.0 on
+// the x==1.0 face, and 0.0 on all other faces.
+//
+// Thus, in the equations from Carslaw & Jaeger, the following constants
+// are used:
+//
+// a == b == c == 1.0  (the extents of the domain)
+// v1 == 0.0           (temperature at x == 0.0)
+// v2 == 1.0           (temperature at x == 1.0)
+//
+
+const Scalar PI = 3.141592653589793238462;
+const Scalar PI_SQR = PI*PI;
+const Scalar term0 = 16.0/(PI_SQR);
+
+inline Scalar fcn_l(int p, int q)
+{
+  return std::sqrt((2*p+1)*(2*p+1)*PI_SQR + (2*q+1)*(2*q+1)*PI_SQR);
+}
+
+inline Scalar fcn(int n, Scalar u)
+{
+  return (2*n+1)*PI*u;
+}
+
+inline Scalar soln(Scalar x, Scalar y, Scalar z, int max_p, int max_q)
+{
+  Scalar sum = 0;
+  for(int p=0; p<=max_p; ++p) {
+    const Scalar p21y = fcn(p, y);
+    const Scalar sin_py = std::sin(p21y)/(2*p+1);
+    for(int q=0; q<=max_q; ++q) {
+      const Scalar q21z = fcn(q, z);
+      const Scalar sin_qz = std::sin(q21z)/(2*q+1);
+
+      const Scalar l = fcn_l(p, q);
+
+      const Scalar sinh1 = std::sinh(l*x);
+      const Scalar sinh2 = std::sinh(l);
+
+      const Scalar tmp = (sinh1*sin_py)*(sin_qz/sinh2);
+
+      //if the scalar l gets too big, sinh(l) becomes inf.
+      //if that happens, tmp is a NaN.
+      //crude check for NaN:
+      //if tmp != tmp, tmp is NaN
+      if (tmp == tmp) {
+        sum += tmp;
+      }
+      else {
+        //if we got a NaN, break out of this inner loop and go to
+        //the next iteration of the outer loop.
+        break;
+      }
+    }
+  }
+  return term0*sum;
+}
+
+}//namespace miniFE
+
+#endif /* _analytic_soln_hpp_ */
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/assemble_FE_data.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/assemble_FE_data.hpp
@@ -0,0 +1,78 @@
+#ifndef _assemble_FE_data_hpp_
+#define _assemble_FE_data_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <box_utils.hpp>
+#include <simple_mesh_description.hpp>
+
+#include <perform_element_loop.hpp>
+
+namespace miniFE {
+
+template<typename MatrixType,
+         typename VectorType>
+void
+assemble_FE_data(const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh,
+                 MatrixType& A,
+                 VectorType& b,
+                 Parameters& params)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+
+  Box local_elem_box;
+  copy_box(mesh.local_box, local_elem_box);
+
+  if (get_num_ids<GlobalOrdinal>(local_elem_box) < 1) {
+    return;
+  }
+
+  //
+  //We want the element-loop to loop over our (processor-local) domain plus a
+  //ghost layer, so we can assemble the complete linear-system without doing
+  //any communication.
+  //
+  int ghost = 1;
+  if (local_elem_box[0][0] > 0) local_elem_box[0][0] -= ghost;
+  if (local_elem_box[1][0] > 0) local_elem_box[1][0] -= ghost;
+  if (local_elem_box[2][0] > 0) local_elem_box[2][0] -= ghost;
+  if (local_elem_box[0][1] < global_elems_x) local_elem_box[0][1] += ghost;
+  if (local_elem_box[1][1] < global_elems_y) local_elem_box[1][1] += ghost;
+  if (local_elem_box[2][1] < global_elems_z) local_elem_box[2][1] += ghost;
+
+  perform_element_loop(mesh, local_elem_box, A, b, params);
+}
+                      
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/box_utils.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/box_utils.hpp
@@ -0,0 +1,320 @@
+#ifndef _box_utils_hpp_
+#define _box_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+#include <set>
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Box.hpp>
+
+namespace miniFE {
+
+inline void copy_box(const Box& from_box, Box& to_box)
+{
+  for(int i=0; i<3; ++i) {
+    to_box[i][0] = from_box[i][0];
+    to_box[i][1] = from_box[i][1];
+  }
+}
+
+template<typename GlobalOrdinal>
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+void get_int_coords(GlobalOrdinal ID, int nx, int ny, int nz,
+                int& x, int& y, int& z)
+{
+  z = ID/(nx*ny);
+  y = (ID%(nx*ny))/nx;
+  x = ID%nx;
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+void get_coords(GlobalOrdinal ID, int nx, int ny, int nz,
+                Scalar& x, Scalar& y, Scalar& z)
+{
+  const int xdiv = nx>1 ? nx-1 : 1;
+  const int ydiv = ny>1 ? ny-1 : 1;
+  const int zdiv = nz>1 ? nz-1 : 1;
+
+//This code assumes that ID is 0-based.
+//
+//compute coordinates that lie on (or in) the unit cube.
+//that's why we're dividing by nz,ny,nx:
+  z = (1.0*(ID/(nx*ny)))/zdiv;
+  y = 1.0*((ID%(nx*ny))/nx)/ydiv;
+  x = 1.0*(ID%nx)/xdiv;
+}
+
+template<typename GlobalOrdinal>
+GlobalOrdinal get_num_ids(const Box& box)
+{
+  int nx = box[0][1] - box[0][0];
+  int ny = box[1][1] - box[1][0];
+  int nz = box[2][1] - box[2][0];
+  GlobalOrdinal tmp = nx*ny;
+  tmp *= nz;
+  return tmp;
+}
+
+template<typename GlobalOrdinal>
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+GlobalOrdinal get_id(int nx, int ny, int nz,
+                     int x, int y, int z)
+{
+  if (x<0 || y<0 || z<0) return -1;
+  if (x>=nx || y>=ny || z>=nz) return -1;
+
+  //form x + nx*y + nx*ny*z:
+
+  GlobalOrdinal tmp = nx*ny;
+  tmp *= z;
+  tmp = x + nx * y + tmp;
+  return tmp;
+}
+
+template<typename GlobalOrdinal>
+void get_ids(int nx, int ny, int nz,
+             const Box& box,
+             std::vector<GlobalOrdinal>& ids,
+             bool include_ghost_layer=false)
+{
+  ids.clear();
+  int minz = box[2][0];
+  int maxz = box[2][1];
+  int miny = box[1][0];
+  int maxy = box[1][1];
+  int minx = box[0][0];
+  int maxx = box[0][1];
+
+  if (include_ghost_layer) {
+    if (minz > 0) minz--;
+    if (miny > 0) miny--;
+    if (minx > 0) minx--;
+    if (maxz < nz) maxz++;
+    if (maxy < ny) maxy++;
+    if (maxx < nx) maxx++;
+  }
+
+  size_t ids_size = ((maxz - minz) * (maxy - miny)) * (maxx - minx);
+  ids.reserve(ids_size);
+
+  for(int z=minz; z<maxz; ++z) {
+    for(int y=miny; y<maxy; ++y) {
+      for(int x=minx; x<maxx; ++x) {
+        ids.push_back(get_id<GlobalOrdinal>(nx, ny, nz, x, y, z));
+      }
+    }
+  }
+}
+
+template<typename GlobalOrdinal>
+void get_ghost_ids(int nx, int ny, int nz,
+             const Box& box,
+             std::vector<GlobalOrdinal>& ids)
+{
+  ids.clear();
+  int minz,maxz,miny,maxy,minx,maxx;
+  int orig_minz = minz = box[2][0];
+  int orig_maxz = maxz = box[2][1];
+  int orig_miny = miny = box[1][0];
+  int orig_maxy = maxy = box[1][1];
+  int orig_minx = minx = box[0][0];
+  int orig_maxx = maxx = box[0][1];
+
+  if (minz > 0) minz--;
+  if (miny > 0) miny--;
+  if (minx > 0) minx--;
+  if (maxz < nz) maxz++;
+  if (maxy < ny) maxy++;
+  if (maxx < nx) maxx++;
+
+  for(int z=minz; z<maxz; ++z) {
+    for(int y=miny; y<maxy; ++y) {
+      for(int x=minx; x<maxx; ++x) {
+        bool x_in_ghost_layer = (x < orig_minx) || (x >= orig_maxx);
+        bool y_in_ghost_layer = (y < orig_miny) || (y >= orig_maxy);
+        bool z_in_ghost_layer = (z < orig_minz) || (z >= orig_maxz);
+        //we are in the ghost layer if any one of x,y,z are in the ghost layer
+        if (!x_in_ghost_layer && !y_in_ghost_layer && !z_in_ghost_layer) continue;
+        ids.push_back(get_id<GlobalOrdinal>(nx, ny, nz, x, y, z));
+      }
+    }
+  }
+}
+
+ inline void print_box(int myproc, const char* name, const Box& box,
+                      const char* name2, const Box& box2)
+{
+  std::cout << "proc " << myproc << " "<<name
+      <<" ("<<box[0][0]<<","<<box[0][1]<<") "
+      <<" ("<<box[1][0]<<","<<box[1][1]<<") "
+      <<" ("<<box[2][0]<<","<<box[2][1]<<") "
+      <<name2
+      <<" ("<<box2[0][0]<<","<<box2[0][1]<<") "
+      <<" ("<<box2[1][0]<<","<<box2[1][1]<<") "
+      <<" ("<<box2[2][0]<<","<<box2[2][1]<<") "<<std::endl;
+}
+
+bool is_neighbor(const Box& box1, const Box& box2)
+{
+  //neighbors in the x dimension if:
+  bool x_neighbor = (box1[0][1] == box2[0][0]) || (box1[0][0] == box2[0][1]) || // min matches max
+                    (box1[0][0] == box2[0][0]) || (box1[0][1] == box2[0][1]) || // mins or maxs match
+                    (box1[0][0] >  box2[0][0]  &&  box1[0][1] <  box2[0][1]) || // range contains other
+                    (box2[0][0] >  box1[0][0]  &&  box2[0][1] <  box1[0][1]) || // range contains other
+                    (box1[0][0] >  box2[0][0]  &&  box1[0][0] <  box2[0][1]) || // min contained in rng
+                    (box2[0][0] >  box1[0][0]  &&  box2[0][0] <  box1[0][1]);   // min contained in rng
+  if (!x_neighbor) {
+    x_neighbor = (box1[0][1] == box2[0][0]-1) || (box1[0][0] == box2[0][1]+1);
+  }
+
+  bool y_neighbor = (box1[1][1] == box2[1][0]) || (box1[1][0] == box2[1][1]) || // min matches max
+                    (box1[1][0] == box2[1][0]) || (box1[1][1] == box2[1][1]) || // mins or maxs match
+                    (box1[1][0] >  box2[1][0]  &&  box1[1][1] <  box2[1][1]) || // range contains other
+                    (box2[1][0] >  box1[1][0]  &&  box2[1][1] <  box1[1][1]) || // range contains other
+                    (box1[1][0] >  box2[1][0]  &&  box1[1][0] <  box2[1][1]) || // min contained in rng
+                    (box2[1][0] >  box1[1][0]  &&  box2[1][0] <  box1[1][1]);   // min contained in rng
+  if (!y_neighbor) {
+    y_neighbor = (box1[1][1] == box2[1][0]-1) || (box1[1][0] == box2[1][1]+1);
+  }
+
+  bool z_neighbor = (box1[2][1] == box2[2][0]) || (box1[2][0] == box2[2][1]) || // min matches max
+                    (box1[2][0] == box2[2][0]) || (box1[2][1] == box2[2][1]) || // mins or maxs match
+                    (box1[2][0] >  box2[2][0]  &&  box1[2][1] <  box2[2][1]) || // range contains other
+                    (box2[2][0] >  box1[2][0]  &&  box2[2][1] <  box1[2][1]) || // range contains other
+                    (box1[2][0] >  box2[2][0]  &&  box1[2][0] <  box2[2][1]) || // min contained in rng
+                    (box2[2][0] >  box1[2][0]  &&  box2[2][0] <  box1[2][1]);   // min contained in rng
+  if (!z_neighbor) {
+    z_neighbor = (box1[2][1] == box2[2][0]-1) || (box1[2][0] == box2[2][1]+1);
+  }
+
+  return x_neighbor && y_neighbor && z_neighbor;
+}
+
+template<typename GlobalOrdinal>
+void create_map_id_to_row(int global_nx, int global_ny, int global_nz,
+                     const Box& box,
+                     std::map<GlobalOrdinal,GlobalOrdinal>& id_to_row)
+{
+  GlobalOrdinal num_my_ids = get_num_ids<GlobalOrdinal>(box);
+
+  typename std::vector<GlobalOrdinal> all_ids;
+  bool include_ghost_layer = false;
+  get_ids(global_nx, global_ny, global_nz, box, all_ids, include_ghost_layer);
+
+  GlobalOrdinal my_first_row = 0;
+  typename std::vector<GlobalOrdinal> global_offsets;
+  std::vector<int> all_boxes;
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+
+  GlobalOrdinal local_num_ids = num_my_ids;
+  global_offsets.resize(numprocs);
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allgather(&local_num_ids, 1, mpi_dtype, &global_offsets[0], 1, mpi_dtype, MPI_COMM_WORLD);
+  GlobalOrdinal offset = 0;
+  for(int i=0; i<numprocs; ++i) {
+    GlobalOrdinal tmp = global_offsets[i];
+    global_offsets[i] = offset;
+    offset += tmp;
+  }
+
+  my_first_row = global_offsets[myproc];
+
+  all_boxes.resize(6*numprocs);
+  int* local_box_ranges = const_cast<int*>(&box.ranges[0]);
+  MPI_Allgather(local_box_ranges, 6, MPI_INT, &all_boxes[0], 6, MPI_INT, MPI_COMM_WORLD);
+#endif
+
+  if (all_ids.size() > 0) {
+    id_to_row.insert(std::make_pair(all_ids[0], my_first_row));
+  }
+
+  for(size_t i=1; i<all_ids.size(); ++i) {
+    if (all_ids[i] != all_ids[i-1]+1) {
+      id_to_row.insert(std::make_pair(all_ids[i], my_first_row+i));
+    }
+  }
+
+//  int num_neighbors = 0;
+  for(int i=0; i<numprocs; ++i) {
+    if (i == myproc) continue;
+    Box box_i;
+    for(int r=0; r<6; ++r) box_i.ranges[r] = all_boxes[i*6 + r];
+//    bool neighbor= is_neighbor(box, box_i);
+//if(myproc==2) {
+//  std::cout<<"i: "<<i<<" "<<neighbor<<" ";
+//  print_box(myproc, " ", box, " ", box_i);
+//}
+    if (!is_neighbor(box, box_i)) {
+//      if (myproc==50) {
+//        std::cout<<"box ("<<box[0][0]<<","<<box[0][1]<<" - "<<box[1][0]<<","<<box[1][1]<<" - "<<box[2][0]<<","<<box[2][1]<<")"<<std::endl<<" and ("<<box_i[0][0]<<","<<box_i[0][1]<<" - "<<box_i[1][0]<<","<<box_i[1][1]<<" - "<<box_i[2][0]<<","<<box_i[2][1]<<") not neighbors."<<std::endl;
+//      }
+      continue;
+    }
+//    ++num_neighbors;
+
+    get_ids(global_nx, global_ny, global_nz, box_i, all_ids, include_ghost_layer);
+
+    GlobalOrdinal first_row = global_offsets[i];
+    if (all_ids.size() > 0) {
+      id_to_row.insert(std::make_pair(all_ids[0], first_row));
+    }
+    for(size_t j=1; j<all_ids.size(); ++j) {
+      if (all_ids[j] != all_ids[j-1]+1) {
+        id_to_row.insert(std::make_pair(all_ids[j], first_row+j));
+      }
+    }
+  }
+
+//std::cout<<"proc "<<myproc<<": num_neighbors: "<<num_neighbors<<", id_to_row.size(): "<<id_to_row.size()<<std::endl;
+//typename std::map<GlobalOrdinal,GlobalOrdinal>::iterator iter = id_to_row.begin(), end = id_to_row.end();
+//for(; iter!=end; ++iter) {
+//  std::cout<<"proc "<<myproc<<": "<<iter->first<<" :: "<<iter->second<<std::endl;
+//}
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/cg_solve.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/cg_solve.hpp
@@ -0,0 +1,234 @@
+#ifndef _cg_solve_hpp_
+#define _cg_solve_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+#include <limits>
+
+#include <Vector_functions.hpp>
+#include <mytimer.hpp>
+
+#include <outstream.hpp>
+
+namespace miniFE {
+
+template<typename Scalar>
+void print_vec(const std::vector<Scalar>& vec, const std::string& name)
+{
+  for(size_t i=0; i<vec.size(); ++i) {
+    std::cout << name << "["<<i<<"]: " << vec[i] << std::endl;
+  }
+}
+
+template<typename VectorType>
+bool breakdown(typename VectorType::ScalarType inner,
+               const VectorType& v,
+               const VectorType& w)
+{
+  typedef typename VectorType::ScalarType Scalar;
+  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
+
+//This is code that was copied from Aztec, and originally written
+//by my hero, Ray Tuminaro.
+//
+//Assuming that inner = <v,w> (inner product of v and w),
+//v and w are considered orthogonal if
+//  |inner| < 100 * ||v||_2 * ||w||_2 * epsilon
+
+  magnitude vnorm = std::sqrt(dot(v,v));
+  magnitude wnorm = std::sqrt(dot(w,w));
+  return std::abs(inner) <= 100*vnorm*wnorm*std::numeric_limits<magnitude>::epsilon();
+}
+
+template<typename OperatorType,
+         typename VectorType,
+         typename Matvec>
+void
+cg_solve(OperatorType& A,
+         const VectorType& b,
+         VectorType& x,
+         Matvec matvec,
+         typename OperatorType::LocalOrdinalType max_iter,
+         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
+         typename OperatorType::LocalOrdinalType& num_iters,
+         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
+         timer_type* my_cg_times)
+{
+  typedef typename OperatorType::ScalarType ScalarType;
+  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
+  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;
+
+  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
+  timer_type total_time = mytimer();
+
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#ifdef USE_MPI_PCONTROL
+  MPI_Pcontrol(1);
+#endif
+#endif
+
+  if (!A.has_local_indices) {
+    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
+       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
+       << std::endl;
+    return;
+  }
+
+  size_t nrows = A.rows.size();
+  LocalOrdinalType ncols = A.num_cols;
+
+  VectorType r(b.startIndex, nrows);
+  VectorType p(0, ncols);
+  VectorType Ap(b.startIndex, nrows);
+
+  normr = 0;
+  magnitude_type rtrans = 0;
+  magnitude_type oldrtrans = 0;
+
+  LocalOrdinalType print_freq = max_iter/10;
+  if (print_freq>50) print_freq = 50;
+  if (print_freq<1)  print_freq = 1;
+
+  ScalarType one = 1.0;
+  ScalarType zero = 0.0;
+
+  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
+
+//  print_vec(p.coefs, "p");
+
+  TICK();
+  matvec(A, p, Ap);
+  TOCK(tMATVEC);
+
+  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
+
+  TICK(); rtrans = dot(r, r); TOCK(tDOT);
+
+//std::cout << "rtrans="<<rtrans<<std::endl;
+
+  normr = std::sqrt(rtrans);
+
+  if (myproc == 0) {
+    std::cout << "Initial Residual = "<< normr << std::endl;
+  }
+
+  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();
+
+#ifdef MINIFE_DEBUG
+  std::ostream& os = outstream();
+  os << "brkdown_tol = " << brkdown_tol << std::endl;
+#endif
+
+
+  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
+    if (k == 1) {
+      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
+    }
+    else {
+      oldrtrans = rtrans;
+      TICK(); rtrans = dot(r, r); TOCK(tDOT);
+      magnitude_type beta = rtrans/oldrtrans;
+      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
+    }
+
+    normr = std::sqrt(rtrans);
+
+    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
+      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
+    }
+
+    magnitude_type alpha = 0;
+    magnitude_type p_ap_dot = 0;
+
+#ifdef MINIFE_FUSED
+    TICK();
+    p_ap_dot = matvec_and_dot(A, p, Ap);
+    TOCK(tMATVECDOT);
+#else
+    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
+
+    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);
+#endif
+
+#ifdef MINIFE_DEBUG
+    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
+    os.flush();
+#endif
+    if (p_ap_dot < brkdown_tol) {
+      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
+        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
+#ifdef MINIFE_DEBUG
+        os << "ERROR, numerical breakdown!"<<std::endl;
+#endif
+        //update the timers before jumping out.
+        my_cg_times[WAXPY] = tWAXPY;
+        my_cg_times[DOT] = tDOT;
+        my_cg_times[MATVEC] = tMATVEC;
+        my_cg_times[TOTAL] = mytimer() - total_time;
+        return;
+      }
+      else brkdown_tol = 0.1 * p_ap_dot;
+    }
+    alpha = rtrans/p_ap_dot;
+#ifdef MINIFE_DEBUG
+    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
+#endif
+
+#ifdef MINIFE_FUSED
+    TICK();
+    fused_waxpby(one, x, alpha, p, x, one, r, -alpha, Ap, r);
+    TOCK(tWAXPY);
+#else
+    TICK(); waxpby(one, x, alpha, p, x);
+            waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY);
+#endif
+
+    num_iters = k;
+  }
+
+#ifdef HAVE_MPI
+#ifdef USE_MPI_PCONTROL
+  MPI_Pcontrol(0);
+#endif
+#endif
+
+  my_cg_times[WAXPY] = tWAXPY;
+  my_cg_times[DOT] = tDOT;
+  my_cg_times[MATVEC] = tMATVEC;
+  my_cg_times[MATVECDOT] = tMATVECDOT;
+  my_cg_times[TOTAL] = mytimer() - total_time;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/compute_matrix_stats.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/compute_matrix_stats.hpp
@@ -0,0 +1,116 @@
+#ifndef _compute_matrix_stats_hpp_
+#define _compute_matrix_stats_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+
+#include <outstream.hpp>
+#include <utils.hpp>
+#include <YAML_Doc.hpp>
+
+namespace miniFE {
+
+template<typename MatrixType>
+size_t
+compute_matrix_stats(const MatrixType& A, int myproc, int numprocs, YAML_Doc& ydoc)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
+  int min_proc = 0, max_proc = 0;
+
+  GlobalOrdinal local_nrows = A.rows.size();
+
+  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
+                     max_nrows, max_proc);
+
+  //Gather stats on global, min/max matrix num-nonzeros:
+
+  double local_nnz = A.num_nonzeros();
+  double dglobal_nnz = 0, dmin_nnz = 0, dmax_nnz = 0;
+
+  get_global_min_max(local_nnz, dglobal_nnz, dmin_nnz, min_proc,
+                     dmax_nnz, max_proc);
+
+  double avg_nrows = global_nrows;
+  avg_nrows /= numprocs;
+  double avg_nnz = dglobal_nnz;
+  avg_nnz /= numprocs;
+
+  double mem_overhead_MB = parallel_memory_overhead_MB(A);
+
+  size_t global_nnz = static_cast<size_t>(std::ceil(dglobal_nnz));
+  size_t min_nnz = static_cast<size_t>(std::ceil(dmin_nnz));
+  size_t max_nnz = static_cast<size_t>(std::ceil(dmax_nnz));
+  size_t global_num_rows = global_nrows;
+
+  if (myproc == 0) {
+    ydoc.add("Matrix attributes","");
+    ydoc.get("Matrix attributes")->add("Global Nrows",global_num_rows);
+    ydoc.get("Matrix attributes")->add("Global NNZ",global_nnz);
+
+    //compute how much memory the matrix occupies:
+    //num-bytes = sizeof(GlobalOrdinal)*global_nrows   for A.rows
+    //          + sizeof(LocalOrdinal)*global_nrows    for A.rows_offsets
+    //          + sizeof(GlobalOrdinal)*global_nnz     for A.packed_cols
+    //          + sizeof(Scalar)*global_nnz            for A.packed_coefs
+
+    double invGB = 1.0/(1024*1024*1024);
+    double memGB = invGB*global_nrows*sizeof(GlobalOrdinal);
+    memGB += invGB*global_nrows*sizeof(LocalOrdinal);
+    memGB += invGB*global_nnz*sizeof(GlobalOrdinal);
+    memGB += invGB*global_nnz*sizeof(Scalar);
+    ydoc.get("Matrix attributes")->add("Global Memory (GB)",memGB);
+
+    ydoc.get("Matrix attributes")->add("Pll Memory Overhead (MB)",mem_overhead_MB);
+
+    size_t min_num_rows = min_nrows;
+    size_t max_num_rows = max_nrows;
+    ydoc.get("Matrix attributes")->add("Rows per proc MIN",min_num_rows);
+    ydoc.get("Matrix attributes")->add("Rows per proc MAX",max_num_rows);
+    ydoc.get("Matrix attributes")->add("Rows per proc AVG",avg_nrows);
+    ydoc.get("Matrix attributes")->add("NNZ per proc MIN",min_nnz);
+    ydoc.get("Matrix attributes")->add("NNZ per proc MAX",max_nnz);
+    ydoc.get("Matrix attributes")->add("NNZ per proc AVG",avg_nnz);
+  }
+
+  return global_nnz;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/driver.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/driver.hpp
@@ -0,0 +1,412 @@
+#ifndef _driver_hpp_
+#define _driver_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+
+#include <box_utils.hpp>
+#include <Vector.hpp>
+
+#ifdef MINIFE_CSR_MATRIX
+#include <CSRMatrix.hpp>
+#elif defined(MINIFE_ELL_MATRIX)
+#include <ELLMatrix.hpp>
+#else
+#include <CSRMatrix.hpp>
+#endif
+
+#include <simple_mesh_description.hpp>
+
+#include <SparseMatrix_functions.hpp>
+
+#include <generate_matrix_structure.hpp>
+#include <assemble_FE_data.hpp>
+
+#include <verify_solution.hpp>
+
+#include <compute_matrix_stats.hpp>
+#include <make_local_matrix.hpp>
+#include <imbalance.hpp>
+#include <cg_solve.hpp>
+#if MINIFE_KERNELS != 0
+#include <time_kernels.hpp>
+#endif
+#include <outstream.hpp>
+#include <utils.hpp>
+#include <mytimer.hpp>
+#include <YAML_Doc.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \
+{                                   \
+  /*if (myproc==0) {                  \
+    std::cout.width(30);            \
+    std::cout << msg;               \
+    std::cout.flush();              \
+  }*/                                 \
+  timer_type rtf_t0 = mytimer();    \
+  fn;                               \
+  time_inc = mytimer() - rtf_t0;    \
+  time_total += time_inc;           \
+  /*if (myproc==0) {                  \
+    std::cout << time_inc << "s, total time: " << time_total << std::endl; \
+  }*/                                 \
+}
+
+//This program assembles finite-element matrices into a global matrix and
+//vector, then solves the linear-system using Conjugate Gradients.
+//Each finite-element is a hexahedron with 8 vertex-nodes.
+//
+//Notes:
+//- In finite-element terms, the box dimensions are in elements, not nodes.
+//  In other words, a 2x2x2 box describes 8 elements, each of which has 8 nodes,
+//  so it is a 3x3x3 node domain (27 nodes).
+//  The assembled linear system will have 1 equation for each finite element node.
+//
+//- The coordinate origin is at the corner of the global box where x=0,
+//  y=0, z=0, and the box extends along the positive x-axis, positive y-axis,
+//  and the positive z-axis.
+//
+//- Some aspects of matrix-structure generation and finite-element assembly
+//  are convenient to do using global node identifiers.
+//  A global identifier for each node is obtained from coordinates plus
+//  global box dimensions. See the function 'get_id' in box_utils.hpp.
+//
+//- Each node corresponds to a row in the matrix. The RCB partitioning method
+//  we use to split the global box among processors results in some
+//  processors owning non-contiguous blocks of global node identifiers.
+//  Since it is convenient for matrices and vectors to store contiguously-
+//  numbered blocks of rows, we map global node identifiers to a separate
+//  space of row numbers such that each processor's nodes correspond to a
+//  contiguous block of row numbers.
+//
+
+namespace miniFE {
+
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal>
+int
+driver(const Box& global_box, Box& my_box,
+       Parameters& params, YAML_Doc& ydoc)
+{
+  int global_nx = global_box[0][1];
+  int global_ny = global_box[1][1];
+  int global_nz = global_box[2][1];
+
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (params.load_imbalance > 0) {
+    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
+  }
+
+  float largest_imbalance = 0, std_dev = 0;
+  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
+                                   std_dev, ydoc, true);
+
+
+  //Create a representation of the mesh:
+  //Note that 'simple_mesh_description' is a virtual or conceptual
+  //mesh that doesn't actually store mesh data.
+#ifdef TIME_IT
+  if (myproc==0) {
+    std::cout.width(30);
+    std::cout << "creating/filling mesh...";
+    std::cout.flush();
+  }
+#endif
+
+  timer_type t_start = mytimer();
+  timer_type t0 = mytimer();
+
+  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);
+
+  timer_type mesh_fill = mytimer() - t0;
+  timer_type t_total = mytimer() - t_start;
+
+#ifdef TIME_IT
+  if (myproc==0) {
+    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
+  }
+#endif
+
+  //next we will generate the matrix structure.
+
+  //Declare matrix object:
+
+#if defined(MINIFE_ELL_MATRIX)
+  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
+#else
+  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
+#endif
+
+  MatrixType A;
+
+  timer_type gen_structure;
+  RUN_TIMED_FUNCTION("generating matrix structure...",
+                     generate_matrix_structure(mesh, A),
+                     gen_structure, t_total);
+
+  GlobalOrdinal local_nrows = A.rows.size();
+  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;
+
+  Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
+  Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);
+
+  //Assemble finite-element sub-matrices and sub-vectors into the global
+  //linear system:
+
+  timer_type fe_assembly;
+  RUN_TIMED_FUNCTION("assembling FE data...",
+                     assemble_FE_data(mesh, A, b, params),
+                     fe_assembly, t_total);
+
+  if (myproc == 0) {
+    ydoc.add("Matrix structure generation","");
+    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
+    ydoc.add("FE assembly","");
+    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
+  }
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A_prebc.mtx", A);
+  write_vector("b_prebc.vec", b);
+#endif
+
+  //Now apply dirichlet boundary-conditions
+  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)
+
+  timer_type dirbc_time;
+  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
+            impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total);
+  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
+            impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total);
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A.mtx", A);
+  write_vector("b.vec", b);
+#endif
+
+  //Transform global indices to local, set up communication information:
+
+  timer_type make_local_time;
+  RUN_TIMED_FUNCTION("making matrix indices local...",
+                     make_local_matrix(A),
+                     make_local_time, t_total);
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A_local.mtx", A);
+  write_vector("b_local.vec", b);
+#endif
+
+  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);
+
+  //Prepare to perform conjugate gradient solve:
+
+  LocalOrdinal max_iters = 200;
+  LocalOrdinal num_iters = 0;
+  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
+  magnitude rnorm = 0;
+  magnitude tol = std::numeric_limits<magnitude>::epsilon();
+
+  timer_type cg_times[NUM_TIMERS];
+
+  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;
+
+  t_total = mytimer() - t_start;
+
+  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;
+
+  int verify_result = 0;
+
+#if MINIFE_KERNELS != 0
+  if (myproc==0) {
+    std::cout.width(30);
+    std::cout << "Starting kernel timing loops ..." << std::endl;
+  }
+
+  max_iters = 500;
+  x.coefs[0] = 0.9;
+  if (matvec_with_comm_overlap) {
+    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
+  }
+  else {
+    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
+  }
+  num_iters = max_iters;
+  std::string title("Kernel timings");
+#else
+  if (myproc==0) {
+    std::cout << "Starting CG solver ... " << std::endl;
+  }
+
+  if (matvec_with_comm_overlap) {
+#ifdef MINIFE_CSR_MATRIX
+    rearrange_matrix_local_external(A);
+    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
+           num_iters, rnorm, cg_times);
+#else
+    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
+#endif
+  }
+  else {
+    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
+           num_iters, rnorm, cg_times);
+    if (myproc == 0) {
+      std::cout << "Final Resid Norm: " << rnorm << std::endl;
+    }
+
+    if (params.verify_solution > 0) {
+      double tolerance = 0.06;
+      bool verify_whole_domain = false;
+  #ifdef MINIFE_DEBUG
+      verify_whole_domain = true;
+  #endif
+      if (myproc == 0) {
+        if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
+        else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
+      }
+      verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
+    }
+  }
+
+#ifdef MINIFE_DEBUG
+  write_vector("x.vec", x);
+#endif
+  std::string title("CG solve");
+#endif
+
+  if (myproc == 0) {
+    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
+    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
+    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
+    ydoc.add(title,"");
+    ydoc.get(title)->add("Iterations",num_iters);
+    ydoc.get(title)->add("Final Resid Norm",rnorm);
+
+    GlobalOrdinal global_nrows = global_nx;
+    global_nrows *= global_ny*global_nz;
+
+    //flops-per-mv, flops-per-dot, flops-per-waxpy:
+    double mv_flops = global_nnz*2.0;
+    double dot_flops = global_nrows*2.0;
+    double waxpy_flops = global_nrows*3.0;
+
+#if MINIFE_KERNELS == 0
+//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
+//there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
+    mv_flops *= (num_iters+1);
+    dot_flops *= (2*num_iters);
+    waxpy_flops *= (3*num_iters+2);
+#else
+//if MINIFE_KERNELS then we did one of each operation per iteration.
+    mv_flops *= num_iters;
+    dot_flops *= num_iters;
+    waxpy_flops *= num_iters;
+#endif
+
+    double total_flops = mv_flops + dot_flops + waxpy_flops;
+
+    double mv_mflops = -1;
+    if (cg_times[MATVEC] > 1.e-4)
+      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);
+
+    double dot_mflops = -1;
+    if (cg_times[DOT] > 1.e-4)
+      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);
+
+    double waxpy_mflops = -1;
+    if (cg_times[WAXPY] > 1.e-4)
+      waxpy_mflops = 1.e-6 *  (waxpy_flops/cg_times[WAXPY]);
+
+    double total_mflops = -1;
+    if (cg_times[TOTAL] > 1.e-4)
+      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);
+
+    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
+    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
+    if (waxpy_mflops >= 0)
+      ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
+    else
+      ydoc.get(title)->add("WAXPY Mflops","inf");
+
+    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
+    ydoc.get(title)->add("DOT Flops",dot_flops);
+    if (dot_mflops >= 0)
+      ydoc.get(title)->add("DOT Mflops",dot_mflops);
+    else
+      ydoc.get(title)->add("DOT Mflops","inf");
+
+    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
+    ydoc.get(title)->add("MATVEC Flops",mv_flops);
+    if (mv_mflops >= 0)
+      ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
+    else
+      ydoc.get(title)->add("MATVEC Mflops","inf");
+
+#ifdef MINIFE_FUSED
+    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
+    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
+    if (mv_mflops >= 0)
+      ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
+    else
+      ydoc.get(title)->add("MATVECDOT Mflops","inf");
+#endif
+
+#if MINIFE_KERNELS == 0
+    ydoc.get(title)->add("Total","");
+    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
+    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
+    if (total_mflops >= 0)
+      ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
+    else
+      ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
+    ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
+#endif
+  }
+
+  return verify_result;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/exchange_externals.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/exchange_externals.hpp
@@ -0,0 +1,269 @@
+#ifndef _exchange_externals_hpp_
+#define _exchange_externals_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstdlib>
+#include <iostream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <outstream.hpp>
+
+#include <TypeTraits.hpp>
+
+namespace miniFE {
+
+template<typename MatrixType,
+         typename VectorType>
+void
+exchange_externals(MatrixType& A,
+                   VectorType& x)
+{
+#ifdef HAVE_MPI
+#ifdef MINIFE_DEBUG
+  std::ostream& os = outstream();
+  os << "entering exchange_externals\n";
+#endif
+
+  int numprocs = 1;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+
+  if (numprocs < 2) return;
+
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  // Extract Matrix pieces
+
+  int local_nrow = A.rows.size();
+  int num_neighbors = A.neighbors.size();
+  const std::vector<LocalOrdinal>& recv_length = A.recv_length;
+  const std::vector<LocalOrdinal>& send_length = A.send_length;
+  const std::vector<int>& neighbors = A.neighbors;
+  const std::vector<GlobalOrdinal>& elements_to_send = A.elements_to_send;
+
+  std::vector<Scalar>& send_buffer = A.send_buffer;
+
+  //
+  // first post receives, these are immediate receives
+  // Do not wait for result to come, will do that at the
+  // wait call below.
+  //
+
+  int MPI_MY_TAG = 99;
+
+  std::vector<MPI_Request>& request = A.request;
+
+  //
+  // Externals are at end of locals
+  //
+
+  std::vector<Scalar>& x_coefs = x.coefs;
+  Scalar* x_external = &(x_coefs[local_nrow]);
+
+  MPI_Datatype mpi_dtype = TypeTraits<Scalar>::mpi_type();
+
+  // Post receives first
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_recv = recv_length[i];
+    MPI_Irecv(x_external, n_recv, mpi_dtype, neighbors[i], MPI_MY_TAG,
+              MPI_COMM_WORLD, &request[i]);
+    x_external += n_recv;
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "launched recvs\n";
+#endif
+
+  //
+  // Fill up send buffer
+  //
+
+  size_t total_to_be_sent = elements_to_send.size();
+#ifdef MINIFE_DEBUG
+  os << "total_to_be_sent: " << total_to_be_sent << std::endl;
+#endif
+
+  for(size_t i=0; i<total_to_be_sent; ++i) {
+#ifdef MINIFE_DEBUG
+    //expensive index range-check:
+    if (elements_to_send[i] < 0 || elements_to_send[i] > x.coefs.size()) {
+      os << "error, out-of-range. x.coefs.size()=="<<x.coefs.size()<<", elements_to_send[i]=="<<elements_to_send[i]<<std::endl;
+    }
+#endif
+    send_buffer[i] = x.coefs[elements_to_send[i]];
+  }
+
+  //
+  // Send to each neighbor
+  //
+
+  Scalar* s_buffer = &send_buffer[0];
+
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_send = send_length[i];
+    MPI_Send(s_buffer, n_send, mpi_dtype, neighbors[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+    s_buffer += n_send;
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "send to " << num_neighbors << std::endl;
+#endif
+
+  //
+  // Complete the reads issued above
+  //
+
+  MPI_Status status;
+  for(int i=0; i<num_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "leaving exchange_externals"<<std::endl;
+#endif
+
+//endif HAVE_MPI
+#endif
+}
+
+#ifdef HAVE_MPI
+static std::vector<MPI_Request> exch_ext_requests;
+#endif
+
+template<typename MatrixType,
+         typename VectorType>
+void
+begin_exchange_externals(MatrixType& A,
+                         VectorType& x)
+{
+#ifdef HAVE_MPI
+
+  int numprocs = 1, myproc = 0;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+
+  if (numprocs < 2) return;
+
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  // Extract Matrix pieces
+
+  int local_nrow = A.rows.size();
+  int num_neighbors = A.neighbors.size();
+  const std::vector<LocalOrdinal>& recv_length = A.recv_length;
+  const std::vector<LocalOrdinal>& send_length = A.send_length;
+  const std::vector<int>& neighbors = A.neighbors;
+  const std::vector<GlobalOrdinal>& elements_to_send = A.elements_to_send;
+
+  std::vector<Scalar> send_buffer(elements_to_send.size(), 0);
+
+  //
+  // first post receives, these are immediate receives
+  // Do not wait for result to come, will do that at the
+  // wait call below.
+  //
+
+  int MPI_MY_TAG = 99;
+
+  exch_ext_requests.resize(num_neighbors);
+
+  //
+  // Externals are at end of locals
+  //
+
+  std::vector<Scalar>& x_coefs = x.coefs;
+  Scalar* x_external = &(x_coefs[local_nrow]);
+
+  MPI_Datatype mpi_dtype = TypeTraits<Scalar>::mpi_type();
+
+  // Post receives first
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_recv = recv_length[i];
+    MPI_Irecv(x_external, n_recv, mpi_dtype, neighbors[i], MPI_MY_TAG,
+              MPI_COMM_WORLD, &exch_ext_requests[i]);
+    x_external += n_recv;
+  }
+
+  //
+  // Fill up send buffer
+  //
+
+  size_t total_to_be_sent = elements_to_send.size();
+  for(size_t i=0; i<total_to_be_sent; ++i) send_buffer[i] = x.coefs[elements_to_send[i]];
+
+  //
+  // Send to each neighbor
+  //
+
+  Scalar* s_buffer = &send_buffer[0];
+
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_send = send_length[i];
+    MPI_Send(s_buffer, n_send, mpi_dtype, neighbors[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+    s_buffer += n_send;
+  }
+#endif
+}
+
+inline
+void
+finish_exchange_externals(int num_neighbors)
+{
+#ifdef HAVE_MPI
+  //
+  // Complete the reads issued above
+  //
+
+  MPI_Status status;
+  for(int i=0; i<num_neighbors; ++i) {
+    if (MPI_Wait(&exch_ext_requests[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+  }
+
+//endif HAVE_MPI
+#endif
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/gauss_pts.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/gauss_pts.hpp
@@ -0,0 +1,67 @@
+#ifndef _gauss_pts_hpp_
+#define _gauss_pts_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KERNEL_PREFIX 
+#define KERNEL_PREFIX
+#endif
+
+namespace miniFE {
+
+template<typename Scalar>
+inline
+KERNEL_PREFIX void gauss_pts(int N, Scalar* pts, Scalar* wts)
+{
+  const Scalar x2 = 0.577350269; // 1.0/sqrt(3.0)
+  const Scalar x3 = 0.77459667; // sqrt(3.0/5.0)
+  const Scalar w1 = 0.55555556; // 5.0/9.0
+  const Scalar w2 = 0.88888889; // 8.0/9.0
+
+  switch(N) {
+  case 1:
+    pts[0] = 0.0; wts[0] = 2.0;
+    break;
+  case 2:
+    pts[0] = -x2; wts[0] = 1.0;
+    pts[1] = x2;  wts[1] = 1.0;
+    break;
+  case 3:
+    pts[0] =  -x3;  wts[0] = w1;
+    pts[1] =  0.0;  wts[1] = w2;
+    pts[2] =   x3;  wts[2] = w1;
+    break;
+  default:
+    break;
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/generate_matrix_structure.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/generate_matrix_structure.hpp
@@ -0,0 +1,154 @@
+#ifndef _generate_matrix_structure_hpp_
+#define _generate_matrix_structure_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <sstream>
+#include <stdexcept>
+#include <map>
+#include <algorithm>
+
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <box_utils.hpp>
+#include <utils.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename MatrixType>
+int
+generate_matrix_structure(const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh,
+                          MatrixType& A)
+{
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  int threw_exc = 0;
+  try {
+
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+  Box box;
+  copy_box(mesh.local_box, box);
+
+  //num-owned-nodes in each dimension is num-elems+1
+  //only if num-elems > 0 in that dimension *and*
+  //we are at the high end of the global range in that dimension:
+  if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1];
+  if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1];
+  if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1];
+
+  GlobalOrdinal global_nrows = global_nodes_x;
+  global_nrows *= global_nodes_y*global_nodes_z;
+
+  GlobalOrdinal nrows = get_num_ids<GlobalOrdinal>(box);
+  try {
+    A.reserve_space(nrows, 27);
+  }
+  catch(std::exception& exc) {
+    std::ostringstream osstr;
+    osstr << "One of A.rows.resize, A.row_offsets.resize, A.packed_cols.reserve or A.packed_coefs.reserve: nrows=" <<nrows<<": ";
+    osstr << exc.what();
+    std::string str1 = osstr.str();
+    throw std::runtime_error(str1);
+  }
+
+  std::vector<GlobalOrdinal> rows(nrows);
+  std::vector<LocalOrdinal> row_offsets(nrows+1);
+  std::vector<int> row_coords(nrows*3);
+
+  unsigned roffset = 0;
+  GlobalOrdinal nnz = 0;
+
+  for(int iz=box[2][0]; iz<box[2][1]; ++iz) {
+   for(int iy=box[1][0]; iy<box[1][1]; ++iy) {
+    for(int ix=box[0][0]; ix<box[0][1]; ++ix) {
+      GlobalOrdinal row_id =
+          get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                ix, iy, iz);
+      rows[roffset] = mesh.map_id_to_row(row_id);
+      row_coords[roffset*3] = ix;
+      row_coords[roffset*3+1] = iy;
+      row_coords[roffset*3+2] = iz;
+      row_offsets[roffset++] = nnz;
+
+      for(int sz=-1; sz<=1; ++sz) {
+       for(int sy=-1; sy<=1; ++sy) {
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinal col_id =
+              get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                   ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) {
+            ++nnz;
+          }
+        }
+       }
+      }
+
+    }
+   }
+  }
+
+  if (roffset != nrows) {
+    throw std::runtime_error("ERROR in generate_matrix_structure, roffset != nrows.");
+  }
+  row_offsets[roffset] = nnz;
+
+  init_matrix(A, rows, row_offsets, row_coords,
+              global_nodes_x, global_nodes_y, global_nodes_z, global_nrows, mesh);
+  }
+  catch(...) {
+    std::cout << "proc " << myproc << " threw an exception in generate_matrix_structure, probably due to running out of memory." << std::endl;
+    threw_exc = 1;
+  }
+#ifdef HAVE_MPI
+  int global_throw = 0;
+  MPI_Allreduce(&threw_exc, &global_throw, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  threw_exc = global_throw;
+#endif
+  if (threw_exc) {
+    return 1;
+  }
+
+  return 0;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/imbalance.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/imbalance.hpp
@@ -0,0 +1,298 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _imbalance_hpp_
+#define _imbalance_hpp_
+
+#include <cmath>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <box_utils.hpp>
+#include <utils.hpp>
+#include <YAML_Doc.hpp>
+
+namespace miniFE {
+
+const int X = 0;
+const int Y = 1;
+const int Z = 2;
+const int NONE = 3;
+
+const int LOWER = 0;
+const int UPPER = 1;
+
+template<typename GlobalOrdinal>
+void
+compute_imbalance(const Box& global_box,
+                  const Box& local_box,
+                  float& largest_imbalance,
+                  float& std_dev,
+                  YAML_Doc& doc,
+                  bool record_in_doc)
+{
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
+  GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
+  int min_proc = myproc, max_proc = myproc;
+  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
+                     max_nrows, max_proc);
+
+  float avg_nrows = global_nrows;
+  avg_nrows /= numprocs;
+
+  //largest_imbalance will be the difference between the min (or max)
+  //rows-per-processor and avg_nrows, represented as a percentage:
+  largest_imbalance = percentage_difference<float>(min_nrows, avg_nrows);
+
+  float tmp = percentage_difference<float>(max_nrows, avg_nrows);
+  if (tmp > largest_imbalance) largest_imbalance = tmp;
+
+  std_dev = compute_std_dev_as_percentage<float>(local_nrows, avg_nrows);
+
+  if (myproc == 0 && record_in_doc) {
+    doc.add("Rows-per-proc Load Imbalance","");
+    doc.get("Rows-per-proc Load Imbalance")->add("Largest (from avg, %)",largest_imbalance);
+    doc.get("Rows-per-proc Load Imbalance")->add("Std Dev (%)",std_dev);
+  }
+}
+
+std::pair<int,int>
+decide_how_to_grow(const Box& global_box, const Box& local_box)
+{
+  std::pair<int,int> result(NONE,UPPER);
+
+  if (local_box[Z][UPPER] < global_box[Z][UPPER]) {
+    result.first = Z;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Z][LOWER] > global_box[Z][LOWER]) {
+    result.first = Z;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[Y][UPPER] < global_box[Y][UPPER]) {
+    result.first = Y;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Y][LOWER] > global_box[Y][LOWER]) {
+    result.first = Y;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[X][UPPER] < global_box[X][UPPER]) {
+    result.first = X;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[X][LOWER] > global_box[X][LOWER]) {
+    result.first = X;
+    result.second = LOWER;
+    return result;
+  }
+  return result;
+}
+
+std::pair<int,int>
+decide_how_to_shrink(const Box& global_box, const Box& local_box)
+{
+  std::pair<int,int> result(NONE,UPPER);
+
+  if (local_box[Z][UPPER] < global_box[Z][UPPER] && local_box[Z][UPPER]-local_box[Z][LOWER] > 2) {
+    result.first = Z;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Z][LOWER] > global_box[Z][LOWER] && local_box[Z][UPPER]-local_box[Z][LOWER] > 2) {
+    result.first = Z;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[Y][UPPER] < global_box[Y][UPPER] && local_box[Y][UPPER]-local_box[Y][LOWER] > 2) {
+    result.first = Y;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Y][LOWER] > global_box[Y][LOWER] && local_box[Y][UPPER]-local_box[Y][LOWER] > 2) {
+    result.first = Y;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[X][UPPER] < global_box[X][UPPER] && local_box[X][UPPER]-local_box[X][LOWER] > 2) {
+    result.first = X;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[X][LOWER] > global_box[X][LOWER] && local_box[X][UPPER]-local_box[X][LOWER] > 2) {
+    result.first = X;
+    result.second = LOWER;
+    return result;
+  }
+  return result;
+}
+
+template<typename GlobalOrdinal>
+void
+add_imbalance(const Box& global_box,
+              Box& local_box,
+              float imbalance,
+              YAML_Doc& doc)
+{
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs == 1) {
+    return;
+  }
+
+  float cur_imbalance = 0, cur_std_dev = 0;
+  compute_imbalance<GlobalOrdinal>(global_box, local_box,
+                                  cur_imbalance, cur_std_dev, doc, false);
+
+  while (cur_imbalance < imbalance) {
+    GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
+    GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
+    int min_proc = myproc, max_proc = myproc;
+    get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
+                       max_nrows, max_proc);
+
+    std::pair<int,int> grow(NONE,UPPER);
+    int grow_axis_val = -1;
+    std::pair<int,int> shrink(NONE,UPPER);
+    int shrink_axis_val = -1;
+
+    if (myproc == max_proc) {
+      grow = decide_how_to_grow(global_box, local_box);
+      if (grow.first != NONE) {
+        grow_axis_val = local_box[grow.first][grow.second];
+      }
+    }
+    if (myproc == min_proc) {
+      shrink = decide_how_to_shrink(global_box, local_box);
+      if (shrink.first != NONE) {
+        shrink_axis_val = local_box[shrink.first][shrink.second];
+      }
+    }
+
+    int grow_info[8] = {grow.first, grow.second,
+                        local_box[X][0], local_box[X][1],
+                        local_box[Y][0], local_box[Y][1],
+                        local_box[Z][0], local_box[Z][1]};
+
+    int shrink_info[8] = {shrink.first, shrink.second,
+                        local_box[X][0], local_box[X][1],
+                        local_box[Y][0], local_box[Y][1],
+                        local_box[Z][0], local_box[Z][1]};
+#ifdef HAVE_MPI
+    MPI_Bcast(&grow_info[0], 8, MPI_INT, max_proc, MPI_COMM_WORLD);
+    MPI_Bcast(&shrink_info[0], 8, MPI_INT, min_proc, MPI_COMM_WORLD);
+#endif
+
+    int grow_axis = grow_info[0];
+    int grow_end = grow_info[1];
+    int shrink_axis = shrink_info[0];
+    int shrink_end = shrink_info[1];
+    int grow_incr = 1;
+    if (grow_end == LOWER) grow_incr = -1;
+    int shrink_incr = -1;
+    if (shrink_end == LOWER) shrink_incr = 1;
+    if (grow_axis != NONE) grow_axis_val = grow_info[2+grow_axis*2+grow_end];
+    if (shrink_axis != NONE) shrink_axis_val = shrink_info[2+shrink_axis*2+shrink_end];
+
+    if (grow_axis == NONE && shrink_axis == NONE) break;
+
+    bool grow_status = grow_axis==NONE ? false : true;
+    if (grow_axis != NONE) {
+      if ((grow_incr ==  1 && local_box[grow_axis][0] == grow_axis_val) ||
+          (grow_incr == -1 && local_box[grow_axis][1] == grow_axis_val)) {
+        if (local_box[grow_axis][1] - local_box[grow_axis][0] < 2) {
+          grow_status = false;
+        }
+      }
+    }
+
+    bool shrink_status = shrink_axis==NONE ? false : true;
+    if (shrink_axis != NONE) {
+      if ((shrink_incr ==  1 && local_box[shrink_axis][0] == shrink_axis_val) ||
+          (shrink_incr == -1 && local_box[shrink_axis][1] == shrink_axis_val)) {
+        if (local_box[shrink_axis][1] - local_box[shrink_axis][0] < 2) {
+          shrink_status = false;
+        }
+      }
+    }
+
+#ifdef HAVE_MPI
+    int statusints[2] = { grow_status ? 0 : 1, shrink_status ? 0 : 1 };
+    int globalstatus[2] = { 0, 0 };
+    MPI_Allreduce(&statusints, &globalstatus, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    grow_status = globalstatus[0]>0 ? false : true;
+    shrink_status = globalstatus[1]>0 ? false : true;
+#endif
+
+    if (grow_status == false && shrink_status == false) break;
+
+    if (grow_status && grow_axis != NONE) {
+      if (local_box[grow_axis][0] == grow_axis_val) {
+        local_box[grow_axis][0] += grow_incr;
+      }
+
+      if (local_box[grow_axis][1] == grow_axis_val) {
+        local_box[grow_axis][1] += grow_incr;
+      }
+    }
+
+    if (shrink_status && shrink_axis != NONE) {
+      if (local_box[shrink_axis][0] == shrink_axis_val) {
+        local_box[shrink_axis][0] += shrink_incr;
+      }
+
+      if (local_box[shrink_axis][1] == shrink_axis_val) {
+        local_box[shrink_axis][1] += shrink_incr;
+      }
+    }
+
+    compute_imbalance<GlobalOrdinal>(global_box, local_box,
+                                    cur_imbalance, cur_std_dev, doc, false);
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/main.cpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/main.cpp
@@ -0,0 +1,248 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <iostream>
+#include <ctime>
+#include <cstdlib>
+#include <vector>
+
+#ifdef MINIFE_REPORT_RUSAGE
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#include <miniFE_version.h>
+
+#include <outstream.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <Box.hpp>
+#include <BoxPartition.hpp>
+#include <box_utils.hpp>
+#include <Parameters.hpp>
+#include <utils.hpp>
+#include <driver.hpp>
+#include <YAML_Doc.hpp>
+
+#if MINIFE_INFO != 0
+#include <miniFE_info.hpp>
+#else
+#include <miniFE_no_info.hpp>
+#endif
+
+//The following macros should be specified as compile-macros in the
+//makefile. They are defaulted here just in case...
+#ifndef MINIFE_SCALAR
+#define MINIFE_SCALAR double
+#endif
+#ifndef MINIFE_LOCAL_ORDINAL
+#define MINIFE_LOCAL_ORDINAL int
+#endif
+#ifndef MINIFE_GLOBAL_ORDINAL
+#define MINIFE_GLOBAL_ORDINAL int
+#endif
+
+// ************************************************************************
+
+void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params);
+void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads);
+void add_timestring_to_yaml(YAML_Doc& doc);
+
+//
+//We will create a 'box' of size nx X ny X nz, partition it among processors,
+//then call miniFE::driver which will use the partitioned box as the domain
+//from which to assemble finite-element matrices into a global matrix and
+//vector, then solve the linear-system using Conjugate Gradients.
+//
+
+int main(int argc, char** argv) {
+  miniFE::Parameters params;
+  miniFE::get_parameters(argc, argv, params);
+
+  int numprocs = 1, myproc = 0;
+  miniFE::initialize_mpi(argc, argv, numprocs, myproc);
+
+#ifdef HAVE_MPI
+#ifdef USE_MPI_PCONTROL
+  MPI_Pcontrol(0);
+#endif
+#endif
+
+  miniFE::timer_type start_time = miniFE::mytimer();
+
+#ifdef MINIFE_DEBUG
+  outstream(numprocs, myproc);
+#endif
+
+  //make sure each processor has the same parameters:
+  miniFE::broadcast_parameters(params);
+
+
+  Box global_box = { 0, params.nx, 0, params.ny, 0, params.nz };
+  std::vector<Box> local_boxes(numprocs);
+
+  box_partition(0, numprocs, 2, global_box, &local_boxes[0]);
+
+  Box& my_box = local_boxes[myproc];
+
+  MINIFE_GLOBAL_ORDINAL num_my_ids = miniFE::get_num_ids<MINIFE_GLOBAL_ORDINAL>(my_box);
+  MINIFE_GLOBAL_ORDINAL min_ids = num_my_ids;
+
+#ifdef HAVE_MPI
+  MPI_Datatype mpi_dtype = miniFE::TypeTraits<MINIFE_GLOBAL_ORDINAL>::mpi_type();
+  MPI_Allreduce(&num_my_ids, &min_ids, 1, mpi_dtype, MPI_MIN, MPI_COMM_WORLD);
+#endif
+
+  if (min_ids == 0) {
+    std::cout<<"One or more processors have 0 equations. Not currently supported. Exiting."<<std::endl;
+
+    miniFE::finalize_mpi();
+
+    return 1;
+  }
+
+  std::ostringstream osstr;
+  osstr << "miniFE." << params.nx << "x" << params.ny << "x" << params.nz;
+#ifdef HAVE_MPI
+  osstr << ".P"<<numprocs;
+#endif
+  osstr << ".";
+  if (params.name != "") osstr << params.name << ".";
+
+  YAML_Doc doc("miniFE", MINIFE_VERSION, ".", osstr.str());
+  if (myproc == 0) {
+    add_params_to_yaml(doc, params);
+    add_configuration_to_yaml(doc, numprocs, params.numthreads);
+    add_timestring_to_yaml(doc);
+  }
+
+  //Most of the program is performed in the 'driver' function, which is
+  //templated on < Scalar, LocalOrdinal, GlobalOrdinal >.
+  //To run miniFE with float instead of double, or 'long long' instead of int,
+  //etc., change these template-parameters by changing the macro definitions in
+  //the makefile or on the make command-line.
+
+  int return_code =
+     miniFE::driver< MINIFE_SCALAR, MINIFE_LOCAL_ORDINAL, MINIFE_GLOBAL_ORDINAL>(global_box, my_box, params, doc);
+
+  miniFE::timer_type total_time = miniFE::mytimer() - start_time;
+
+#ifdef MINIFE_REPORT_RUSAGE
+   struct rusage get_mem;
+   getrusage(RUSAGE_SELF, &get_mem);
+
+   long long int rank_rss = get_mem.ru_maxrss;
+   long long int global_rss = 0;
+   long long int max_rss = 0;
+
+#ifdef HAVE_MPI
+   MPI_Reduce(&rank_rss, &global_rss, 1,
+       	MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+   MPI_Reduce(&rank_rss, &max_rss, 1,
+       	MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD);
+   if (myproc == 0) {
+        doc.add("Global All-RSS (kB)", global_rss);
+       	doc.add("Global Max-RSS (kB)", max_rss);
+   }
+#else
+   doc.add("RSS (kB)", rank_rss);
+#endif
+#endif
+
+  if (myproc == 0) {
+    doc.add("Total Program Time",total_time);
+    doc.generateYAML();
+  }
+
+  miniFE::finalize_mpi();
+
+  return return_code;
+}
+
+void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params)
+{
+  doc.add("Global Run Parameters","");
+  doc.get("Global Run Parameters")->add("dimensions","");
+  doc.get("Global Run Parameters")->get("dimensions")->add("nx",params.nx);
+  doc.get("Global Run Parameters")->get("dimensions")->add("ny",params.ny);
+  doc.get("Global Run Parameters")->get("dimensions")->add("nz",params.nz);
+  doc.get("Global Run Parameters")->add("load_imbalance", params.load_imbalance);
+  if (params.mv_overlap_comm_comp == 1) {
+    std::string val("1 (yes)");
+    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
+  }
+  else {
+    std::string val("0 (no)");
+    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
+  }
+}
+
+void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads)
+{
+  doc.get("Global Run Parameters")->add("number of processors", numprocs);
+
+  doc.add("Platform","");
+  doc.get("Platform")->add("hostname",MINIFE_HOSTNAME);
+  doc.get("Platform")->add("kernel name",MINIFE_KERNEL_NAME);
+  doc.get("Platform")->add("kernel release",MINIFE_KERNEL_RELEASE);
+  doc.get("Platform")->add("processor",MINIFE_PROCESSOR);
+
+  doc.add("Build","");
+  doc.get("Build")->add("CXX",MINIFE_CXX);
+#if MINIFE_INFO != 0
+  doc.get("Build")->add("compiler version",MINIFE_CXX_VERSION);
+#endif
+  doc.get("Build")->add("CXXFLAGS",MINIFE_CXXFLAGS);
+  std::string using_mpi("no");
+#ifdef HAVE_MPI
+  using_mpi = "yes";
+#endif
+  doc.get("Build")->add("using MPI",using_mpi);
+}
+
+void add_timestring_to_yaml(YAML_Doc& doc)
+{
+  std::time_t rawtime;
+  struct tm * timeinfo;
+  std::time(&rawtime);
+  timeinfo = std::localtime(&rawtime);
+  std::ostringstream osstr;
+  osstr.fill('0');
+  osstr << timeinfo->tm_year+1900 << "-";
+  osstr.width(2); osstr << timeinfo->tm_mon+1 << "-";
+  osstr.width(2); osstr << timeinfo->tm_mday << ", ";
+  osstr.width(2); osstr << timeinfo->tm_hour << "-";
+  osstr.width(2); osstr << timeinfo->tm_min << "-";
+  osstr.width(2); osstr << timeinfo->tm_sec;
+  std::string timestring = osstr.str();
+  doc.add("Run Date/Time",timestring);
+}
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/make_local_matrix.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/make_local_matrix.hpp
@@ -0,0 +1,447 @@
+#ifndef _make_local_matrix_hpp_
+#define _make_local_matrix_hpp_
+#include <assert.h>
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <utils.hpp>
+
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename MatrixType>
+void
+make_local_matrix(MatrixType& A)
+{
+#ifdef HAVE_MPI
+  int numprocs = 1, myproc = 0;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+
+  if (numprocs < 2) {
+    A.num_cols = A.rows.size();
+    A.has_local_indices = true;
+    return;
+  }
+
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  std::map<GlobalOrdinal,GlobalOrdinal> externals;
+  LocalOrdinal num_external = 0;
+
+  //Extract Matrix pieces
+
+  size_t local_nrow = A.rows.size();
+  GlobalOrdinal start_row = local_nrow>0 ? A.rows[0] : -1;
+  GlobalOrdinal stop_row  = local_nrow>0 ? A.rows[local_nrow-1] : -1;
+
+  // We need to convert the index values for the rows on this processor
+  // to a local index space. We need to:
+  // - Determine if each index reaches to a local value or external value
+  // - If local, subtract start_row from index value to get local index
+  // - If external, find out if it is already accounted for.
+  //   - If so, then do nothing,
+  //   - otherwise
+  //     - add it to the list of external indices,
+  //     - find out which processor owns the value.
+  //     - Set up communication for sparse MV operation
+
+  ///////////////////////////////////////////
+  // Scan the indices and transform to local
+  ///////////////////////////////////////////
+
+  std::vector<GlobalOrdinal>& external_index = A.external_index;
+
+  for(size_t i=0; i<A.rows.size(); ++i) {
+    GlobalOrdinal* Acols = NULL;
+    Scalar* Acoefs = NULL;
+    size_t row_len = 0;
+    A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
+
+    for(size_t j=0; j<row_len; ++j) {
+      GlobalOrdinal cur_ind = Acols[j];
+      if (start_row <= cur_ind && cur_ind <= stop_row) {
+        Acols[j] -= start_row;
+      }
+      else { // Must find out if we have already set up this point
+        if (externals.find(cur_ind) == externals.end()) {
+          externals[cur_ind] = num_external++;
+          external_index.push_back(cur_ind);
+        }
+        // Mark index as external by adding 1 and negating it
+        Acols[j] = -(Acols[j] + 1);
+      }
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // Go through list of externals to find out which processors must be accessed.
+  ////////////////////////////////////////////////////////////////////////
+
+  std::vector<GlobalOrdinal> tmp_buffer(numprocs, 0); // Temp buffer space needed below
+
+  // Build list of global index offset
+
+  std::vector<GlobalOrdinal> global_index_offsets(numprocs, 0);
+
+  tmp_buffer[myproc] = start_row; // This is my start row
+
+  // This call sends the start_row of each ith processor to the ith
+  // entry of global_index_offsets on all processors.
+  // Thus, each processor knows the range of indices owned by all
+  // other processors.
+  // Note: There might be a better algorithm for doing this, but this
+  //       will work...
+
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allreduce(&tmp_buffer[0], &global_index_offsets[0], numprocs, mpi_dtype,
+                MPI_SUM, MPI_COMM_WORLD);
+
+  // Go through list of externals and find the processor that owns each
+  std::vector<int> external_processor(num_external);
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    GlobalOrdinal cur_ind = external_index[i];
+    for(int j=numprocs-1; j>=0; --j) {
+      if (global_index_offsets[j] <= cur_ind && global_index_offsets[j] >= 0) {
+        external_processor[i] = j;
+        break;
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  // Sift through the external elements. For each newly encountered external
+  // point assign it the next index in the sequence. Then look for other
+  // external elements who are updated by the same node and assign them the next
+  // set of index numbers in the sequence (ie. elements updated by the same node
+  // have consecutive indices).
+  /////////////////////////////////////////////////////////////////////////
+
+  size_t count = local_nrow;
+  std::vector<GlobalOrdinal>& external_local_index = A.external_local_index;
+  external_local_index.assign(num_external, -1);
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    if (external_local_index[i] == -1) {
+      external_local_index[i] = count++;
+
+      for(LocalOrdinal j=i+1; j<num_external; ++j) {
+        if (external_processor[j] == external_processor[i])
+          external_local_index[j] = count++;
+      }
+    }
+  }
+
+  for(size_t i=0; i<local_nrow; ++i) {
+    GlobalOrdinal* Acols = NULL;
+    Scalar* Acoefs = NULL;
+    size_t row_len = 0;
+    A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
+
+    for(size_t j=0; j<row_len; ++j) {
+      if (Acols[j] < 0) { // Change index values of externals
+        GlobalOrdinal cur_ind = -Acols[j] - 1;
+        Acols[j] = external_local_index[externals[cur_ind]];
+      }
+    }
+  }
+
+  std::vector<int> new_external_processor(num_external, 0);
+
+  for(int i=0; i<num_external; ++i) {
+    new_external_processor[external_local_index[i]-local_nrow] =
+      external_processor[i];
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  ///
+  // Count the number of neighbors from which we receive information to update
+  // our external elements. Additionally, fill the array tmp_neighbors in the
+  // following way:
+  //      tmp_neighbors[i] = 0   ==>  No external elements are updated by
+  //                              processor i.
+  //      tmp_neighbors[i] = x   ==>  (x-1)/numprocs elements are updated from
+  //                              processor i.
+  ///
+  ////////////////////////////////////////////////////////////////////////
+
+  std::vector<GlobalOrdinal> tmp_neighbors(numprocs, 0);
+
+  int num_recv_neighbors = 0;
+  int length             = 1;
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    if (tmp_neighbors[new_external_processor[i]] == 0) {
+      ++num_recv_neighbors;
+      tmp_neighbors[new_external_processor[i]] = 1;
+    }
+    tmp_neighbors[new_external_processor[i]] += numprocs;
+  }
+
+  /// sum over all processor all the tmp_neighbors arrays ///
+
+  MPI_Allreduce(&tmp_neighbors[0], &tmp_buffer[0], numprocs, mpi_dtype,
+                MPI_SUM, MPI_COMM_WORLD);
+
+  // decode the combined 'tmp_neighbors' (stored in tmp_buffer)
+  // array from all the processors
+
+  GlobalOrdinal num_send_neighbors = tmp_buffer[myproc] % numprocs;
+
+  /// decode 'tmp_buffer[myproc] to deduce total number of elements
+  //  we must send
+
+  GlobalOrdinal total_to_be_sent = (tmp_buffer[myproc] - num_send_neighbors) / numprocs;
+
+  ///////////////////////////////////////////////////////////////////////
+  ///
+  // Make a list of the neighbors that will send information to update our
+  // external elements (in the order that we will receive this information).
+  ///
+  ///////////////////////////////////////////////////////////////////////
+
+  std::vector<int> recv_list;
+  recv_list.push_back(new_external_processor[0]);
+  for(LocalOrdinal i=1; i<num_external; ++i) {
+    if (new_external_processor[i-1] != new_external_processor[i]) {
+      recv_list.push_back(new_external_processor[i]);
+    }
+  }
+
+  //
+  // Send a 0 length message to each of our recv neighbors
+  //
+
+  std::vector<int> send_list(num_send_neighbors, 0);
+
+  //
+  // first post receives, these are immediate receives
+  // Do not wait for result to come, will do that at the
+  // wait call below.
+  //
+  int MPI_MY_TAG = 99;
+
+  std::vector<MPI_Request> request(num_send_neighbors);
+  for(int i=0; i<num_send_neighbors; ++i) {
+    MPI_Irecv(&tmp_buffer[i], 1, mpi_dtype, MPI_ANY_SOURCE, MPI_MY_TAG,
+              MPI_COMM_WORLD, &request[i]);
+  }
+
+  // send messages
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    MPI_Send(&tmp_buffer[i], 1, mpi_dtype, recv_list[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+  }
+
+  ///
+  // Receive message from each send neighbor to construct 'send_list'.
+  ///
+
+  MPI_Status status;
+  for(int i=0; i<num_send_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    send_list[i] = status.MPI_SOURCE;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  ///
+  // Compare the two lists. In most cases they should be the same.
+  // However, if they are not then add new entries to the recv list
+  // that are in the send list (but not already in the recv list).
+  ///
+  //////////////////////////////////////////////////////////////////////
+
+  for(int j=0; j<num_send_neighbors; ++j) {
+    int found = 0;
+    for(int i=0; i<num_recv_neighbors; ++i) {
+      if (recv_list[i] == send_list[j]) found = 1;
+    }
+
+    if (found == 0) {
+      recv_list.push_back(send_list[j]);
+      ++num_recv_neighbors;
+    }
+  }
+
+  num_send_neighbors = num_recv_neighbors;
+  request.resize(num_send_neighbors);
+
+  A.elements_to_send.assign(total_to_be_sent, 0);
+  A.send_buffer.assign(total_to_be_sent, 0);
+
+  //
+  // Create 'new_external' which explicitly put the external elements in the
+  // order given by 'external_local_index'
+  //
+
+  std::vector<GlobalOrdinal> new_external(num_external);
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    new_external[external_local_index[i] - local_nrow] = external_index[i];
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // Send each processor the global index list of the external elements in the
+  // order that I will want to receive them when updating my external elements.
+  //
+  /////////////////////////////////////////////////////////////////////////
+
+  std::vector<int> lengths(num_recv_neighbors);
+
+  ++MPI_MY_TAG;
+
+  // First post receives
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    int partner = recv_list[i];
+    MPI_Irecv(&lengths[i], 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD,
+              &request[i]);
+  }
+
+  std::vector<int>& neighbors = A.neighbors;
+  std::vector<int>& recv_length = A.recv_length;
+  std::vector<int>& send_length = A.send_length;
+
+  neighbors.resize(num_recv_neighbors, 0);
+  A.request.resize(num_recv_neighbors);
+  recv_length.resize(num_recv_neighbors, 0);
+  send_length.resize(num_recv_neighbors, 0);
+
+  LocalOrdinal j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    int start = j;
+    int newlength = 0;
+
+    //go through list of external elements until updating
+    //processor changes
+
+    while((j < num_external) &&
+          (new_external_processor[j] == recv_list[i])) {
+      ++newlength;
+      ++j;
+      if (j == num_external) break;
+    }
+
+    recv_length[i] = newlength;
+    neighbors[i] = recv_list[i];
+
+    length = j - start;
+    MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
+  }
+
+  // Complete the receives of the number of externals
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    send_length[i] = lengths[i];
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // Build "elements_to_send" list. These are the x elements I own
+  // that need to be sent to other processors.
+  ////////////////////////////////////////////////////////////////////////
+
+  ++MPI_MY_TAG;
+
+  j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    MPI_Irecv(&A.elements_to_send[j], send_length[i], mpi_dtype, neighbors[i],
+              MPI_MY_TAG, MPI_COMM_WORLD, &request[i]);
+    j += send_length[i];
+  }
+
+  j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    LocalOrdinal start = j;
+    LocalOrdinal newlength = 0;
+
+    // Go through list of external elements
+    // until updating processor changes. This is redundant, but
+    // saves us from recording this information.
+
+    while((j < num_external) &&
+          (new_external_processor[j] == recv_list[i])) {
+      ++newlength;
+      ++j;
+      if (j == num_external) break;
+    }
+    MPI_Send(&new_external[start], j-start, mpi_dtype, recv_list[i],
+             MPI_MY_TAG, MPI_COMM_WORLD);
+  }
+
+  // receive from each neighbor the global index list of external elements
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+  }
+
+  /// replace global indices by local indices ///
+
+  for(GlobalOrdinal i=0; i<total_to_be_sent; ++i) {
+    A.elements_to_send[i] -= start_row;
+    if (A.elements_to_send[i] >= A.rows.size()) {
+      std::cout<<"start_row: "<<start_row<<", A.elements_to_send[i]: "<<A.elements_to_send[i]<<", A.rows.size(): "<<A.rows.size()<<std::endl;
+    assert(A.elements_to_send[i] < A.rows.size());
+    }
+  }
+
+  //////////////////
+  // Finish up !!
+  //////////////////
+
+  A.num_cols = local_nrow + num_external;
+
+#else
+  A.num_cols = A.rows.size();
+#endif
+
+  A.has_local_indices = true;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/matrix_algebra_3x3.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/matrix_algebra_3x3.hpp
@@ -0,0 +1,166 @@
+#ifndef _matrix_algebra_3x3_hpp_
+#define _matrix_algebra_3x3_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KERNEL_PREFIX
+#define KERNEL_PREFIX
+#endif
+
+namespace miniFE {
+
+template<typename Scalar>
+#ifdef __CUDACC__
+  __host__ __device__
+#endif
+KERNEL_PREFIX void fill(Scalar* begin, Scalar* end, const Scalar& val)
+{
+  while(begin != end) {*begin++ = val;}
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void inverse_and_determinant3x3(const Scalar* J, Scalar* invJ, Scalar& detJ)
+{
+  //hardwired "3x3" in function-name allows us to assume
+  //that J and invJ have length 9:
+
+  Scalar J00 = J[0];
+  Scalar J01 = J[1];
+  Scalar J02 = J[2];
+
+  Scalar J10 = J[3];
+  Scalar J11 = J[4];
+  Scalar J12 = J[5];
+
+  Scalar J20 = J[6];
+  Scalar J21 = J[7];
+  Scalar J22 = J[8];
+
+  Scalar term0 = J22*J11 - J21*J12;
+  Scalar term1 = J22*J01 - J21*J02;
+  Scalar term2 = J12*J01 - J11*J02;
+
+  detJ = J00*term0 - J10*term1 + J20*term2;
+
+  Scalar inv_detJ = 1.0/detJ;
+
+  invJ[0] =  term0*inv_detJ;
+  invJ[1] = -term1*inv_detJ;
+  invJ[2] =  term2*inv_detJ;
+
+  invJ[3] = -(J22*J10 - J20*J12)*inv_detJ;
+  invJ[4] =  (J22*J00 - J20*J02)*inv_detJ;
+  invJ[5] = -(J12*J00 - J10*J02)*inv_detJ;
+
+  invJ[6] =  (J21*J10 - J20*J11)*inv_detJ;
+  invJ[7] = -(J21*J00 - J20*J01)*inv_detJ;
+  invJ[8] =  (J11*J00 - J10*J01)*inv_detJ;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void matmat3x3(const Scalar* A, const Scalar* B, Scalar* C)
+{
+  //hardwired "3x3" in function-name allows us to assume args have length 9:
+  //A,B,C are all assumed to be ordered such that columns are contiguous.
+
+  const Scalar zero = 0;
+  miniFE::fill(C, C+9, zero);
+
+  for(int i=0; i<3; ++i) {
+    for(int j=0; j<3; ++j) {
+      C[i+j*3] = A[i+0]*B[j*3+0]
+               + A[i+3]*B[j*3+1]
+               + A[i+6]*B[j*3+2];
+    }
+  }
+}
+
+template<typename Scalar>
+KERNEL_PREFIX Scalar determinant3x3(const Scalar* J)
+{
+  //hardwired "3x3" in function-name allows us to assume that J has length 9:
+
+  Scalar J00 = J[0];
+  Scalar J01 = J[1];
+  Scalar J02 = J[2];
+
+  Scalar J10 = J[3];
+  Scalar J11 = J[4];
+  Scalar J12 = J[5];
+
+  Scalar J20 = J[6];
+  Scalar J21 = J[7];
+  Scalar J22 = J[8];
+
+  Scalar term0 = J22*J11 - J21*J12;
+  Scalar term1 = J22*J01 - J21*J02;
+  Scalar term2 = J12*J01 - J11*J02;
+
+  Scalar detJ = J00*term0 - J10*term1 + J20*term2;
+
+  return detJ;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void matmat3x3_X_3xn(const Scalar* A, int n, const Scalar* B, Scalar* C)
+{
+  //A is 3x3, B is 3xn. So C is also 3xn.
+  //A,B,C are all assumed to be ordered such that columns are contiguous.
+
+  Scalar* Cj = C;
+  const Scalar* Bj = B;
+  for(int j=0; j<n; ++j) {
+    Cj[0] = A[0]*Bj[0] + A[3]*Bj[1] + A[6]*Bj[2];
+    Cj[1] = A[1]*Bj[0] + A[4]*Bj[1] + A[7]*Bj[2];
+    Cj[2] = A[2]*Bj[0] + A[5]*Bj[1] + A[8]*Bj[2];
+    Bj += 3;
+    Cj += 3;
+  }
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void matTransMat3x3_X_3xn(const Scalar* A, int n, const Scalar* B, Scalar* C)
+{
+  //A is 3x3, B is 3xn. So C is also 3xn.
+  //A,B,C are all assumed to be ordered such that columns are contiguous.
+
+  Scalar* Cj = C;
+  const Scalar* Bj = B;
+  for(int j=0; j<n; ++j) {
+    Cj[0] = A[0]*Bj[0] + A[1]*Bj[1] + A[2]*Bj[2];
+    Cj[1] = A[3]*Bj[0] + A[4]*Bj[1] + A[5]*Bj[2];
+    Cj[2] = A[6]*Bj[0] + A[7]*Bj[1] + A[8]*Bj[2];
+    Bj += 3;
+    Cj += 3;
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.reference_output
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.reference_output
@@ -0,0 +1,14 @@
+Starting CG solver ... 
+Initial Residual = 65.0072
+Iteration = 20   Residual = 0.0409638
+Iteration = 40   Residual = 0.0116308
+Iteration = 60   Residual = 0.00330043
+Iteration = 80   Residual = 0.000152231
+Iteration = 100   Residual = 8.66762e-06
+Iteration = 120   Residual = 2.50799e-07
+Iteration = 140   Residual = 3.39847e-09
+Iteration = 160   Residual = 6.88585e-11
+Iteration = 180   Residual = 5.97875e-13
+Iteration = 200   Residual = 4.15881e-15
+Final Resid Norm: 4.15881e-15
+exit 0
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE_no_info.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE_no_info.hpp
@@ -0,0 +1,39 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef miniFE_no_info_hpp
+#define miniFE_no_info_hpp
+
+#define MINIFE_HOSTNAME "unknown"
+#define MINIFE_KERNEL_NAME "unknown"
+#define MINIFE_KERNEL_RELEASE "unknown"
+#define MINIFE_PROCESSOR "unknown"
+
+#define MINIFE_CXX "unknown"
+#define MINIFE_CXXFLAGS "unknown"
+
+#endif
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE_version.h
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE_version.h
@@ -0,0 +1,35 @@
+#ifndef _minife_version_h_
+#define _minife_version_h_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#define MINIFE_VERSION "2.0"
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/mytimer.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/mytimer.hpp
@@ -0,0 +1,52 @@
+#ifndef _mytimer_hpp_
+#define _mytimer_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+namespace miniFE {
+
+typedef double timer_type;
+
+timer_type mytimer();
+
+enum CG_TIMES {
+  WAXPY = 0,
+  DOT = 1,
+  MATVEC = 2,
+  MATVECDOT = 3,
+  TOTAL = 4,
+  NUM_TIMERS = 5
+};
+
+//Use TICK and TOCK to time a code section
+#define TICK() t0 = mytimer();
+#define TOCK(t) t += mytimer() - t0;
+
+}//namespace miniFE
+
+#endif
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/mytimer.cpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/mytimer.cpp
@@ -0,0 +1,132 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cstdlib>
+#include <mytimer.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+/////////////////////////////////////////////////////////////////////////
+
+// Function to return time in seconds.
+// If compiled with no flags, return CPU time (user and system).
+// If compiled with -DWALL, returns elapsed time.
+
+/////////////////////////////////////////////////////////////////////////
+
+#if defined(HAVE_MPI) && defined(USE_MPI_WTIME)
+
+timer_type mytimer()
+{
+  return((timer_type) MPI_Wtime());
+}
+
+
+#elif defined(UseClock)
+
+#include <time.hpp>
+timer_type mytimer(void)
+{
+   clock_t t1;
+   static clock_t t0=0;
+   static timer_type CPS = CLOCKS_PER_SEC;
+   timer_type d;
+
+   if (t0 == 0) t0 = clock();
+   t1 = clock() - t0;
+   d = t1 / CPS;
+   return(d);
+}
+
+#elif defined(WALL)
+
+#include <cstdlib>
+#include <sys/time.h>
+#include <sys/resource.h>
+timer_type mytimer(void)
+{
+   struct timeval tp;
+   static long start=0, startu;
+   if (!start)
+   {
+      gettimeofday(&tp, NULL);
+      start = tp.tv_sec;
+      startu = tp.tv_usec;
+      return(0.0);
+   }
+   gettimeofday(&tp, NULL);
+   return( ((timer_type) (tp.tv_sec - start)) + (tp.tv_usec-startu)/1000000.0 );
+}
+
+#elif defined(UseTimes)
+
+#include <cstdlib>
+#include <sys/times.h>
+#include <unistd.h>
+timer_type mytimer(void)
+{
+   struct tms ts;
+   static timer_type ClockTick=0.0;
+
+   if (ClockTick == 0.0) ClockTick = (timer_type) sysconf(_SC_CLK_TCK);
+   times(&ts);
+   return( (timer_type) ts.tms_utime / ClockTick );
+}
+
+#else
+
+#include <cstdlib>
+#include <sys/time.h>
+#include <sys/resource.h>
+timer_type mytimer(void)
+{
+//This function now uses gettimeofday instead of getrusage. See note below.
+//
+  struct timeval tv;
+  struct timezone tz;
+  gettimeofday(&tv, &tz);
+  return ( (timer_type)tv.tv_sec + tv.tv_usec/1000000.0 );
+
+//The below use of 'getrusage' is not used because it doesn't do the right thing
+//for the case of using threads. It adds up the time spent in multiple threads,
+//rather than giving elapsed time.
+//
+//   struct rusage ruse;
+//   getrusage(RUSAGE_SELF, &ruse);
+//   return( (timer_type)(ruse.ru_utime.tv_sec+ruse.ru_utime.tv_usec / 1000000.0) );
+}
+
+#endif
+
+}//namespace miniFE
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/outstream.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/outstream.hpp
@@ -0,0 +1,45 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _outstream_hpp_
+#define _outstream_hpp_
+
+#include <fstream>
+#include <sstream>
+
+inline
+std::ostream& outstream(int np=1, int p=0)
+{
+  static bool first = true;
+  static std::ostringstream oss;
+  if (first) oss << "minife_debug."<<np<<"."<<p;
+  static std::ofstream ofs(oss.str().c_str(), std::ios::out);
+  first = false;
+  return ofs;
+}
+
+#endif
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/param_utils.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/param_utils.hpp
@@ -0,0 +1,160 @@
+#ifndef _param_utils_hpp_
+#define _param_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <string>
+#include <sstream>
+
+//Parameter-parsing Utilities:
+//
+//The functions declared below are intended to assist with parsing
+//input-parameters which may be command-line arguments and/or lines in a
+//text file.
+//
+// Scenario: You want your program to accept parameters that are specified
+// as command-line arguments and/or as lines in a text file (such
+// as a YAML output file). i.e., your program can be run like this:
+// % program.exe foo=3.14159 bar: 42
+// or
+// % program.exe input_file=params.txt
+// or
+// % program.exe foo=3.14159 input_file = params.txt
+//
+//Example:
+// Here is example code to obtain parameters using the 3 functions
+// 'read_args_into_string', 'read_file_into_string' and 'parse_parameter':
+//
+//   std::string arg_string;
+//
+//   //put command-line-arguments into 'arg_string':
+//   read_args_into_string(argc, argv, arg_string);
+//
+//   //do the command-line-arguments specify an 'input_file'?
+//   std::string filename =
+//      parse_parameter<std::string>(arg_string,"input_file","none-specified");
+//
+//   if (filename != "none-specified") {
+//     std::string tmp;
+//     read_file_into_string(filename, tmp);
+//     arg_string += tmp;
+//   }
+//
+//  //now parse the parameters:
+//  float foo = parse_parameter<float>(arg_string, "foo", -9.9);
+//  int bar   = parse_parameter<int>(arg_string, "bar", -1);
+//
+//See the comments below for parse_parameter, for formatting requirements of
+//named parameter-value pairs.
+//
+
+namespace Mantevo {
+
+/**
+ * Concatenate command-line arguments into a single string.
+ *
+ * Note: this function is purely serial. If argc and argv have different
+ * values on different MPI processes, then you need to resolve that by
+ * broadcasting arg_string's contents.
+ */
+void read_args_into_string(int argc, char** argv, std::string& arg_string);
+
+/**
+ * Read the contents of a text-file into a single string.
+ *
+ * Note: this function is purely serial. If you want file_contents on multiple
+ * MPI processes, you need to broadcast it (or call this function on each
+ * MPI process...).
+ */
+void read_file_into_string(const std::string& filename,
+                           std::string& file_contents);
+
+/**
+ * Parse a named parameter value from input 'arg_string'.
+ *
+ * Search 'arg_string' for an occurrence of param_name and attempt to parse
+ * a value into the return-type. If param_name is not found, then default_value
+ * is returned.
+ *
+ * Example:
+ * arg_string = "foo = 3.14159";
+ * float foo = parse_parameter<float>(arg_string, "foo", -999.9);
+ * //foo should now contain the value 3.14159; if 'foo' was not found in
+ * //arg_string, then -999.9 would have been returned.
+ *
+ * Other legal name-value separators are ':' and ' '. Extra spaces are also ok,
+ * e.g. "foo : 3.114159".
+ *
+ * Note that if a YAML file is read into a string, that would be a valid input
+ * string for this function.
+ */
+template<typename T>
+T parse_parameter(const std::string& arg_string,
+                const std::string& param_name,
+                const T& default_value)
+{
+  std::string::size_type pos = arg_string.find(param_name);
+  if (pos == std::string::npos) {
+    //if param_name is not found in arg_string, return default_value:
+    return default_value;
+  }
+
+  pos += param_name.size();
+
+  if (arg_string.size() <= pos) return default_value;
+
+  //skip past ' ', '=' or ':':
+  while(pos < arg_string.size() &&
+        (arg_string[pos] == ' ' ||
+         arg_string[pos] == '=' ||
+         arg_string[pos] == ':'))
+  {
+    ++pos;
+  }
+
+  if (arg_string[pos] == '=' || arg_string[pos] == ':') ++pos;
+
+  std::string str = arg_string.substr(pos);
+
+  std::istringstream isstr(str);
+
+  T return_val = default_value;
+
+  //parse value into return_val:
+  isstr >> return_val;
+
+  //if parse failed, return default_value:
+  if (!isstr) return default_value;
+
+  return return_val;
+}
+
+}//namespace Mantevo
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/param_utils.cpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/param_utils.cpp
@@ -0,0 +1,58 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <param_utils.hpp>
+
+#include <sstream>
+#include <fstream>
+
+namespace Mantevo {
+
+//-------------------------------------------------------------
+void read_args_into_string(int argc, char** argv, std::string& arg_string)
+{
+  arg_string = argv[0];
+  for(int i=1; i<argc; ++i) {
+    arg_string += " " + std::string(argv[i]);
+  }
+}
+
+//-------------------------------------------------------------
+void read_file_into_string(const std::string& filename,
+                           std::string& file_contents)
+{
+  file_contents.clear();
+  std::ifstream ifs(filename.c_str());
+  char line[256];
+  while(!ifs.eof()) {
+    ifs.getline(line, 256);
+    file_contents += " " + std::string(line);
+  }
+}
+
+}//namespace Mantevo
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/perform_element_loop.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/perform_element_loop.hpp
@@ -0,0 +1,109 @@
+#ifndef _perform_element_loop_hpp_
+#define _perform_element_loop_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <BoxIterator.hpp>
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <box_utils.hpp>
+#include <Hex8_box_utils.hpp>
+#include <Hex8_ElemData.hpp>
+
+namespace miniFE {
+
+template<typename GlobalOrdinal,
+         typename MatrixType, typename VectorType>
+void
+perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
+                     const Box& local_elem_box,
+                     MatrixType& A, VectorType& b,
+                     Parameters& /*params*/)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+
+  //We will iterate the local-element-box (local portion of the mesh), and
+  //get element-IDs in preparation for later assembling the FE operators
+  //into the global sparse linear-system.
+
+  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
+  std::vector<GlobalOrdinal> elemIDs(num_elems);
+
+  BoxIterator iter = BoxIterator::begin(local_elem_box);
+  BoxIterator end  = BoxIterator::end(local_elem_box);
+
+  for(size_t i=0; iter != end; ++iter, ++i) {
+    elemIDs[i] = get_id<GlobalOrdinal>(global_elems_x, global_elems_y, global_elems_z,
+                                       iter.x, iter.y, iter.z);
+//#ifdef MINIFE_DEBUG
+//std::cout << "elem ID " << elemIDs[i] << " ("<<iter.x<<","<<iter.y<<","<<iter.z<<")"<<std::endl;
+//#endif
+  }
+
+  //Now do the actual finite-element assembly loop:
+
+  ElemData<GlobalOrdinal,Scalar> elem_data;
+
+  compute_gradient_values(elem_data.grad_vals);
+
+  timer_type t_gn = 0, t_ce = 0, t_si = 0;
+  timer_type t0 = 0;
+  for(size_t i=0; i<elemIDs.size(); ++i) {
+    //Given an element-id, populate elem_data with the
+    //element's node_ids and nodal-coords:
+
+    TICK();
+    get_elem_nodes_and_coords(mesh, elemIDs[i], elem_data);
+    TOCK(t_gn);
+
+    //Next compute element-diffusion-matrix and element-source-vector:
+
+    TICK();
+    compute_element_matrix_and_vector(elem_data);
+    TOCK(t_ce);
+
+    //Now assemble the (dense) element-matrix and element-vector into the
+    //global sparse linear system:
+
+    TICK();
+    sum_into_global_linear_system(elem_data, A, b);
+    TOCK(t_si);
+  }
+//std::cout << std::endl<<"get-nodes: " << t_gn << std::endl;
+//std::cout << "compute-elems: " << t_ce << std::endl;
+//std::cout << "sum-in: " << t_si << std::endl;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/simple_mesh_description.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/simple_mesh_description.hpp
@@ -0,0 +1,248 @@
+
+#ifndef _simple_mesh_description_hpp_
+#define _simple_mesh_description_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <utils.hpp>
+#include <set>
+#include <map>
+
+namespace miniFE {
+
+template<typename GlobalOrdinal>
+class simple_mesh_description {
+public:
+  simple_mesh_description(const Box& global_box_in, const Box& local_box_in)
+  {
+   Box local_node_box;
+    for(int i=0; i<3; ++i) {
+      global_box[i][0] = global_box_in[i][0];
+      global_box[i][1] = global_box_in[i][1];
+      local_box[i][0] = local_box_in[i][0];
+      local_box[i][1] = local_box_in[i][1];
+      local_node_box[i][0] = local_box_in[i][0];
+      local_node_box[i][1] = local_box_in[i][1];
+      //num-owned-nodes == num-elems+1 in this dimension if the elem box is not empty
+      //and we are at the high end of the global range in that dimension:
+      if (local_box_in[i][1] > local_box_in[i][0] && local_box_in[i][1] == global_box[i][1]) local_node_box[i][1] += 1;
+    }
+
+    int max_node_x = global_box[0][1]+1;
+    int max_node_y = global_box[1][1]+1;
+    int max_node_z = global_box[2][1]+1;
+    create_map_id_to_row(max_node_x, max_node_y, max_node_z, local_node_box,
+                         map_ids_to_rows);
+
+    //As described in analytic_soln.hpp,
+    //we will impose a 0 boundary-condition on faces x=0, y=0, z=0, y=1, z=1
+    //we will impose a 1 boundary-condition on face x=1
+
+#ifdef MINIFE_DEBUG
+std::cout<<std::endl;
+#endif
+    const int X=0;
+    const int Y=1;
+    const int Z=2;
+
+    const int x1 = max_node_x - 1;
+    const int y1 = max_node_y - 1;
+    const int z1 = max_node_z - 1;
+
+    //if we're on the x=0 face:
+    if (global_box[X][0] == local_box[X][0]) {
+      int miny = local_node_box[Y][0];
+      int minz = local_node_box[Z][0];
+      int maxy = local_node_box[Y][1];
+      int maxz = local_node_box[Z][1];
+      //expand y and z dimensions to include ghost layer
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+
+      for(int iz=minz; iz<maxz; ++iz) {
+        for(int iy=miny; iy<maxy; ++iy) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             0, iy, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"x=0 BC, node "<<nodeID<<", (0,"<<iy<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the y=0 face:
+    if (global_box[Y][0] == local_box[Y][0]) {
+      int minx = local_node_box[X][0];
+      int minz = local_node_box[Z][0];
+      int maxx = local_node_box[X][1];
+      int maxz = local_node_box[Z][1];
+      //expand x and z dimensions to include ghost layer
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+
+      for(int iz=minz; iz<maxz; ++iz) {
+        for(int ix=minx; ix<maxx; ++ix) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, 0, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"y=0 BC, node "<<nodeID<<", ("<<ix<<",0,"<<iz<<")"<<std::endl;
+#endif
+          GlobalOrdinal row = map_id_to_row(nodeID);
+          if (row < 0) {
+            std::cout<<"on the y==0 face (ix="<<ix<<", iz="<<iz<<"), ERROR: found negative row ("<<row<<") for nodeID="<<nodeID<<std::endl;
+          }
+          bc_rows_0.insert(row);
+        }
+      }
+    }
+
+    //if we're on the z=0 face:
+    if (global_box[Z][0] == local_box[Z][0]) {
+      int minx = local_node_box[X][0];
+      int miny = local_node_box[Y][0];
+      int maxx = local_node_box[X][1];
+      int maxy = local_node_box[Y][1];
+      //expand x and y dimensions to include ghost layer
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+
+      for(int iy=miny; iy<maxy; ++iy) {
+        for(int ix=minx; ix<maxx; ++ix) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, iy, 0);
+#ifdef MINIFE_DEBUG
+std::cout<<"z=0 BC, node "<<nodeID<<", ("<<ix<<","<<iy<<",0)"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the x=1 face:
+    if (global_box[X][1] == local_box[X][1]) {
+      int minz = local_node_box[Z][0];
+      int miny = local_node_box[Y][0];
+      int maxz = local_node_box[Z][1];
+      int maxy = local_node_box[Y][1];
+      //expand z and y dimensions to include ghost layer
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+
+      for(int iy=miny; iy<maxy; ++iy) {
+        for(int iz=minz; iz<maxz; ++iz) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             x1, iy, iz);
+          GlobalOrdinal row = map_id_to_row(nodeID);
+#ifdef MINIFE_DEBUG
+std::cout<<"x=1 BC, node "<<nodeID<<", row "<<row<<", ("<<x1<<","<<iy<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_1.insert(row);
+        }
+      }
+    }
+
+    //if we're on the y=1 face:
+    if (global_box[Y][1] == local_box[Y][1]) {
+      int minz = local_node_box[Z][0];
+      int minx = local_node_box[X][0];
+      int maxz = local_node_box[Z][1];
+      int maxx = local_node_box[X][1];
+      //expand z and x dimensions to include ghost layer
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+
+      for(int ix=minx; ix<maxx; ++ix) {
+        for(int iz=minz; iz<maxz; ++iz) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, y1, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"y=1 BC, node "<<nodeID<<", ("<<ix<<","<<y1<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the z=1 face:
+    if (global_box[Z][1] == local_box[Z][1]) {
+      int miny = local_node_box[Y][0];
+      int minx = local_node_box[X][0];
+      int maxy = local_node_box[Y][1];
+      int maxx = local_node_box[X][1];
+      //expand x and y dimensions to include ghost layer
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+
+      for(int ix=minx; ix<maxx; ++ix) {
+        for(int iy=miny; iy<maxy; ++iy) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, iy, z1);
+#ifdef MINIFE_DEBUG
+std::cout<<"z=1 BC, node "<<nodeID<<", ("<<ix<<","<<iy<<","<<z1<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+  }
+
+  GlobalOrdinal map_id_to_row(const GlobalOrdinal& id) const
+  {
+    return find_row_for_id(id, map_ids_to_rows);
+  }
+
+  GlobalOrdinal max_row_in_map() const {
+    if (map_ids_to_rows.empty()) return 0;
+    typename std::map<GlobalOrdinal,GlobalOrdinal>::const_iterator mend = map_ids_to_rows.end();
+    --mend;
+    return mend->second;
+  }
+  std::set<GlobalOrdinal> bc_rows_0;
+  std::set<GlobalOrdinal> bc_rows_1;
+  std::map<GlobalOrdinal,GlobalOrdinal> map_ids_to_rows;
+  Box global_box;
+  Box local_box;
+};//class simple_mesh_description
+
+}//namespace miniFE
+
+#endif
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/time_kernels.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/time_kernels.hpp
@@ -0,0 +1,139 @@
+#ifndef _time_kernels_hpp_
+#define _time_kernels_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+
+#include <Vector_functions.hpp>
+#include <mytimer.hpp>
+
+#ifdef MINIFE_HAVE_CUDA
+#include <cuda.h>
+#endif
+
+namespace miniFE {
+
+template<typename OperatorType,
+         typename VectorType,
+         typename Matvec>
+void
+time_kernels(OperatorType& A,
+             const VectorType& b,
+             VectorType& x,
+             Matvec matvec,
+             typename OperatorType::LocalOrdinalType max_iter,
+             typename OperatorType::ScalarType& xdotp,
+             timer_type* my_kern_times)
+{
+  typedef typename OperatorType::ScalarType ScalarType;
+  typedef typename OperatorType::LocalOrdinalType OrdinalType;
+  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;
+
+  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0;
+
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (!A.has_local_indices) {
+    std::cerr << "miniFE::time_kernels ERROR, A.has_local_indices is false, needs to be true. This probably means "
+       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::time_kernels."
+       << std::endl;
+    return;
+  }
+
+  OrdinalType nrows = A.rows.size();
+  OrdinalType ncols = A.num_cols;
+
+  VectorType p(0, ncols, b.compute_node);
+
+  ScalarType one = 1.0;
+  ScalarType zero = 0.0;
+
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+  ComputeNodeType& compute_node = x.compute_node;
+
+  //The following lines that create and initialize buffers are no-ops in many
+  //cases, but perform actual allocations and copies if a off-cpu device such as
+  //a GPU is being used by compute_node.
+
+  //Do any required allocations for buffers that will be needed during CG:
+  ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size());
+  ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size());
+  OrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
+  OrdinalType* d_Acols   = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
+  ScalarType* d_Acoefs  = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());
+
+  //Copy data to buffers that need to be initialized from input data:
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+  compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b);
+  compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff);
+  compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols);
+  compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs);
+
+  TICK();
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    waxpby(one, x, zero, x, p);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaThreadSynchronize();
+#endif
+  TOCK(tWAXPY);
+
+  TICK();
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    matvec(A, p, x);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaThreadSynchronize();
+#endif
+  TOCK(tMATVEC);
+
+  TICK();
+  xdotp = 0;
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    xdotp += dot(x, p);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaThreadSynchronize();
+#endif
+  TOCK(tDOT);
+
+  my_kern_times[WAXPY] = tWAXPY;
+  my_kern_times[DOT] = tDOT;
+  my_kern_times[MATVEC] = tMATVEC;
+  my_kern_times[TOTAL] = 0;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/utils.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/utils.hpp
@@ -0,0 +1,204 @@
+#ifndef _utils_hpp_
+#define _utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstdlib>
+#include <cmath>
+#include <vector>
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Parameters.hpp>
+
+namespace miniFE {
+
+void get_parameters(int argc, char** argv, Parameters& params);
+
+void broadcast_parameters(Parameters& params);
+
+void initialize_mpi(int argc, char** argv, int& numprocs, int& myproc);
+
+void finalize_mpi();
+
+template<typename Scalar>
+Scalar percentage_difference(Scalar value, Scalar average)
+{
+  //result will be the difference between value and average, represented as
+  //a percentage of average.
+  //Examples:
+  //  if value=100 and average=50, result is 100%
+  //  if value=500 and average=400, result is 25%
+
+  //Note: if average is 0, result is undefined. We'll return -1.0;
+
+  Scalar result = std::abs(value-average);
+  if (std::abs(average) > 1.e-5) {
+    result /= average;
+    result *= 100;
+  }
+  else result = -1;
+
+  return result;
+}
+
+template<typename GlobalOrdinal>
+void get_global_min_max(GlobalOrdinal local_n,
+                        GlobalOrdinal& global_n,
+                        GlobalOrdinal& min_n,
+                        int& min_proc,
+                        GlobalOrdinal& max_n,
+                        int& max_proc)
+{
+//Given a local_n, compute global_n, min/max, etc. All computed results
+//will be returned on all processors.
+//
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  std::vector<GlobalOrdinal> all_n(numprocs, 0);
+  all_n[myproc] = local_n;
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> tmp(all_n);
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allreduce(&tmp[0], &all_n[0], numprocs, mpi_dtype, MPI_MAX, MPI_COMM_WORLD);
+#endif
+
+  global_n = 0;
+  min_n= 5*local_n;
+  min_proc = 0;
+  max_n= 0;
+  max_proc = 0;
+
+  for(int i=0; i<numprocs; ++i) {
+    global_n += all_n[i];
+    //min_proc will be the lowest-numbered proc with n = min_n
+    if (all_n[i] < min_n) {
+      min_n = all_n[i];
+      min_proc = i;
+    }
+    //max_proc will be the highest-numbered proc with n = max_n
+    if (all_n[i] >= max_n) {
+      max_n = all_n[i];
+      max_proc = i;
+    }
+  }
+}
+
+template<typename Scalar>
+Scalar compute_std_dev_as_percentage(Scalar local_nrows,
+                                     Scalar avg_nrows)
+{
+//compute and return a standard deviation for the deviation of local_nrows from the average.
+//the std. dev. will be expressed as a percentage of avg_nrows.
+//
+//Input argument local_nrows is really a integer, but taking it as a floating-point scalar is
+//harmless.
+//
+#ifdef HAVE_MPI
+  int numprocs = 1, myproc = 0;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+  MPI_Datatype mpi_dtype = TypeTraits<Scalar>::mpi_type();
+
+//If it's significantly more efficient, we may consider using MPI_Gather below instead of
+//MPI_Allgather. We really only need to compute std.dev. on proc 0...
+//
+//(But for now, use MPI_Allgather and compute on all procs.)
+
+  std::vector<Scalar> all_nrows(numprocs, 0);
+  MPI_Allgather(&local_nrows, 1, mpi_dtype, &all_nrows[0], 1, mpi_dtype, MPI_COMM_WORLD);
+
+  //turn all_nrows contents into deviations, add to sum-of-squares-of-deviations:
+  Scalar sum_sqr_dev = 0;
+  for(size_t i=0; i<all_nrows.size(); ++i) {
+    all_nrows[i] -= avg_nrows;
+    all_nrows[i] *= all_nrows[i];
+    sum_sqr_dev += all_nrows[i];
+  }
+  Scalar tmp1 = sum_sqr_dev;
+  Scalar std_dev = numprocs>1 ? std::sqrt(tmp1/(numprocs-1)) : 0;
+
+  //std_dev is now the standard deviation of rows-per-processor with respect
+  //to avg_nrows.
+  //Next turn std_dev into a percentage of avg_nrows:
+  std_dev /= avg_nrows;
+  std_dev *= 100;
+  return std_dev;
+#else
+  return 0;
+#endif
+}
+
+template<typename GlobalOrdinal>
+GlobalOrdinal find_row_for_id(GlobalOrdinal id,
+                              const std::map<GlobalOrdinal,GlobalOrdinal>& ids_to_rows)
+{
+  typename std::map<GlobalOrdinal,GlobalOrdinal>::const_iterator
+    iter = ids_to_rows.lower_bound(id);
+
+  if (iter == ids_to_rows.end() || iter->first != id) {
+    if (ids_to_rows.size() > 0) {
+      --iter;
+    }
+    else {
+      std::cout << "ERROR, failed to map id to row."<<std::endl;
+      return -99;
+    }
+  }
+
+  if (iter->first == id) {
+    return iter->second;
+  }
+
+  if (iter == ids_to_rows.begin() && iter->first > id) {
+    std::cout << "ERROR, id:" << id << ", ids_to_rows.begin(): " << iter->first<<std::endl;
+    return -99;
+  }
+
+  GlobalOrdinal offset = id - iter->first;
+
+  if (offset < 0) {
+    std::cout << "ERROR, negative offset in find_row_for_id for id="<<id<<std::endl;
+    return -99;
+  }
+
+  return iter->second + offset;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/utils.cpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/utils.cpp
@@ -0,0 +1,136 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#ifdef MINIFE_HAVE_TPI
+#include <TPI.h>
+#endif
+
+#ifdef MINIFE_HAVE_TBB
+#include <tbb/task_scheduler_init.h>
+#endif
+
+#include <param_utils.hpp>
+#include <Parameters.hpp>
+#include <utils.hpp>
+
+namespace miniFE {
+
+//-------------------------------------------------------------
+void get_parameters(int argc, char** argv, Parameters& params)
+{
+  std::string argstring;
+  Mantevo::read_args_into_string(argc, argv, argstring);
+
+  std::string garbage("garbage");
+  std::string filename =
+      Mantevo::parse_parameter<std::string>(argstring, "input_file", garbage);
+
+  if (filename != garbage) {
+    Mantevo::read_file_into_string(filename, argstring);
+  }
+
+  params.nx = Mantevo::parse_parameter<int>(argstring, "nx", 10);
+  params.ny = Mantevo::parse_parameter<int>(argstring, "ny", params.nx);
+  params.nz = Mantevo::parse_parameter<int>(argstring, "nz", params.ny);
+  params.load_imbalance =
+      Mantevo::parse_parameter<float>(argstring, "load_imbalance", 0);
+  params.numthreads = Mantevo::parse_parameter<int>(argstring, "numthreads", 1);
+  params.mv_overlap_comm_comp = Mantevo::parse_parameter<int>(argstring, "mv_overlap_comm_comp", 0);
+  params.use_locking = Mantevo::parse_parameter<int>(argstring, "use_locking", 0);
+  params.name = Mantevo::parse_parameter<std::string>(argstring, "name","");
+  params.elem_group_size = Mantevo::parse_parameter<int>(argstring, "elem_group_size", 1);
+  params.use_elem_mat_fields = Mantevo::parse_parameter<int>(argstring, "use_elem_mat_fields", 1);
+  params.verify_solution = Mantevo::parse_parameter<int>(argstring, "verify_solution", 0);
+  params.device = Mantevo::parse_parameter<int>(argstring, "device", 0);
+  params.num_devices = Mantevo::parse_parameter<int>(argstring, "num_devices", 2);
+  params.skip_device = Mantevo::parse_parameter<int>(argstring, "skip_device", 9999);
+  params.numa = Mantevo::parse_parameter<int>(argstring, "numa", 1);
+}
+
+//-------------------------------------------------------------
+void broadcast_parameters(Parameters& params)
+{
+#ifdef HAVE_MPI
+  const int num_int_params = 13;
+  int iparams[num_int_params] = {params.nx, params.ny, params.nz, params.numthreads, params.mv_overlap_comm_comp, params.use_locking,
+		     params.elem_group_size, params.use_elem_mat_fields, params.verify_solution,
+		     params.device, params.num_devices,params.skip_device,params.numa};
+  MPI_Bcast(&iparams[0], num_int_params, MPI_INT, 0, MPI_COMM_WORLD);
+  params.nx = iparams[0];
+  params.ny = iparams[1];
+  params.nz = iparams[2];
+  params.numthreads = iparams[3];
+  params.mv_overlap_comm_comp = iparams[4];
+  params.use_locking = iparams[5];
+  params.elem_group_size = iparams[6];
+  params.use_elem_mat_fields = iparams[7];
+  params.verify_solution = iparams[8];
+  params.device = iparams[9];
+  params.num_devices = iparams[10];
+  params.skip_device = iparams[11];
+  params.numa = iparams[12];
+
+  float fparams[1] = {params.load_imbalance};
+  MPI_Bcast(&fparams[0], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
+  params.load_imbalance = fparams[0];
+
+#endif
+}
+
+//-------------------------------------------------------------
+void initialize_mpi(int argc, char** argv, int& numprocs, int& myproc)
+{
+#ifdef HAVE_MPI
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#else
+  numprocs = 1;
+  myproc = 0;
+#endif
+}
+
+//-------------------------------------------------------------
+void finalize_mpi()
+{
+#ifdef HAVE_MPI
+  MPI_Finalize();
+#endif
+}
+
+}//namespace miniFE
+
Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/verify_solution.hpp
===================================================================
--- /dev/null
+++ MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/verify_solution.hpp
@@ -0,0 +1,179 @@
+#ifndef _verify_solution_hpp_
+#define _verify_solution_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <sstream>
+#include <stdexcept>
+#include <map>
+#include <algorithm>
+
+#include <simple_mesh_description.hpp>
+#include <analytic_soln.hpp>
+#include <box_utils.hpp>
+#include <utils.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename Scalar>
+struct err_info {
+  Scalar err;
+  Scalar computed;
+  Scalar analytic;
+  Scalar coords[3];
+};
+
+template<typename VectorType>
+int
+verify_solution(const simple_mesh_description<typename VectorType::GlobalOrdinalType>& mesh,
+                const VectorType& x, double tolerance, bool verify_whole_domain = false)
+{
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename VectorType::ScalarType Scalar;
+
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+  Box box;
+  copy_box(mesh.local_box, box);
+
+  //num-owned-nodes in each dimension is num-elems+1
+  //only if num-elems > 0 in that dimension *and*
+  //we are at the high end of the global range in that dimension:
+  if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1];
+  if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1];
+  if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1];
+
+  std::vector<GlobalOrdinal> rows;
+  std::vector<Scalar> row_coords;
+
+  int roffset = 0;
+  for(int iz=box[2][0]; iz<box[2][1]; ++iz) {
+   for(int iy=box[1][0]; iy<box[1][1]; ++iy) {
+    for(int ix=box[0][0]; ix<box[0][1]; ++ix) {
+      GlobalOrdinal row_id =
+          get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                ix, iy, iz);
+      Scalar x, y, z;
+      get_coords(row_id, global_nodes_x, global_nodes_y, global_nodes_z, x, y, z);
+
+      bool verify_this_point = false;
+      if (verify_whole_domain) verify_this_point = true;
+      else if (std::abs(x - 0.5) < 0.05 && std::abs(y - 0.5) < 0.05 && std::abs(z - 0.5) < 0.05) {
+        verify_this_point = true;
+      }
+
+      if (verify_this_point) {
+        rows.push_back(roffset);
+        row_coords.push_back(x);
+        row_coords.push_back(y);
+        row_coords.push_back(z);
+      }
+
+      ++roffset;
+    }
+   }
+  }
+
+  int return_code = 0;
+
+  const int num_terms = 300;
+
+  err_info<Scalar> max_error;
+  max_error.err = 0.0;
+
+  for(size_t i=0; i<rows.size(); ++i) {
+    Scalar computed_soln = x.coefs[rows[i]];
+    Scalar x = row_coords[i*3];
+    Scalar y = row_coords[i*3+1];
+    Scalar z = row_coords[i*3+2];
+    Scalar analytic_soln = 0.0;
+    //set exact boundary-conditions:
+    if (x == 1.0) {
+      //x==1 is first, we want soln to be 1 even around the edges
+      //of the x==1 plane where y and/or z may be 0 or 1...
+      analytic_soln = 1;
+    }
+    else if (x == 0.0 || y == 0.0 || z == 0.0) {
+      analytic_soln = 0;
+    }
+    else if (y == 1.0 || z == 1.0) {
+      analytic_soln = 0;
+    }
+    else {
+      analytic_soln = soln(x, y, z, num_terms, num_terms);
+    }
+
+#ifdef MINIFE_DEBUG_VERBOSE
+std::cout<<"("<<x<<","<<y<<","<<z<<") row "<<rows[i]<<": computed: "<<computed_soln<<",  analytic: "<<analytic_soln<<std::endl;
+#endif
+    Scalar err = std::abs(analytic_soln - computed_soln);
+    if (err > max_error.err) {
+      max_error.err = err;
+      max_error.computed = computed_soln;
+      max_error.analytic = analytic_soln;
+      max_error.coords[0] = x;
+      max_error.coords[1] = y;
+      max_error.coords[2] = z;
+    }
+  }
+
+  Scalar local_max_err = max_error.err;
+  Scalar global_max_err = 0;
+#ifdef HAVE_MPI
+  MPI_Allreduce(&local_max_err, &global_max_err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+#else
+  global_max_err = local_max_err;
+#endif
+
+  if (local_max_err == global_max_err) {
+    if (max_error.err > tolerance) {
+      std::cout << "max absolute error is "<<max_error.err<<":"<<std::endl;
+      std::cout << "   at position ("<<max_error.coords[0]<<","<<max_error.coords[1]<<","<<max_error.coords[2]<<"), "<<std::endl;
+      std::cout << "   computed solution: "<<max_error.computed<<",  analytic solution: "<<max_error.analytic<<std::endl;
+    }
+    else {
+      std::cout << "solution matches analytic solution to within "<<tolerance<<" or better."<<std::endl;
+    }
+  }
+
+  if (global_max_err > tolerance) {
+    return_code = 1;
+  }
+
+  return return_code;
+}
+
+}//namespace miniFE
+
+#endif
+
Index: MultiSource/Benchmarks/Makefile
===================================================================
--- MultiSource/Benchmarks/Makefile
+++ MultiSource/Benchmarks/Makefile
@@ -19,7 +19,7 @@
 endif
 
 ifndef DISABLE_CXX
-PARALLEL_DIRS += Prolangs-C++ PAQ8p tramp3d-v4 Bullet 7zip
+PARALLEL_DIRS += Prolangs-C++ PAQ8p tramp3d-v4 Bullet 7zip DOE-ProxyApps-C++
 endif
 
 ifndef SMALL_PROBLEM_SIZE