Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
@@ -4,3 +4,4 @@
 add_subdirectory(miniGMG)
 add_subdirectory(RSBench)
 add_subdirectory(SimpleMOC)
+add_subdirectory(CoMD)
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeLists.txt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeLists.txt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeLists.txt
@@ -0,0 +1,6 @@
+list(APPEND CFLAGS -std=c99 -DDOUBLE)
+list(APPEND LDFLAGS -lm)
+
+set(RUN_OPTIONS -e -x 10 -y 10 -z 10 --potDir .)
+llvm_multisource(CoMD)
+llvm_test_data(CoMD Cu_u6.eam) 
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c
@@ -0,0 +1,1107 @@
+/// \file
+/// Main program
+///
+/// \mainpage CoMD: A Classical Molecular Dynamics Mini-app
+///
+/// CoMD is a reference implementation of typical classical molecular
+/// dynamics algorithms and workloads.  It is created and maintained by
+/// The Exascale Co-Design Center for Materials in Extreme Environments
+/// (ExMatEx).  http://codesign.lanl.gov/projects/exmatex.  The
+/// code is intended to serve as a vehicle for co-design by allowing
+/// others to extend and/or reimplement it as needed to test performance of 
+/// new architectures, programming models, etc.
+///
+/// The current version of CoMD is available from:
+/// http://exmatex.github.io/CoMD
+///
+/// To contact the developers of CoMD send email to: exmatex-comd@llnl.gov.
+///
+/// Table of Contents
+/// =================
+///
+/// Click on the links below to browse the CoMD documentation.
+///
+/// \subpage pg_md_basics
+///
+/// \subpage pg_building_comd
+///
+/// \subpage pg_running_comd
+///
+/// \subpage pg_measuring_performance
+///
+/// \subpage pg_problem_selection_and_scaling
+///
+/// \subpage pg_verifying_correctness
+///
+/// \subpage pg_comd_architecture
+///
+/// \subpage pg_optimization_targets
+///
+/// \subpage pg_whats_new
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+
+#include "CoMDTypes.h"
+#include "decomposition.h"
+#include "linkCells.h"
+#include "eam.h"
+#include "ljForce.h"
+#include "initAtoms.h"
+#include "memUtils.h"
+#include "yamlOutput.h"
+#include "parallel.h"
+#include "performanceTimers.h"
+#include "mycommand.h"
+#include "timestep.h"
+#include "constants.h"
+
+#define REDIRECT_OUTPUT 0
+#define   MIN(A,B) ((A) < (B) ? (A) : (B))
+
+static SimFlat* initSimulation(Command cmd);
+static void destroySimulation(SimFlat** ps);
+
+static void initSubsystems(void);
+static void finalizeSubsystems(void);
+
+static BasePotential* initPotential(
+   int doeam, const char* potDir, const char* potName, const char* potType);
+static SpeciesData* initSpecies(BasePotential* pot);
+static Validate* initValidate(SimFlat* s);
+static void validateResult(const Validate* val, SimFlat *sim);
+
+static void sumAtoms(SimFlat* s);
+static void printThings(SimFlat* s, int iStep, double elapsedTime);
+static void printSimulationDataYaml(FILE* file, SimFlat* s);
+static void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]);
+
+
+int main(int argc, char** argv)
+{
+   // Prolog
+   initParallel(&argc, &argv);
+   profileStart(totalTimer);
+   initSubsystems();
+   timestampBarrier("Starting Initialization\n");
+
+#ifdef PRINT_YAML
+   yamlAppInfo(yamlFile);
+#endif
+   yamlAppInfo(screenOut);
+
+   Command cmd = parseCommandLine(argc, argv);
+#ifdef PRINT_YAML
+   printCmdYaml(yamlFile, &cmd);
+#endif
+   printCmdYaml(screenOut, &cmd);
+
+   SimFlat* sim = initSimulation(cmd);
+#ifdef PRINT_YAML
+   printSimulationDataYaml(yamlFile, sim);
+#endif
+   printSimulationDataYaml(screenOut, sim);
+
+   Validate* validate = initValidate(sim); // atom counts, energy
+   timestampBarrier("Initialization Finished\n");
+
+   timestampBarrier("Starting simulation\n");
+
+   // This is the CoMD main loop
+   const int nSteps = sim->nSteps;
+   const int printRate = sim->printRate;
+   int iStep = 0;
+   profileStart(loopTimer);
+   for (; iStep<nSteps;)
+   {
+      startTimer(commReduceTimer);
+      sumAtoms(sim);
+      stopTimer(commReduceTimer);
+
+      printThings(sim, iStep, getElapsedTime(timestepTimer));
+
+      startTimer(timestepTimer);
+      timestep(sim, printRate, sim->dt);
+      stopTimer(timestepTimer);
+
+      iStep += printRate;
+   }
+   profileStop(loopTimer);
+
+   sumAtoms(sim);
+   printThings(sim, iStep, getElapsedTime(timestepTimer));
+   timestampBarrier("Ending simulation\n");
+
+   // Epilog
+   validateResult(validate, sim);
+   profileStop(totalTimer);
+
+#ifdef PRINT_TIMING
+   printPerformanceResults(sim->atoms->nGlobal, sim->printRate);
+   printPerformanceResultsYaml(yamlFile);
+#endif
+
+   destroySimulation(&sim);
+   comdFree(validate);
+   finalizeSubsystems();
+
+   timestampBarrier("CoMD Ending\n");
+   destroyParallel();
+
+   return 0;
+}
+
+/// Initialized the main CoMD data stucture, SimFlat, based on command
+/// line input from the user.  Also performs certain sanity checks on
+/// the input to screen out certain non-sensical inputs.
+///
+/// Simple data members such as the time step dt are initialized
+/// directly, substructures such as the potential, the link cells, the
+/// atoms, etc., are initialized by calling additional initialization
+/// functions (initPotential(), initLinkCells(), initAtoms(), etc.).
+/// Initialization order is set by the natural dependencies of the
+/// substructure such as the atoms need the link cells so the link cells
+/// must be initialized before the atoms.
+SimFlat* initSimulation(Command cmd)
+{
+   SimFlat* sim = comdMalloc(sizeof(SimFlat));
+   sim->nSteps = cmd.nSteps;
+   sim->printRate = cmd.printRate;
+   sim->dt = cmd.dt;
+   sim->domain = NULL;
+   sim->boxes = NULL;
+   sim->atoms = NULL;
+   sim->ePotential = 0.0;
+   sim->eKinetic = 0.0;
+   sim->atomExchange = NULL;
+
+   sim->pot = initPotential(cmd.doeam, cmd.potDir, cmd.potName, cmd.potType);
+   real_t latticeConstant = cmd.lat;
+   if (cmd.lat < 0.0)
+      latticeConstant = sim->pot->lat;
+
+   // ensure input parameters make sense.
+   sanityChecks(cmd, sim->pot->cutoff, latticeConstant, sim->pot->latticeType);
+
+   sim->species = initSpecies(sim->pot);
+
+   real3 globalExtent;
+   globalExtent[0] = cmd.nx * latticeConstant;
+   globalExtent[1] = cmd.ny * latticeConstant;
+   globalExtent[2] = cmd.nz * latticeConstant;
+
+   sim->domain = initDecomposition(
+      cmd.xproc, cmd.yproc, cmd.zproc, globalExtent);
+
+   sim->boxes = initLinkCells(sim->domain, sim->pot->cutoff);
+   sim->atoms = initAtoms(sim->boxes);
+
+   // create lattice with desired temperature and displacement.
+   createFccLattice(cmd.nx, cmd.ny, cmd.nz, latticeConstant, sim);
+   setTemperature(sim, cmd.temperature);
+   randomDisplacements(sim, cmd.initialDelta);
+
+   sim->atomExchange = initAtomHaloExchange(sim->domain, sim->boxes);
+
+   // Forces must be computed before we call the time stepper.
+   startTimer(redistributeTimer);
+   redistributeAtoms(sim);
+   stopTimer(redistributeTimer);
+
+   startTimer(computeForceTimer);
+   computeForce(sim);
+   stopTimer(computeForceTimer);
+
+   kineticEnergy(sim);
+
+   return sim;
+}
+
+/// frees all data associated with *ps and frees *ps
+void destroySimulation(SimFlat** ps)
+{
+   if ( ! ps ) return;
+
+   SimFlat* s = *ps;
+   if ( ! s ) return;
+
+   BasePotential* pot = s->pot;
+   if ( pot) pot->destroy(&pot);
+   destroyLinkCells(&(s->boxes));
+   destroyAtoms(s->atoms);
+   destroyHaloExchange(&(s->atomExchange));
+   comdFree(s->species);
+   comdFree(s->domain);
+   comdFree(s);
+   *ps = NULL;
+
+   return;
+}
+
+void initSubsystems(void)
+{
+#if REDIRECT_OUTPUT
+   freopen("testOut.txt","w",screenOut);
+#endif
+
+#ifdef PRINT_YAML
+   yamlBegin();
+#endif
+}
+
+void finalizeSubsystems(void)
+{
+#if REDIRECT_OUTPUT
+   fclose(screenOut);
+#endif
+
+#ifdef PRINT_YAML
+   yamlEnd();
+#endif
+}
+
+/// decide whether to get LJ or EAM potentials
+BasePotential* initPotential(
+   int doeam, const char* potDir, const char* potName, const char* potType)
+{
+   BasePotential* pot = NULL;
+
+   if (doeam) 
+      pot = initEamPot(potDir, potName, potType);
+   else 
+      pot = initLjPot();
+   assert(pot);
+   return pot;
+}
+
+SpeciesData* initSpecies(BasePotential* pot)
+{
+   SpeciesData* species = comdMalloc(sizeof(SpeciesData));
+
+   strcpy(species->name, pot->name);
+   species->atomicNo = pot->atomicNo;
+   species->mass = pot->mass;
+
+   return species;
+}
+
+Validate* initValidate(SimFlat* sim)
+{
+   sumAtoms(sim);
+   Validate* val = comdMalloc(sizeof(Validate));
+   val->eTot0 = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
+   val->nAtoms0 = sim->atoms->nGlobal;
+
+   if (printRank())
+   {
+      fprintf(screenOut, "\n");
+      printSeparator(screenOut);
+      fprintf(screenOut, "Initial energy : %14.12f, atom count : %d \n", 
+            val->eTot0, val->nAtoms0);
+      fprintf(screenOut, "\n");
+   }
+   return val;
+}
+
+void validateResult(const Validate* val, SimFlat* sim)
+{
+   if (printRank())
+   {
+      real_t eFinal = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
+
+      int nAtomsDelta = (sim->atoms->nGlobal - val->nAtoms0);
+
+      fprintf(screenOut, "\n");
+      fprintf(screenOut, "\n");
+      fprintf(screenOut, "Simulation Validation:\n");
+
+      fprintf(screenOut, "  Initial energy  : %14.12f\n", val->eTot0);
+      fprintf(screenOut, "  Final energy    : %14.12f\n", eFinal);
+      fprintf(screenOut, "  eFinal/eInitial : %f\n", eFinal/val->eTot0);
+      if ( nAtomsDelta == 0)
+      {
+         fprintf(screenOut, "  Final atom count : %d, no atoms lost\n",
+               sim->atoms->nGlobal);
+      }
+      else
+      {
+         fprintf(screenOut, "#############################\n");
+         fprintf(screenOut, "# WARNING: %6d atoms lost #\n", nAtomsDelta);
+         fprintf(screenOut, "#############################\n");
+      }
+   }
+}
+
+void sumAtoms(SimFlat* s)
+{
+   // sum atoms across all processers
+   s->atoms->nLocal = 0;
+   for (int i = 0; i < s->boxes->nLocalBoxes; i++)
+   {
+      s->atoms->nLocal += s->boxes->nAtoms[i];
+   }
+
+   startTimer(commReduceTimer);
+   addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1);
+   stopTimer(commReduceTimer);
+}
+
+/// Prints current time, energy, performance etc to monitor the state of
+/// the running simulation.  Performance per atom is scaled by the
+/// number of local atoms per process this should give consistent timing
+/// assuming reasonable load balance
+void printThings(SimFlat* s, int iStep, double elapsedTime)
+{
+   // keep track previous value of iStep so we can calculate number of steps.
+   static int iStepPrev = -1; 
+   static int firstCall = 1;
+
+   int nEval = iStep - iStepPrev; // gives nEval = 1 for zeroth step.
+   iStepPrev = iStep;
+   
+   if (! printRank() )
+      return;
+
+   if (firstCall)
+   {
+      firstCall = 0;
+      fprintf(screenOut, 
+       "#                                                                                         Performance\n" 
+       "#  Loop   Time(fs)       Total Energy   Potential Energy     Kinetic Energy  Temperature   (us/atom)     # Atoms\n");
+      fflush(screenOut);
+   }
+
+   real_t time = iStep*s->dt;
+   real_t eTotal = (s->ePotential+s->eKinetic) / s->atoms->nGlobal;
+   real_t eK = s->eKinetic / s->atoms->nGlobal;
+   real_t eU = s->ePotential / s->atoms->nGlobal;
+   real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5);
+
+   double timePerAtom = 1.0e6*elapsedTime/(double)(nEval*s->atoms->nLocal);
+#ifndef PRINT_TIMING
+   timePerAtom = 0.0;
+#endif
+
+   fprintf(screenOut, " %6d %10.2f %18.12f %18.12f %18.12f %12.4f %10.4f %12d\n",
+           iStep, time, eTotal, eU, eK, Temp, timePerAtom, s->atoms->nGlobal);
+}
+
+/// Print information about the simulation in a format that is (mostly)
+/// YAML compliant.
+void printSimulationDataYaml(FILE* file, SimFlat* s)
+{
+   // All ranks get maxOccupancy
+   int maxOcc = maxOccupancy(s->boxes);
+
+   // Only rank 0 prints
+   if (! printRank())
+      return;
+   
+   fprintf(file,"Simulation data: \n");
+   fprintf(file,"  Total atoms        : %d\n", 
+           s->atoms->nGlobal);
+   fprintf(file,"  Min global bounds  : [ %14.10f, %14.10f, %14.10f ]\n",
+           s->domain->globalMin[0], s->domain->globalMin[1], s->domain->globalMin[2]);
+   fprintf(file,"  Max global bounds  : [ %14.10f, %14.10f, %14.10f ]\n",
+           s->domain->globalMax[0], s->domain->globalMax[1], s->domain->globalMax[2]);
+   printSeparator(file);
+   fprintf(file,"Decomposition data: \n");
+   fprintf(file,"  Processors         : %6d,%6d,%6d\n", 
+           s->domain->procGrid[0], s->domain->procGrid[1], s->domain->procGrid[2]);
+   fprintf(file,"  Local boxes        : %6d,%6d,%6d = %8d\n", 
+           s->boxes->gridSize[0], s->boxes->gridSize[1], s->boxes->gridSize[2], 
+           s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]);
+   fprintf(file,"  Box size           : [ %14.10f, %14.10f, %14.10f ]\n", 
+           s->boxes->boxSize[0], s->boxes->boxSize[1], s->boxes->boxSize[2]);
+   fprintf(file,"  Box factor         : [ %14.10f, %14.10f, %14.10f ] \n", 
+           s->boxes->boxSize[0]/s->pot->cutoff,
+           s->boxes->boxSize[1]/s->pot->cutoff,
+           s->boxes->boxSize[2]/s->pot->cutoff);
+   fprintf(file, "  Max Link Cell Occupancy: %d of %d\n",
+           maxOcc, MAXATOMS);
+   printSeparator(file);
+   fprintf(file,"Potential data: \n");
+   s->pot->print(file, s->pot);
+   
+   // Memory footprint diagnostics
+   int perAtomSize = 10*sizeof(real_t)+2*sizeof(int);
+   float mbPerAtom = perAtomSize/1024/1024;
+   float totalMemLocal = (float)(perAtomSize*s->atoms->nLocal)/1024/1024;
+   float totalMemGlobal = (float)(perAtomSize*s->atoms->nGlobal)/1024/1024;
+
+   int nLocalBoxes = s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2];
+   int nTotalBoxes = (s->boxes->gridSize[0]+2)*(s->boxes->gridSize[1]+2)*(s->boxes->gridSize[2]+2);
+   float paddedMemLocal = (float) nLocalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
+   float paddedMemTotal = (float) nTotalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
+
+   printSeparator(file);
+   fprintf(file,"Memory data: \n");
+   fprintf(file, "  Intrinsic atom footprint = %4d B/atom \n", perAtomSize);
+   fprintf(file, "  Total atom footprint     = %7.3f MB (%6.2f MB/node)\n", totalMemGlobal, totalMemLocal);
+   fprintf(file, "  Link cell atom footprint = %7.3f MB/node\n", paddedMemLocal);
+   fprintf(file, "  Link cell atom footprint = %7.3f MB/node (including halo cell data\n", paddedMemTotal);
+
+   fflush(file);      
+}
+
+/// Check that the user input meets certain criteria.
+void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8])
+{
+   int failCode = 0;
+
+   // Check that domain grid matches number of ranks. (fail code 1)
+   int nProcs = cmd.xproc * cmd.yproc * cmd.zproc;
+   if (nProcs != getNRanks())
+   {
+      failCode |= 1;
+      if (printRank() )
+         fprintf(screenOut,
+                 "\nNumber of MPI ranks must match xproc * yproc * zproc\n");
+   }
+
+   // Check whether simuation is too small (fail code 2)
+   double minx = 2*cutoff*cmd.xproc;
+   double miny = 2*cutoff*cmd.yproc;
+   double minz = 2*cutoff*cmd.zproc;
+   double sizex = cmd.nx*latticeConst;
+   double sizey = cmd.ny*latticeConst;
+   double sizez = cmd.nz*latticeConst;
+
+   if ( sizex < minx || sizey < miny || sizez < minz)
+   {
+      failCode |= 2;
+      if (printRank())
+         fprintf(screenOut,"\nSimulation too small.\n"
+                 "  Increase the number of unit cells to make the simulation\n"
+                 "  at least (%3.2f, %3.2f. %3.2f) Ansgstroms in size\n",
+                 minx, miny, minz);
+   }
+
+   // Check for supported lattice structure (fail code 4)
+   if (strcasecmp(latticeType, "FCC") != 0)
+   {
+      failCode |= 4;
+      if ( printRank() )
+         fprintf(screenOut,
+                 "\nOnly FCC Lattice type supported, not %s. Fatal Error.\n",
+                 latticeType);
+   }
+   int checkCode = failCode;
+   bcastParallel(&checkCode, sizeof(int), 0);
+   // This assertion can only fail if different tasks failed different
+   // sanity checks.  That should not be possible.
+   assert(checkCode == failCode);
+      
+   if (failCode != 0)
+      exit(failCode);
+}
+
+// --------------------------------------------------------------
+
+
+/// \page pg_building_comd Building CoMD
+///
+/// CoMD is written with portability in mind and should compile using
+/// practically any compiler that implements the C99 standard.  You will
+/// need to create a Makefile by copying the sample provided with the
+/// distribution (Makefile.vanilla).
+/// 
+///     $ cp Makefile.vanilla Makefile
+///
+/// and use the make command to build the code
+/// 
+///    $ make
+///
+/// The sample Makefile will compile the code on many platforms.  See
+/// comments in Makefile.vanilla for information about specifying the
+/// name of the C compiler, and/or additional compiler switches that
+/// might be necessary for your platform.
+/// 
+/// The main options available in the Makefile are toggling single/double 
+/// precision and enabling/disabling MPI. In the event MPI is not
+/// available, setting the DO_MPI flag to OFF will create a purely
+/// serial build (you will likely also need to change the setting of
+/// CC).
+/// 
+/// The makefile should handle all the dependency checking needed, via
+/// makedepend.
+/// 
+/// 'make clean' removes the object and dependency files. 
+/// 
+/// 'make distclean' additionally removes the executable file and the
+/// documentation files.
+/// 
+/// Other build options
+/// -------------------
+///
+/// Various other options are made available by \#define arguments within 
+/// some of the source files. 
+///
+/// #REDIRECT_OUTPUT in CoMD.c
+///
+/// Setting this to 1 will redirect all screen output to a file,
+/// currently set to 'testOut.txt'.
+///
+/// #POT_SHIFT in ljForce.c
+///
+/// This is set to 1.0 by default, and shifts the values of the cohesive
+/// energy given by the Lennard-Jones potential so it is zero at the
+/// cutoff radius.  This setting improves energy conservation
+/// step-to-step as it reduces the noise generated by atoms crossing the
+/// cutoff threshold. However, it does not affect the long-term energy
+/// conservation of the code.
+///
+/// #MAXATOMS in linkCells.h
+/// 
+/// The default value is 64, which allows ample padding of the linkCell
+/// structure to allow for density fluctuations. Reducing it may improve
+/// the efficiency of the code via improved thread utilization and
+/// reduced memory footprint.
+
+// --------------------------------------------------------------
+
+
+// --------------------------------------------------------------
+
+
+/// \page pg_measuring_performance Measuring Performance
+///
+/// CoMD implements a simple and extensible system of internal timers to
+/// measure the performance profile of the code.  As explained in
+/// performanceTimers.c, it is easy to create additional timers and
+/// associate them with code regions of specific interest.  In addition,
+/// the getTime() and getTick() functions can be easily reimplemented to
+/// take advantage of platform specific timing resources.
+///
+/// A timing report is printed at the end of each simulation. 
+///
+/// ~~~~
+/// Timings for Rank 0
+///         Timer        # Calls    Avg/Call (s)   Total (s)    % Loop
+/// ___________________________________________________________________
+/// total                      1      50.6701       50.6701      100.04
+/// loop                       1      50.6505       50.6505      100.00
+/// timestep                   1      50.6505       50.6505      100.00
+///   position             10000       0.0000        0.0441        0.09
+///   velocity             20000       0.0000        0.0388        0.08
+///   redistribute         10001       0.0003        3.4842        6.88
+///     atomHalo           10001       0.0002        2.4577        4.85
+///   force                10001       0.0047       47.0856       92.96
+///     eamHalo            10001       0.0001        1.0592        2.09
+/// commHalo               60006       0.0000        1.7550        3.46
+/// commReduce                12       0.0000        0.0003        0.00
+/// 
+/// Timing Statistics Across 8 Ranks:
+///         Timer        Rank: Min(s)       Rank: Max(s)      Avg(s)    Stdev(s)
+/// _____________________________________________________________________________
+/// total                3:   50.6697       0:   50.6701     50.6699      0.0001
+/// loop                 0:   50.6505       4:   50.6505     50.6505      0.0000
+/// timestep             0:   50.6505       4:   50.6505     50.6505      0.0000
+///   position           2:    0.0437       0:    0.0441      0.0439      0.0001
+///   velocity           2:    0.0380       4:    0.0392      0.0385      0.0004
+///   redistribute       0:    3.4842       1:    3.7085      3.6015      0.0622
+///     atomHalo         0:    2.4577       7:    2.6441      2.5780      0.0549
+///   force              1:   46.8624       0:   47.0856     46.9689      0.0619
+///     eamHalo          3:    0.2269       6:    1.2936      1.0951      0.3344
+/// commHalo             3:    1.0803       6:    2.1856      1.9363      0.3462
+/// commReduce           6:    0.0002       2:    0.0003      0.0003      0.0000
+/// 
+/// ---------------------------------------------------
+///  Average atom update rate:   9.39 us/atom/task
+/// ---------------------------------------------------
+///
+/// ~~~~
+/// This report consists of two blocks.  The upper block lists the absolute
+/// wall clock time spent in each timer on rank 0 of the job.  The lower
+/// block reports minimum, maximum, average, and standard deviation of
+/// times across all tasks.
+/// The ranks where the minimum and maximum values occured are also reported
+/// to aid in identifying hotspots or load imbalances.
+///
+/// The last line of the report gives the atom update rate in
+/// microseconds/atom/task.  Since this quantity is normalized by both
+/// the number of atoms and the number of tasks it provides a simple
+/// figure of merit to compare performance between runs with different
+/// numbers of atoms and different numbers of tasks.  Any increase in
+/// this number relative to a large number of atoms on a single task
+/// represents a loss of parallel efficiency.
+/// 
+/// Choosing the problem size correctly has important implications for the 
+/// reported performance. Small problem sizes may run entirely in the cache 
+/// of some architectures, leading to very good performance results. 
+/// For general characterization of performance, it is probably best to 
+/// choose problem sizes which force the code to access main memory, even
+/// though there may be strong scaling scenarios where the code is indeed 
+/// running mainly in cache.
+///
+/// *** Architecture/Configuration for above timing numbers:
+/// SGI XE1300 cluster with dual-socket Intel quad-core Nehalem processors. 
+/// Each node has 2 Quad-Core Xeon X5550 processors runnning at 2.66 GHz
+/// with 3 GB of memory per core.
+
+// --------------------------------------------------------------
+
+
+/// \page pg_problem_selection_and_scaling Problem Selection and Scaling
+///
+/// CoMD is a reference molecular dynamics simulation code as used in
+/// materials science.
+///
+/// Problem Specification  {#sec_problem_spec}
+/// ======================
+///
+/// The reference problem is solid Copper starting from a face-centered
+/// cubic (FCC) lattice.  The initial thermodynamic conditions
+/// (Temperature and Volume (via the lattice spacing, lat))can be specified
+/// from the command line input. The default is 600 K and standard
+/// volume (lat = 3.615 Angstroms).  
+/// Different temperatures (e.g. T =3000K) and volumes can be
+/// specified to melt the system and enhance the interchange of atoms
+/// between domains.
+///
+/// The dynamics is micro-canonical (NVE = constant Number of atoms,
+/// constant total system Volume, and constant total system Energy). As
+/// a result, the temperature is not fixed. Rather, the temperature will
+/// adjust from the initial temperature (as specified on the command line)
+/// to a final temperature as the total system kinetic energy comes into
+/// equilibrium with the total system potential energy.
+///
+/// The total size of the problem (number of atoms) is specified by the
+/// number (nx, ny, nz) of FCC unit cells in the x, y, z directions: nAtoms
+/// = 4 * nx * ny * nz. The default size is nx = ny = nz = 20 or 32,000 atoms.
+///
+/// The simulation models bulk copper by replicating itself in every
+/// direction using periodic boundary conditions.
+///
+/// Two interatomic force models are available: the Lennard-Jones (LJ)
+/// two-body potential (ljForce.c) and the many-body Embedded-Atom Model (EAM)
+/// potential (eam.c). The LJ potential is included for comparison and
+/// is a valid approximation for constant volume and uniform
+/// density. The EAM potential is a more accurate model of cohesion in
+/// simple metals like Copper and includes the energetics necessary to
+/// model non-uniform density and free surfaces.
+///
+/// Scaling Studies in CoMD  {#sec_scaling_studies}
+/// =======================
+///
+/// CoMD implements a simple geometric domain decomposition to divide
+/// the total problem space into domains, which are owned by MPI
+/// ranks. Each domain is a single-program multiple data (SPMD)
+/// partition of the larger problem.
+///
+/// Caution: When doing scaling studies, it is important to distinguish
+/// between the problem setup phase and the problem execution phase. Both
+/// are important to the workflow of doing molecular dynamics, but it
+/// is the execution phase we want to quantify in the scaling studies
+/// described below, for that dominates the execution time for long runs
+/// (millions of time steps). The problem setup can be an appreciable fraction
+/// of the execution time for short runs (the default is 100 time steps)
+/// and erroneous conclusions drawn.
+///
+/// This code is configured with timers. The times are reported per particle
+/// and the timers for the force calculation, timestep, etc start after the
+/// initialization phase is done.
+///
+/// Weak Scaling  {#ssec_weak_scaling}
+/// -----------
+///
+/// A weak scaling test fixes the amount of work per processor and
+/// compares the execution time over number of processors. Weak scaling
+/// keeps the ratio of inter-processor communication (surface) to
+/// intra-processor work (volume) fixed. The amount of inter-processor
+/// work scales with the number of processors in the domain and O(1000)
+/// atoms per domain are needed for reasonable performance.
+///
+/// Examples,
+///
+/// - Increase in processor count by 8: <br>
+///    (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=40)
+///
+/// - Increase in processor count by 2: <br>
+///    (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=20, nz=40)
+///
+/// In general, it is wise to keep the ratio of processor count to
+/// system size in each direction fixed (i.e. cubic domains): xproc_0 / nx_0 = xproc_1 /
+/// nx_1, since this minimizes surface area to volume. 
+/// Feel free to experiment, you might learn something about
+/// algorithms to optimize communication relative to work.
+///
+/// Strong Scaling {#ssec_strong_scaling}
+/// ---------------
+///
+/// A strong scaling test fixes the total problem size and compares the
+/// execution time for different numbers of processors. Strong scaling
+/// increases the ratio of inter-processor communication (surface) to
+/// intra-processor work (volume).
+///
+/// Examples,
+///
+/// - Increase in processor count by 8: <br>
+///    (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=20)
+///
+/// - Increase in processor count by 2: <br>
+///    (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=nz=20)
+///
+/// The domain decomposition requires O(1000) atoms per domain and
+/// begins to scale poorly for small numbers of atoms per domain. 
+/// Again, feel free to experiment, you might learn something here as
+/// well.  For example, when molecular dynamics codes were written for
+/// vector supercomputers, large lists of force pairs were created for
+/// the vector processor. These force lists provide a natural force
+/// decomposition for early parallel computers (Fast Parallel Algorithms
+/// for Short-Range Molecular Dynamics, S. J. Plimpton, J Comp Phys,
+/// 117, 1-19 (1995).) Using replicated data, force decomposition can
+/// scale to fewer than one atom per processor and is a natural
+/// mechanism to exploit intra-processor parallelism.
+///
+/// For further details see for example:
+/// https://support.scinet.utoronto.ca/wiki/index.php/Introduction_To_Performance
+
+
+// --------------------------------------------------------------
+
+
+/// \page pg_verifying_correctness Verifying Correctness
+///
+/// Verifying the correctness of an MD simulation is challenging.
+/// Because MD is Lyapunov unstable, any small errors, even harmless
+/// round-off errors, will lead to a long-term divergence in the atom
+/// trajectories.  Hence, comparing atom positions at the end of a run
+/// is not always a useful verification technique.  (Such divergences
+/// are not a problem for science applications of MD since they do not
+/// alter the statistical physics.)  Small, single-particle errors can
+/// also be difficult to detect in system-wide quantities such as the
+/// kinetic or potential energy that are averaged over a large number of
+/// particles.
+///
+/// In spite of these challenges, there are several methods which are
+/// likely to catch significant errors.
+///
+/// Cohesive Energy {#sec_ver_cohesive_energy}
+/// ===============
+///
+/// With a perfect lattice as the initial structure (this is the
+/// default), the potential energy per atom is the cohesive energy.
+/// This value should be computed correctly to many decimal places.  Any
+/// variation beyond the last 1 or 2 decimal places is cause for
+/// investigation.  The correct values for the cohesive energy are
+///
+/// | Potential      | Cohesive Energy |
+/// | :------------- | :-------------- |
+/// | Lennard-Jones  | -1.243619295058 |
+/// | EAM (Adams)    | -3.538079224691 |
+/// | EAM (Mishin)   | -3.539999969176 |
+///
+/// The \link sec_command_line_options command
+/// line options \endlink documentation explains the switches used to
+/// select the potential used in the simulation.
+///
+/// Note that the cohesive energy calculation is not sensitive to errors
+/// in forces.  It is also performed on a highly symmetric structure so
+/// there are many errors this will not catch.  Still, it is a good
+/// first check.
+///
+/// Energy Conservation {#sec_ver_energy_conservation}
+/// ===================
+///
+/// A correctly implemented force kernel, with an appropriate time step
+/// (the default value of 1 fs is conservative for temperatures under
+/// 10,000K) will conserve total energy over long times to 5 or more
+/// digits.  Any long term systematic drift in the total energy is a
+/// cause for concern.
+///
+/// To facilitate checking energy conservation CoMD prints the final and
+/// initial values of the total energy.  When comparing these values, pay
+/// careful attention to these details:
+///
+/// - It is common to observe an initial transient change in the total
+///   energy.  Differences in the total energy of 2-3% can be expected in
+///   the first 10-100 time steps.
+/// - The best way to check energy conservation is to run at least
+///   several thousand steps and look at the slope of the total energy
+///   ignoring at least the first one or two thousand steps.  More steps
+///   are even better.
+/// - Set the temperature to at least several hundred K.  This ensures
+///   that atoms will sample a large range of configurations and expose
+///   possible errors.
+/// - Fluctuations in the energy can make it difficult to tell if
+///   conservation is observed.  Increasing the number of atoms will reduce
+///   the fluctuations.
+/// 
+///
+/// Particle Conservation {#sec_ver_particle_conservation}
+/// =====================
+///
+/// The simulation should always end with the same number of particles
+/// it started with.  Any change is a bug.  CoMD checks the initial and
+/// final number of particles and prints a warning at the end of the
+/// simulation if they are not equal.
+///
+/// Reproducibility {#sec_ver_reproducibility}
+/// ===============
+///
+/// The same simulation run repeatedly on the same hardware should
+/// produce the same result.  Because parallel computing can add
+/// elements of non-determinism we do not expect perfect long term
+/// reproducibility, however over a few hundred to a few thousand time
+/// steps the energies should not exhibit run-to-run differences outside
+/// the last 1 or 2 decimal places.  Larger differences are a sign of
+/// trouble and should be investigated.  This kind of test is
+/// practically the only way to detect race conditions in shared memory
+/// parallelism.
+///
+/// Portability {#sec_ver_portability}
+/// ===========
+///
+/// In our experience, simulations that start from the same initial
+/// condition tend to produce very similar trajectories over short terms
+/// (100 to 1000 time step), even on different hardware platforms.
+/// Short term differences beyond the last 1 or 2 decimal places should
+/// likely be investigated.
+///
+/// General Principles {#sec_ver_general}
+/// =======================
+///
+/// - Simulations run at 0K are too trivial for verification, set
+///   the initial temperature to at least several hundred K.
+/// - Longer runs are better to check conservation.  Compare
+///   energies after initial transients are damped out.
+/// - Larger runs are better to check conservation.  Fluctuations in the
+///   energy are averaged out.
+/// - Short term (order 100 time steps) discrepancies from run-to-run
+///   or platform-to platform beyond the last one or two decimal places
+///   are reason for concern.  Differences in 4th or 5th decimal place
+///   is almost certainly a bug.
+/// - Contact the CoMD developers (exmatex-comd@llnl.gov) if you have
+///   questions about validation.
+///
+
+// --------------------------------------------------------------
+
+
+/// \page pg_comd_architecture CoMD Architecture
+///
+/// Program Flow {#sec_program_flow}
+/// ============
+///
+/// We have attempted to make the program flow in CoMD 1.1 as simple and
+/// transparent as possible.  The main program consists of three blocks:
+/// prolog, main loop, and epilog.
+///
+/// Prolog {#ssec_flow_prolog}
+/// -------
+///
+/// The job of the prolog is to initialize the simulation and prepare
+/// for the main loop.  Notable tasks in the prolog include calling
+/// - initParallel() to start MPI
+/// - parseCommandLine() to read the command line options
+/// - initSimulation() to initialize the main data structure, SimFlatSt.
+///   This includes tasks such as
+///   - initEamPot() to read tabular data for the potential function
+///   - initDecomposition() to set up the domain decomposition
+///   - createFccLattice() to generate an initial structure for the atoms
+/// - initValidate() to store initial data for a simple validation check
+///
+/// In CoMD 1.1 all atomic structures are internally generated so
+/// there is no need to read large files with atom coordinate data.
+///
+/// Main Loop {#ssec_flow_main_loop}
+/// ---------
+///
+/// The main loop calls
+/// - timestep(), the integrator to update particle positions,
+/// - printThings() to periodically prints simulation information
+///
+/// The timestep() function is the heart of the code as it choreographs
+/// updating the particle positions, along with computing forces
+/// (computeForce()) and communicating atoms between ranks
+/// (redistributeAtoms()).
+///
+/// Epilog {#ssec_flow_epilog}
+/// -------
+///
+/// The epilog code handles end of run bookkeeping such as
+/// - validateResult() to check validation
+/// - printPerformanceResults() to print a performance summary 
+/// - destroySimulation() to free memory
+///
+/// Key Data Structures {#sec_key_data_structures}
+/// ==================
+///
+/// Practically all data in CoMD belongs to the SimFlatSt structure.
+/// This includes:
+/// - BasePotentialSt A polymorphic structure for the potential model
+/// - HaloExchangeSt A polymorphic strcuture for communication halo data
+/// - DomainSt The parallel domain decomposition
+/// - LinkCellSt The link cells
+/// - AtomsSt The atom coordinates and velocities
+/// - SpeciesDataSt Properties of the atomic species being simulated.
+///
+/// Consult the individual pages for each of these structures to learn
+/// more.  The descriptions in haloExchange.c and initLinkCells() are
+/// especially useful to understand how the atoms are commuicated
+/// between tasks and stored in link cells for fast pair finding.
+
+// --------------------------------------------------------------
+
+
+/// \page pg_optimization_targets Optimization Targets
+///
+/// Computation {#sec_computation}
+/// ============
+///
+/// The computational effort of classical MD is usually highly focused
+/// in the force kernel.  The two force kernels supplied by CoMD are
+/// eamForce() and ljForce().  Both kernels are fundamentally loops over
+/// pairs of atoms with significant opportunity to exploit high levels
+/// of concurrency.  One potential challenge when reordering or
+/// parallelizing the pair loop structure is preventing race conditions
+/// that result if two concurrent pair evaluations try to simultaneously
+/// increment the forces and energies on the same atom.
+///
+/// The supplied EAM kernel uses interpolation from tabular data to
+/// evaluate functions.  Hence the interpolate() function is another
+/// potential optimization target.  Note that the two potential files
+/// distributed with CoMD have very different sizes.  The Adams
+/// potential (Cu_u6.eam) has 500 points per function in the table while
+/// the Mishin potential (Cu01.eam.alloy) has 10,000 points per
+/// function.  This difference could potentially impact important
+/// details such as cache miss rates.
+///
+/// Communication {#sec_communication}
+/// =============
+///  
+/// As the number of atoms per MPI rank decreases, the communication
+/// routines will start to require a significant fraction of the
+/// run time.  The main communication routine in CoMD is haloExchange().
+/// The halo exchange is simple nearest neighbor, point-to-point
+/// communication so it should scale well to practically any number of
+/// nodes.
+///
+/// The halo exchange in CoMD 1.1 is a very simple 6-direction
+/// structured halo exchange (see haloExchange.c).  Other exchange
+/// algorithms can be implemented without much difficulty.
+///
+/// The halo exchange function is called in two very different contexts.
+/// The main usage is to exchange halo particle information (see
+/// initAtomHaloExchange()).  This process is coordinated by the
+/// redistributeAtoms() function.
+///
+/// In addition to the atom exchange, when using the EAM potential, a
+/// halo exchange is performed in the force routine (see
+/// initForceHaloExchange()).
+
+
+// --------------------------------------------------------------
+
+
+/// \page pg_whats_new New Features and Changes in CoMD 1.1
+///
+/// The main goals of the 1.1 release were to add support for MPI and to
+/// improve the structure and clarity of the code.  Achieving these
+/// goals required considerable changes compared to the 1.0 release.
+/// However, the core structure of the most computationally intensive
+/// kernels (the force routines) is mostly unchanged.  We believe that
+/// lessons learned from optimizing 1.0 force kernels to specific
+/// hardware or programming models can be quickly transferred to kernels
+/// in the 1.1 release.
+///
+/// Significant changes in CoMD 1.1 include:
+///
+/// - MPI support.  Both MPI and single node serial executables can be
+///   built from the same source files.
+///
+/// - Improved modularity and code clarity.  Major data structures are
+///   now organized with their own structs and initialization routines.
+///
+/// - The build system has been simplified to use only standard
+///   Makefiles instead of CMake.
+///
+/// - The halo exchange operation needed to communicate remote particle
+///   data between MPI ranks also creates "image" particles in the
+///   serial build.
+///
+/// - Unified force kernels for both serial and MPI builds
+///
+///   - The addition of remote/image atoms allows periodic boundary
+///     conditions to be handled outside the force loop.
+///
+///   - An additional communication/data copy step to handle electron
+///     density on remote/image atoms has been added to the EAM force
+///     loop.
+///
+/// - The coordinate system has been simplified to a single global
+///   coordinate system for all particles.
+///
+/// - Evaluation of energies and forces using a Chebyshev polynomial
+///   fits has been removed.  Polynomial approximation of energies and
+///   forces will return in a future CoMD version.
+///
+/// - Atomic structures are now generated internally, eliminating the
+///   requirement to read, write, and distribute large atom
+///   configuration files.  Arbitrarily large initial structures can
+///   be generated with specified initial temperature and random
+///   displacements from lattice positions.  Code to read/write atomic
+///   positions has been removed.
+///
+/// - EAM potentials are now read from standard funcfl and setfl format
+///   files.  Voter style files are no longer supported.
+///
+/// - Collection of performance metrics is significantly improved.
+///   Developers can easily add new timers to regions of interest.  The
+///   system is also designed to allow easy integration with platform
+///   specific API's to high resolution timers, cycle counters,
+///   hardware counters, etc.
+///
+///
+/// - Hooks to in-situ analysis and visualization have been removed.
+///   In-situ analysis capabilities will return in a future CoMD release.
+///
+/// Please contact the CoMD developers (exematex-comd@llnl.gov) if
+/// any of the deleted features negative impacts your work.  We
+/// may be able to help produce a custom version that includes the code
+/// you need.
+
+
+// --------------------------------------------------------------
+
+
+/// \page pg_md_basics MD Basics
+///
+/// The molecular dynamics (MD) computer simulation method is a well
+/// established and important tool for the study of the dynamical
+/// properties of liquids, solids, and other systems of interest in
+/// Materials Science and Engineering, Chemistry and Biology. A material
+/// is represented in terms of atoms and molecules. The method of MD
+/// simulation involves the evaluation of the force acting on each atom
+/// due to all other atoms in the system and the numerical integration
+/// of the Newtonian equations of motion. Though MD was initially
+/// developed to compute the equilibrium thermodynamic behavior of
+/// materials (equation of state), most recent applications have used MD
+/// to study non-equilibrium processes.
+///
+/// Wikipeda offers a basic introduction to molecular dynamics with
+/// many references:
+///
+/// http://en.wikipedia.org/wiki/Molecular_dynamics
+///
+/// For a thorough treatment of MD methods, see:
+/// - "Computer simulation of liquids" by M.P. Allen and D.J. Tildesley
+///    (Oxford, 1989)
+///    ISBN-10: 0198556454 | ISBN-13: 978-0198556459.
+///
+/// For an understanding of MD simulations and application to statistical mechanics:
+/// - "Understanding Molecular Simulation, Second Edition: From Algorithms
+///    to Applications," by D. Frenkel and B. Smit (Academic Press, 2001)
+///    ISBN-10: 0122673514 | ISBN-13: 978-0122673511
+/// - "Statistical and Thermal Physics: With Computer Applications," by
+///    H. Gould and J. Tobochnik (Princeton, 2010)
+///    ISBN-10: 0691137447 | ISBN-13: 978-0691137445
+///
+/// CoMD implements both the Lennard-Jones Potential (ljForce.c) and the
+/// Embedded Atom Method Potential (eam.c).
+///
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.reference_output
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.reference_output
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.reference_output
@@ -0,0 +1,97 @@
+: Starting Initialization
+
+
+Mini-Application Name    : CoMD-serial
+Mini-Application Version : 1.1
+Platform:
+  hostname: test-suite
+  kernel name: 'test-suite'
+  kernel release: 'test-suite'
+  processor: 'test-suite'
+Build:
+  CC: 'test-suite'
+  compiler version: 'test-suite'
+  CFLAGS: '-std=c99 -DDOUBLE'
+  LDFLAGS: '-lm'
+  using MPI: false
+  Threading: none
+  Double Precision: true
+Run Date/Time: 
+
+Command Line Parameters:
+  doeam: 1
+  potDir: .
+  potName: Cu_u6.eam
+  potType: funcfl
+  nx: 10
+  ny: 10
+  nz: 10
+  xproc: 1
+  yproc: 1
+  zproc: 1
+  Lattice constant: -1 Angstroms
+  nSteps: 100
+  printRate: 10
+  Time step: 1 fs
+  Initial Temperature: 600 K
+  Initial Delta: 0 Angstroms
+
+Simulation data: 
+  Total atoms        : 4000
+  Min global bounds  : [   0.0000000000,   0.0000000000,   0.0000000000 ]
+  Max global bounds  : [  36.1500000000,  36.1500000000,  36.1500000000 ]
+
+Decomposition data: 
+  Processors         :      1,     1,     1
+  Local boxes        :      7,     7,     7 =      343
+  Box size           : [   5.1642857143,   5.1642857143,   5.1642857143 ]
+  Box factor         : [   1.0432900433,   1.0432900433,   1.0432900433 ] 
+  Max Link Cell Occupancy: 14 of 64
+
+Potential data: 
+  Potential type  : EAM
+  Species name    : Cu
+  Atomic number   : 29
+  Mass            : 63.55 amu
+  Lattice type    : FCC
+  Lattice spacing : 3.615 Angstroms
+  Cutoff          : 4.95 Angstroms
+
+Memory data: 
+  Intrinsic atom footprint =   88 B/atom 
+  Total atom footprint     =   0.336 MB (  0.34 MB/node)
+  Link cell atom footprint =   1.842 MB/node
+  Link cell atom footprint =   3.916 MB/node (including halo cell data
+
+
+Initial energy : -3.460523233088, atom count : 4000 
+
+: Initialization Finished
+
+: Starting simulation
+
+#                                                                                         Performance
+#  Loop   Time(fs)       Total Energy   Potential Energy     Kinetic Energy  Temperature   (us/atom)     # Atoms
+      0       0.00    -3.460523233088    -3.538079224688     0.077555991600     600.0000     0.0000         4000
+     10      10.00    -3.460522591854    -3.529960109661     0.069437517806     537.1927     0.0000         4000
+     20      20.00    -3.460524196370    -3.509840489408     0.049316293038     381.5279     0.0000         4000
+     30      30.00    -3.460527804962    -3.488665662974     0.028137858012     217.6842     0.0000         4000
+     40      40.00    -3.460532043042    -3.477611644844     0.017079601802     132.1337     0.0000         4000
+     50      50.00    -3.460536332237    -3.479795957699     0.019259625462     148.9991     0.0000         4000
+     60      60.00    -3.460538324455    -3.488980357513     0.028442033059     220.0374     0.0000         4000
+     70      70.00    -3.460537014134    -3.496718290600     0.036181276467     279.9109     0.0000         4000
+     80      80.00    -3.460534287349    -3.499004390616     0.038470103268     297.6180     0.0000         4000
+     90      90.00    -3.460531717488    -3.497316540132     0.036784822643     284.5801     0.0000         4000
+    100     100.00    -3.460530357326    -3.495731031242     0.035200673916     272.3246     0.0000         4000
+: Ending simulation
+
+
+
+Simulation Validation:
+  Initial energy  : -3.460523233088
+  Final energy    : -3.460530357326
+  eFinal/eInitial : 1.000002
+  Final atom count : 4000, no atoms lost
+: CoMD Ending
+
+exit 0
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMDTypes.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMDTypes.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMDTypes.h
@@ -0,0 +1,85 @@
+/// \file
+/// CoMD data structures.
+
+#ifndef __COMDTYPES_H_
+#define __COMDTYPES_H_
+
+#include <stdio.h>
+#include "mytype.h"
+#include "haloExchange.h"
+#include "linkCells.h"
+#include "decomposition.h"
+#include "initAtoms.h"
+
+struct SimFlatSt;
+
+/// The base struct from which all potentials derive.  Think of this as an
+/// abstract base class.
+///
+/// CoMD uses the following units:
+///  - distance is in Angstroms
+///  - energy is in eV
+///  - time is in fs
+///  - force in in eV/Angstrom
+///
+///  The choice of distance, energy, and time units means that the unit
+///  of mass is eV*fs^2/Angstrom^2.  Hence, we must convert masses that
+///  are input in AMU (atomic mass units) into internal mass units.
+typedef struct BasePotentialSt 
+{
+   real_t cutoff;          //!< potential cutoff distance in Angstroms
+   real_t mass;            //!< mass of atoms in intenal units
+   real_t lat;             //!< lattice spacing (angs) of unit cell
+   char latticeType[8];    //!< lattice type, e.g. FCC, BCC, etc.
+   char  name[3];	   //!< element name
+   int	 atomicNo;	   //!< atomic number  
+   int  (*force)(struct SimFlatSt* s); //!< function pointer to force routine
+   void (*print)(FILE* file, struct BasePotentialSt* pot);
+   void (*destroy)(struct BasePotentialSt** pot); //!< destruction of the potential
+} BasePotential;
+
+
+/// species data: chosen to match the data found in the setfl/funcfl files
+typedef struct SpeciesDataSt
+{
+   char  name[3];   //!< element name
+   int	 atomicNo;  //!< atomic number  
+   real_t mass;     //!< mass in internal units
+} SpeciesData;
+
+/// Simple struct to store the initial energy and number of atoms. 
+/// Used to check energy conservation and atom conservation. 
+typedef struct ValidateSt
+{
+   double eTot0; //<! Initial total energy
+   int nAtoms0;  //<! Initial global number of atoms
+} Validate;
+
+/// 
+/// The fundamental simulation data structure with MAXATOMS in every
+/// link cell.
+/// 
+typedef struct SimFlatSt
+{
+   int nSteps;            //<! number of time steps to run
+   int printRate;         //<! number of steps between output
+   double dt;             //<! time step
+   
+   Domain* domain;        //<! domain decomposition data
+
+   LinkCell* boxes;       //<! link-cell data
+
+   Atoms* atoms;          //<! atom data (positions, momenta, ...)
+
+   SpeciesData* species;  //<! species data (per species, not per atom)
+   
+   real_t ePotential;     //!< the total potential energy of the system
+   real_t eKinetic;       //!< the total kinetic energy of the system
+
+   BasePotential *pot;	  //!< the potential
+
+   HaloExchange* atomExchange;
+   
+} SimFlat;
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD_info.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD_info.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD_info.h
@@ -0,0 +1,15 @@
+#ifndef CoMD_info_hpp
+#define CoMD_info_hpp
+
+#define CoMD_VARIANT "CoMD-serial"
+#define CoMD_HOSTNAME "test-suite"
+#define CoMD_KERNEL_NAME "'test-suite'"
+#define CoMD_KERNEL_RELEASE "'test-suite'"
+#define CoMD_PROCESSOR "'test-suite'"
+
+#define CoMD_COMPILER "'test-suite'"
+#define CoMD_COMPILER_VERSION "'test-suite'"
+#define CoMD_CFLAGS "'-std=c99 -DDOUBLE'"
+#define CoMD_LDFLAGS "'-lm'"
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/Cu_u6.eam
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/Cu_u6.eam
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/Cu_u6.eam
@@ -0,0 +1,303 @@
+Cu functions (universal 4) - JB Adams et al J. Mater. Res., 4(1), 102 (1989)
+   29     63.550         3.6150    FCC
+  500  5.0100200400801306e-04  500  1.0000000000000009e-02  4.9499999999999886e+00
+  0.                     -3.1589719908208558e-01 -5.2405175291223927e-01 -6.9885553834123115e-01 -8.5420409172727574e-01
+ -9.9627285782417374e-01 -1.1284756487892835e+00 -1.2529454148045645e+00 -1.3711252149943363e+00 -1.4840478277127076e+00
+ -1.5924840805403662e+00 -1.6954285804552853e+00 -1.7942937845174001e+00 -1.8901318213864968e+00 -1.9832501645057476e+00
+ -2.0739063371790252e+00 -2.1623187777983759e+00 -2.2486748239473968e+00 -2.3331367007241965e+00 -2.4158460932269890e+00
+ -2.4969276936450484e+00 -2.5764919917168783e+00 -2.6546374977553171e+00 -2.7314525337618534e+00 -2.8070166913917660e+00
+ -2.8814020298474361e+00 -2.9546740684873072e+00 -3.0268926157701515e+00 -3.0981124665488835e+00 -3.1683839923973949e+00
+ -3.2377536446699082e+00 -3.3062643854300688e+00 -3.3739560586949437e+00 -3.4408657118035535e+00 -3.5070278749074930e+00
+ -3.5724748051301987e+00 -3.6372367006631805e+00 -3.7013418893374563e+00 -3.7648169952241943e+00 -3.8276870863304424e+00
+ -3.8899758061050136e+00 -3.9517054906525857e+00 -4.0128972737054482e+00 -4.0735711808432313e+00 -4.1324020819066334e+00
+ -4.1903921077370399e+00 -4.2479051536742531e+00 -4.3049572584920952e+00 -4.3615637243846948e+00 -4.4177391662932166e+00
+ -4.4734975570518145e+00 -4.5288522686579995e+00 -4.5838161101054880e+00 -4.6384013621277234e+00 -4.6926198091528875e+00
+ -4.7464827687459206e+00 -4.8000011187718314e+00 -4.8531853225280486e+00 -4.9060454519053280e+00 -4.9585912090057604e+00
+ -5.0108319460781274e+00 -5.0627766840981678e+00 -5.1144341300432075e+00 -5.1658126929883963e+00 -5.2169204991179470e+00
+ -5.2677654057399081e+00 -5.3183550143895957e+00 -5.3686966830900360e+00 -5.4187975378393958e+00 -5.4686644833797118e+00
+ -5.5183042133078573e+00 -5.5677232195759814e+00 -5.6169278014231168e+00 -5.6659240737851633e+00 -5.7147179752168427e+00
+ -5.7633152753617765e+00 -5.8117215820040258e+00 -5.8599423477251662e+00 -5.9079828761981332e+00 -5.9558483281427925e+00
+ -6.0035437269616239e+00 -6.0510739641062798e+00 -6.0984438040355258e+00 -6.1456578891786364e+00 -6.1927207443901580e+00
+ -6.2396367812953599e+00 -6.2864103024253382e+00 -6.3330455051271599e+00 -6.3795464852936732e+00 -6.4259172409138614e+00
+ -6.4721616754587501e+00 -6.5182836011053098e+00 -6.5642867418169999e+00 -6.6101747362826870e+00 -6.6559511407215268e+00
+ -6.7016194315664848e+00 -6.7471830080289408e+00 -6.7926321787266488e+00 -6.8376565597087335e+00 -6.8825828851093718e+00
+ -6.9274142262937062e+00 -6.9721535889762833e+00 -7.0168039151812138e+00 -7.0613680851242009e+00 -7.1058489190182854e+00
+ -7.1502491788078260e+00 -7.1945715698338404e+00 -7.2388187424386103e+00 -7.2829932935017325e+00 -7.3270977679243288e+00
+ -7.3711346600521210e+00 -7.4151064150498769e+00 -7.4590154302223652e+00 -7.5028640562861142e+00 -7.5466545985988489e+00
+ -7.5903893183397599e+00 -7.6340704336532212e+00 -7.6777001207475166e+00 -7.7212805149568453e+00 -7.7648137117691078e+00
+ -7.8083017678126225e+00 -7.8517467018166371e+00 -7.8951504955327323e+00 -7.9385150946301337e+00 -7.9818424095582259e+00
+ -8.0251343163853335e+00 -8.0683926576019758e+00 -8.1116192429086027e+00 -8.1548158499697934e+00 -8.1979842251487867e+00
+ -8.2411260842174556e+00 -8.2842431130446244e+00 -8.3273369682627845e+00 -8.3704092779143480e+00 -8.4134616420805628e+00
+ -8.4564956334858721e+00 -8.4995127980902225e+00 -8.5425146556591471e+00 -8.5855027003218538e+00 -8.6284784011075430e+00
+ -8.6714432024708685e+00 -8.7143985247980709e+00 -8.7573457649043576e+00 -8.8002862965079203e+00 -8.8432214707042931e+00
+ -8.8861526164113798e+00 -8.9290810408153902e+00 -8.9720080297988716e+00 -9.0149348483554377e+00 -9.0578627409975070e+00
+ -9.1007929321486927e+00 -9.1437266265307358e+00 -9.1866650095359432e+00 -9.2296092475897353e+00 -9.2725604885091570e+00
+ -9.3155198618426311e+00 -9.3584884792123262e+00 -9.4014674346366292e+00 -9.4444578048540961e+00 -9.4874606496305205e+00
+ -9.5304770120671378e+00 -9.5735079188909253e+00 -9.6165543807549625e+00 -9.6596173924971254e+00 -9.7026979334374914e+00
+ -9.7457969676344760e+00 -9.7889154441094774e+00 -9.8320542972711564e+00 -9.8749024758928954e+00 -9.9176676449848173e+00
+ -9.9604518241702067e+00 -1.0003255855809869e+01 -1.0046080568707225e+01 -1.0088926778514065e+01 -1.0131795287896693e+01
+ -1.0174686886752681e+01 -1.0217602352416293e+01 -1.0260542449857894e+01 -1.0303507931886486e+01 -1.0346499539340471e+01
+ -1.0389518001277565e+01 -1.0432564035159942e+01 -1.0475638347037830e+01 -1.0518741631724708e+01 -1.0561874572971817e+01
+ -1.0605037843637035e+01 -1.0648232105854731e+01 -1.0691458011195323e+01 -1.0734716200826654e+01 -1.0778007305671338e+01
+ -1.0821331946557279e+01 -1.0864690734371720e+01 -1.0908084270204256e+01 -1.0951513145493493e+01 -1.0994977942169044e+01
+ -1.1038479232788461e+01 -1.1082017580672300e+01 -1.1125593540039802e+01 -1.1169207656138724e+01 -1.1212860465371193e+01
+ -1.1256552495422056e+01 -1.1300284265381379e+01 -1.1344056285864497e+01 -1.1387869059131503e+01 -1.1431723079201220e+01
+ -1.1475618831971758e+01 -1.1519556795323240e+01 -1.1563537439236882e+01 -1.1607561225897086e+01 -1.1651628609799957e+01
+ -1.1695740037856638e+01 -1.1739895949496542e+01 -1.1784096776765693e+01 -1.1828342944444898e+01 -1.1872634870038326e+01
+ -1.1916972964140655e+01 -1.1961357630185660e+01 -1.2005789264757027e+01 -1.2050268257623998e+01 -1.2094794991829531e+01
+ -1.2139369843773920e+01 -1.2183993183309894e+01 -1.2228665373815033e+01 -1.2273386772283743e+01 -1.2318157729378811e+01
+ -1.2362978589648264e+01 -1.2407849691329261e+01 -1.2452771366701370e+01 -1.2497743942026659e+01 -1.2542767737650308e+01
+ -1.2587843068072686e+01 -1.2632970242029501e+01 -1.2678149562554552e+01 -1.2723381327055449e+01 -1.2768665827383359e+01
+ -1.2814003349896723e+01 -1.2859394175532202e+01 -1.2904838579871580e+01 -1.2950336833201561e+01 -1.2995889200586475e+01
+ -1.3041495941923358e+01 -1.3087157312007946e+01 -1.3132873560597147e+01 -1.3178644932465090e+01 -1.3224471667467185e+01
+ -1.3270354000597138e+01 -1.3316292162042373e+01 -1.3362286377246335e+01 -1.3408336866957768e+01 -1.3454443847292225e+01
+ -1.3500607529781746e+01 -1.3546828121432384e+01 -1.3593105824775080e+01 -1.3639440837916936e+01 -1.3685833354596070e+01
+ -1.3732283564231011e+01 -1.3778791651965662e+01 -1.3825357798727850e+01 -1.3871982181271051e+01 -1.3918664972226225e+01
+ -1.3965406340150423e+01 -1.4012206449566122e+01 -1.4059065461010562e+01 -1.4105983531084178e+01 -1.4152960812497383e+01
+ -1.4199997454109450e+01 -1.4247093600900882e+01 -1.4294249394311976e+01 -1.4341464971842868e+01 -1.4388740467389482e+01
+ -1.4436076011212833e+01 -1.4483471729977452e+01 -1.4530927746798284e+01 -1.4578444181270356e+01 -1.4626021149517328e+01
+ -1.4673658764226502e+01 -1.4721357134688901e+01 -1.4769116366835874e+01 -1.4816936563275590e+01 -1.4864817823335784e+01
+ -1.4912760243093658e+01 -1.4960763915414873e+01 -1.5008828929991182e+01 -1.5056955373373171e+01 -1.5105143329006637e+01
+ -1.5153392877265333e+01 -1.5201704095489163e+01 -1.5250077058012721e+01 -1.5298511836202465e+01 -1.5347008498487980e+01
+ -1.5395567110395177e+01 -1.5444187734579373e+01 -1.5492870430854509e+01 -1.5541615256228738e+01 -1.5590422264928975e+01
+ -1.5639291508440465e+01 -1.5688223035530768e+01 -1.5737216892280117e+01 -1.5786273122116540e+01 -1.5835391765839859e+01
+ -1.5884572861650554e+01 -1.5933816445184789e+01 -1.5983122549535665e+01 -1.6032491205288238e+01 -1.6081922440538847e+01
+ -1.6131416280932740e+01 -1.6180972749683292e+01 -1.6230591867602584e+01 -1.6280273653129598e+01 -1.6330018122352271e+01
+ -1.6379825289037512e+01 -1.6429695164657005e+01 -1.6479627758410516e+01 -1.6529623077253063e+01 -1.6579681125920047e+01
+ -1.6629801906948160e+01 -1.6679985420709272e+01 -1.6730231665424185e+01 -1.6780540637196850e+01 -1.6830912330026536e+01
+ -1.6881346735842499e+01 -1.6931843844523655e+01 -1.6982403643917451e+01 -1.7033026119869078e+01 -1.7083711256241486e+01
+ -1.7134459034938232e+01 -1.7185269435924170e+01 -1.7236142437252170e+01 -1.7287078015076759e+01 -1.7338076143685498e+01
+ -1.7389136795514560e+01 -1.7440259941169757e+01 -1.7491445549452351e+01 -1.7542693587371559e+01 -1.7594004020176158e+01
+ -1.7645376811364258e+01 -1.7696811922712527e+01 -1.7748309314288690e+01 -1.7799868944476657e+01 -1.7851490769996190e+01
+ -1.7903174745917113e+01 -1.7954920825684781e+01 -1.8006728961136105e+01 -1.8058599102520361e+01 -1.8110531198513968e+01
+ -1.8162525196246406e+01 -1.8214581041310112e+01 -1.8266698677789350e+01 -1.8318878048276588e+01 -1.8371119093861239e+01
+ -1.8423421754189917e+01 -1.8475785967356501e+01 -1.8528211670354381e+01 -1.8580698798442995e+01 -1.8633247285599964e+01
+ -1.8685857064432412e+01 -1.8738528066186518e+01 -1.8791260220768891e+01 -1.8844053456762026e+01 -1.8896907701446821e+01
+ -1.8949822880805868e+01 -1.9002798919552447e+01 -1.9055835741138708e+01 -1.9108933267775342e+01 -1.9162091420446473e+01
+ -1.9215310118921138e+01 -1.9268589281777849e+01 -1.9321928826411295e+01 -1.9375328669053715e+01 -1.9428788724780020e+01
+ -1.9482308907539277e+01 -1.9535889130155283e+01 -1.9589529304345319e+01 -1.9643229340734365e+01 -1.9696989148876355e+01
+ -1.9750808637254977e+01 -1.9804687713312433e+01 -1.9858626283450008e+01 -1.9912624253055810e+01 -1.9966681526506250e+01
+ -2.0020798007185476e+01 -2.0074973597498911e+01 -2.0129208198888136e+01 -2.0183501711838630e+01 -2.0237854035899204e+01
+ -2.0292265069692348e+01 -2.0346734710925716e+01 -2.0401262856409858e+01 -2.0455849402064473e+01 -2.0510494242935920e+01
+ -2.0565197273209719e+01 -2.0619958386219423e+01 -2.0674777474463212e+01 -2.0729654429611969e+01 -2.0784589142526556e+01
+ -2.0839581503263844e+01 -2.0894631401093307e+01 -2.0949738724508393e+01 -2.1004903361235051e+01 -2.1060125198247988e+01
+ -2.1115404121775100e+01 -2.1170740017318053e+01 -2.1226132769657397e+01 -2.1281582262894972e+01 -2.1337088380382852e+01
+ -2.1392651004661843e+01 -2.1448270018015251e+01 -2.1503945301662043e+01 -2.1559676736299366e+01 -2.1615464201984196e+01
+ -2.1671307578140954e+01 -2.1727206743568217e+01 -2.1783161576455427e+01 -2.1839171954393919e+01 -2.1895237754379082e+01
+ -2.1951358852829799e+01 -2.2007535125591289e+01 -2.2063766447950343e+01 -2.2120052694642595e+01 -2.2176393739860259e+01
+ -2.2232789457272474e+01 -2.2289239719961643e+01 -2.2345744400729018e+01 -2.2402303371517178e+01 -2.2458916504038370e+01
+ -2.2515583669430725e+01 -2.2572304738325670e+01 -2.2629079581086330e+01 -2.2685908067306855e+01 -2.2742790066346515e+01
+ -2.2799725447076526e+01 -2.2856714077931429e+01 -2.2913755826927172e+01 -2.2970850561669295e+01 -2.3027998149359064e+01
+ -2.3085198456800299e+01 -2.3142451350411534e+01 -2.3199756696229770e+01 -2.3257114359926845e+01 -2.3314524206806482e+01
+ -2.3371986101824632e+01 -2.3429499909581864e+01 -2.3487065494349963e+01 -2.3544682720213359e+01 -2.3602351450489550e+01
+ -2.3660071548650194e+01 -2.3717842877714475e+01 -2.3775665300345281e+01 -2.3833538678952209e+01 -2.3891462875653588e+01
+ -2.3949437752298763e+01 -2.4007463170460369e+01 -2.4065538991453877e+01 -2.4123665076338057e+01 -2.4181841285923610e+01
+ -2.4240067480782272e+01 -2.4298343521253173e+01 -2.4356669267446705e+01 -2.4415044579252253e+01 -2.4473469316351384e+01
+ -2.4531943338216706e+01 -2.4590466504118467e+01 -2.4649038673143195e+01 -2.4707659704179378e+01 -2.4766329455946106e+01
+ -2.4825047786983760e+01 -2.4883814555664912e+01 -2.4942629620207981e+01 -2.5001492838670174e+01 -2.5060404068966136e+01
+ -2.5119363168864538e+01 -2.5178369996001948e+01 -2.5237424407882145e+01 -2.5296526261886129e+01 -2.5355675415276437e+01
+ -2.5414871725205558e+01 -2.5474115048716612e+01 -2.5533405242759045e+01 -2.5592742164177480e+01 -2.5652125669732186e+01
+ -2.5711555616102487e+01 -2.5771031859886762e+01 -2.5830554257610174e+01 -2.5890122665731269e+01 -2.5949736940650155e+01
+ -2.6009396938703958e+01 -2.6069102516181829e+01 -2.6128853529326307e+01 -2.6188649834339685e+01 -2.6248491287389243e+01
+ -2.6308377744605878e+01 -2.6368309062100934e+01 -2.6428285095962565e+01 -2.6488305702088610e+01 -2.6548370736885317e+01
+ -2.6608480056235521e+01 -2.6668633516188947e+01 -2.6728830972768947e+01 -2.6789072282060033e+01 -2.6849357300140582e+01
+  1.0000000000000000e+01  1.0801250630455797e+01  1.0617301586939504e+01  1.0436833263885262e+01  1.0259774441010791e+01
+  1.0086055417347552e+01  9.9156079783837754e+00  9.7483653639170598e+00  9.5842622365983630e+00  9.4232346511569745e+00
+  9.2652200242883396e+00  9.1101571051919450e+00  8.9579859467472716e+00  8.8086478773091699e+00  8.6620854731154395e+00
+  8.5182425312888768e+00  8.3770640434238430e+00  8.2384961697429731e+00  8.1024862138128810e+00  7.9689825978078090e+00
+  7.8379348383064951e+00  7.7092935226140753e+00  7.5830102855955488e+00  7.4590377870116811e+00  7.3373296893445286e+00
+  7.2178406361032330e+00  7.1005262306008490e+00  6.9853430151894997e+00  6.8722484509459889e+00  6.7612008977976643e+00
+  6.6521595950793255e+00  6.5450846425111706e+00  6.4399369815891703e+00  6.3366783773799114e+00  6.2352714007088537e+00
+  6.1356794107365431e+00  6.0378665379119241e+00  5.9417976672963562e+00  5.8474384222482740e+00  5.7547551484639143e+00
+  5.6637148983634233e+00  5.5742854158160355e+00  5.4864351211977294e+00  5.4001330967728620e+00  5.3153490723937580e+00
+  5.2320534115112594e+00  5.1502170974889339e+00  5.0698117202151138e+00  4.9908094630057178e+00  4.9131830897922271e+00
+  4.8369059325884791e+00  4.7619518792290876e+00  4.6882953613761629e+00  4.6159113427851821e+00  4.5447753078280471e+00
+  4.4748632502644341e+00  4.4061516622586225e+00  4.3386175236344116e+00  4.2722382913651700e+00  4.2069918892916007e+00
+  4.1428566980642358e+00  4.0798115453044659e+00  4.0178356959802954e+00  3.9569088429914245e+00  3.8970110979596200e+00
+  3.8381229822198435e+00  3.7802254180071628e+00  3.7232997198366320e+00  3.6673275860703995e+00  3.6122910906684780e+00
+  3.5581726751199767e+00  3.5049551405497539e+00  3.4526216399971048e+00  3.4011556708634600e+00  3.3505410675235936e+00
+  3.3007619940993322e+00  3.2518029373895416e+00  3.2036486999552665e+00  3.1562843933552358e+00  3.1096954315288059e+00
+  3.0638675243237259e+00  3.0187866711643210e+00  2.9744391548584872e+00  2.9308115355394193e+00  2.8878906447394712e+00
+  2.8456635795935767e+00  2.8041176971686212e+00  2.7632406089169876e+00  2.7230201752508236e+00  2.6834445002347138e+00
+  2.6445019263941703e+00  2.6061810296373835e+00  2.5684706142875200e+00  2.5313597082236612e+00  2.4948375581276281e+00
+  2.4588936248343458e+00  2.4235175787838017e+00  2.3886992955719677e+00  2.3544288515990814e+00  2.3206965198123726e+00
+  2.2874927655419270e+00  2.2548082424273730e+00  2.2226337884330718e+00  2.1909604219504502e+00  2.1597793379851566e+00
+  2.1290819044273803e+00  2.0988596584032564e+00  2.0691043027062364e+00  2.0398077023055379e+00  2.0109618809312195e+00
+  1.9825590177335428e+00  1.9545914440145751e+00  1.9270516400317490e+00  1.8999322318704799e+00  1.8732259883849522e+00
+  1.8469258182056052e+00  1.8210247668116040e+00  1.7955160136671395e+00  1.7703928694199718e+00  1.7456487731607453e+00
+  1.7212772897420763e+00  1.6972721071559391e+00  1.6736270339679251e+00  1.6503359968072218e+00  1.6273930379113750e+00
+  1.6047923127241077e+00  1.5825280875452847e+00  1.5605947372321296e+00  1.5389867429502644e+00  1.5176986899731588e+00
+  1.4967252655299603e+00  1.4760612566992890e+00  1.4557015483494027e+00  1.4356411211222593e+00  1.4158750494618957e+00
+  1.3963984996850130e+00  1.3772067280935900e+00  1.3582950791282968e+00  1.3396589835618542e+00  1.3212939567312461e+00
+  1.3031955968085143e+00  1.2853595831085869e+00  1.2677816744339125e+00  1.2504577074545153e+00  1.2333835951233212e+00
+  1.2165553251254906e+00  1.1999689583611897e+00  1.1836206274610817e+00  1.1675065353339207e+00  1.1516229537450258e+00
+  1.1359662219258198e+00  1.1205327452130547e+00  1.1053189937170771e+00  1.0903215010191687e+00  1.0755368628964561e+00
+  1.0609617360743258e+00  1.0465928370056616e+00  1.0324269406761744e+00  1.0184608794353309e+00  1.0046915418523596e+00
+  9.9111587159659820e-01  9.7773086634202144e-01  9.6453357669496853e-01  9.5152110514480626e-01  9.3869060503712376e-01
+  9.2603927956864140e-01  9.1356438080370239e-01  9.0126320871155485e-01  8.8913311022427521e-01  8.7717147831462938e-01
+  8.6537575109353426e-01  8.5374341092675010e-01  8.4227198357020328e-01  8.3095903732382581e-01  8.1980218220310874e-01
+  8.0879906912829824e-01  7.9794738913080465e-01  7.8724487257618136e-01  7.7668928840378371e-01  7.6627844338220541e-01
+  7.5601018138057441e-01  7.4588238265517859e-01  7.3589296315095609e-01  7.2603987381787150e-01  7.1632109994155613e-01
+  7.0673466048788924e-01  6.9727860746149162e-01  6.8795102527759866e-01  6.7875003014695068e-01  6.6967376947370028e-01
+  6.6072042126578623e-01  6.5188819355765304e-01  6.4317532384495735e-01  6.3458007853101606e-01  6.2610075238490026e-01
+  6.1773566801053903e-01  6.0948317532703555e-01  6.0134165105970538e-01  5.9330949824153834e-01  5.8538514572508404e-01
+  5.7756704770434197e-01  5.6985368324655994e-01  5.6224355583360364e-01  5.5473519291279416e-01  5.4732714545698968e-01
+  5.4001798753364838e-01  5.3280631588273764e-01  5.2569074950327632e-01  5.1866992924830413e-01  5.1174251742814292e-01
+  5.0490719742172274e-01  4.9816267329578068e-01  4.9150766943189694e-01  4.8494093016099704e-01  4.7846121940521869e-01
+  4.7206732032718257e-01  4.6575803498626733e-01  4.5953218400168616e-01  4.5338860622255339e-01  4.4732615840449164e-01
+  4.4134371489266400e-01  4.3544016731128998e-01  4.2961442425926322e-01  4.2386541101190822e-01  4.1819206922867025e-01
+  4.1259335666658892e-01  4.0706824689955035e-01  4.0161572904300691e-01  3.9623480748423034e-01  3.9092450161784775e-01
+  3.8568384558664448e-01  3.8051188802739233e-01  3.7540769182181144e-01  3.7037033385227858e-01  3.6539890476236891e-01
+  3.6049250872219041e-01  3.5565026319808801e-01  3.5087129872705347e-01  3.4615475869537526e-01  3.4149979912160333e-01
+  3.3690558844382679e-01  3.3237130731094133e-01  3.2789614837797743e-01  3.2347931610542879e-01  3.1912002656234151e-01
+  3.1481750723327195e-01  3.1057099682888989e-01  3.0637974510021770e-01  3.0224301265637266e-01  2.9816007078585649e-01
+  2.9413020128113843e-01  2.9015269626666473e-01  2.8622685803004089e-01  2.8235199885637741e-01  2.7852744086590420e-01
+  2.7475251585446614e-01  2.7102656513704559e-01  2.6734893939429405e-01  2.6371899852185088e-01  2.6013611148246873e-01
+  2.5659965616091007e-01  2.5310901922152595e-01  2.4966359596849230e-01  2.4626279020852770e-01  2.4290601411627044e-01
+  2.3959268810202516e-01  2.3632224068197960e-01  2.3309410835077227e-01  2.2990773545640408e-01  2.2676257407740064e-01
+  2.2365808390219311e-01  2.2059373211072231e-01  2.1756899325815038e-01  2.1458334916067301e-01  2.1163628878335583e-01
+  2.0872730813005891e-01  2.0585591013519533e-01  2.0302160455757168e-01  2.0022390787598532e-01  1.9746234318676770e-01
+  1.9473644010305513e-01  1.9204573465593366e-01  1.8938976919722261e-01  1.8676809230404867e-01  1.8418025868501697e-01
+  1.8162582908807767e-01  1.7910437020998504e-01  1.7661545460728956e-01  1.7415866060892160e-01  1.7173357223026997e-01
+  1.6933977908869480e-01  1.6697687632057967e-01  1.6464446449973735e-01  1.6234214955720816e-01  1.6006954270248031e-01
+  1.5782626034600167e-01  1.5561192402300605e-01  1.5342616031865575e-01  1.5126860079441595e-01  1.4913888191569047e-01
+  1.4703664498065105e-01  1.4496153605028095e-01  1.4291320587954814e-01  1.4089130984976528e-01  1.3889550790202332e-01
+  1.3692546447178433e-01  1.3498084842450275e-01  1.3306133299233025e-01  1.3116659571184286e-01  1.2929631836281086e-01
+  1.2745018690792786e-01  1.2562789143357200e-01  1.2382912609148811e-01  1.2205358904140517e-01  1.2030098239464682e-01
+  1.1857101215854371e-01  1.1686338818183373e-01  1.1517782410088362e-01  1.1351403728677267e-01  1.1187174879324324e-01
+  1.1025068330544618e-01  1.0865056908951143e-01  1.0707113794291789e-01  1.0551212514563968e-01  1.0397326941204543e-01
+  1.0245431284357487e-01  1.0095500088214582e-01  9.9475082264262937e-02  9.8014308975870268e-02  9.6572436207889467e-02
+  9.5149222312433945e-02  9.3744428759716225e-02  9.2357820095600562e-02  9.0989163899807046e-02  8.9638230744780500e-02
+  8.8304794155128263e-02  8.6988630567732095e-02  8.5689519292436067e-02  8.4407242473327759e-02  8.3141585050604316e-02
+  8.1892334723027815e-02  8.0659281910912206e-02  7.9442219719687568e-02  7.8240943904000826e-02  7.7055252832346710e-02
+  7.5884947452225848e-02  7.4729831255814450e-02  7.3589710246154016e-02  7.2464392903814900e-02  7.1353690154065674e-02
+  7.0257415334517681e-02  6.9175384163253639e-02  6.8107414707390124e-02  6.7053327352138758e-02  6.6012944770277748e-02
+  6.4986091892085707e-02  6.3972595875702698e-02  6.2972286077917161e-02  6.1984994025379159e-02  6.1010553386208866e-02
+  6.0048799942037157e-02  5.9099571560412123e-02  5.8162708167624810e-02  5.7238051721903105e-02  5.6325446186999528e-02
+  5.5424737506136967e-02  5.4535773576326108e-02  5.3658404223060785e-02  5.2792481175329975e-02  5.1937858041017471e-02
+  5.1094390282629742e-02  5.0261935193351093e-02  4.9440351873451860e-02  4.8629501207008818e-02  4.7829245838954204e-02
+  4.7039450152438045e-02  4.6259980246510013e-02  4.5490703914087494e-02  4.4731490620259606e-02  4.3982211480854794e-02
+  4.3242739241325268e-02  4.2512948255901239e-02  4.1792714467042247e-02  4.1081915385161816e-02  4.0380430068628126e-02
+  3.9688139104032016e-02  3.9004924586723888e-02  3.8330670101623943e-02  3.7665260704261794e-02  3.7008582902100740e-02
+  3.6360524636098734e-02  3.5720975262514720e-02  3.5089825534961649e-02  3.4466967586696873e-02  3.3852294913154113e-02
+  3.3245702354694373e-02  3.2647086079598653e-02  3.2056343567280043e-02  3.1473373591718756e-02  3.0898076205114089e-02
+  3.0330352721755771e-02  2.9770105702100036e-02  2.9217238937061962e-02  2.8671657432515429e-02  2.8133267393987360e-02
+  2.7601976211552692e-02  2.7077692444942292e-02  2.6560325808820506e-02  2.6049787158272331e-02  2.5545988474477754e-02
+  2.5048842850559749e-02  2.4558264477628988e-02  2.4074168631003978e-02  2.3596471656606832e-02  2.3125090957536454e-02
+  2.2659944980823798e-02  2.2200953204335683e-02  2.1748036123873327e-02  2.1301115240412782e-02  2.0860113047532769e-02
+  2.0424953018977066e-02  1.9995559596398205e-02  1.9571858177249157e-02  1.9153775102828896e-02  1.8741237646474174e-02
+  1.8334174001923720e-02  1.7932513271801676e-02  1.7536185456265119e-02  1.7145121441799471e-02  1.6759252990136364e-02
+  1.6378512727332817e-02  1.6002834132978205e-02  1.5632151529535232e-02  1.5266400071822006e-02  1.4905515736628239e-02
+  1.4549435312452008e-02  1.4198096389378967e-02  1.3851437349077567e-02  1.3509397354926733e-02  1.3171916342264223e-02
+  1.2838935008766206e-02  1.2510394804928215e-02  1.2186237924689647e-02  1.1866407296154180e-02  1.1550846572444651e-02
+  1.1239500122655843e-02  1.0932313022929629e-02  1.0629231047647014e-02  1.0330200660712663e-02  1.0035169006965661e-02
+  9.7440839036889715e-03  9.4568938322237006e-03  9.1735479296908284e-03  8.8939959808188584e-03  8.6181884098633921e-03
+  8.3460762726380588e-03  8.0776112486364848e-03  7.8127456332576228e-03  7.5514323301215658e-03  7.2936248434884998e-03
+  7.0392772707632556e-03  6.7883442950981143e-03  6.5407811780883174e-03  6.2965437525517309e-03  6.0555884154033235e-03
+  5.8178721206175732e-03  5.5833523722726985e-03  5.3519872176873706e-03  5.1237352406410253e-03  4.8985555546731119e-03
+  4.6764077964680517e-03  4.4572521193242398e-03  4.2410491867018174e-03  4.0277601658472717e-03  3.8173467214993595e-03
+  3.6097710096757996e-03  3.4049956715283547e-03  3.2029838272795708e-03  3.0036990702375643e-03  2.8071054608720947e-03
+  2.6131675209760674e-03  2.4218502278956500e-03  2.2331190088270558e-03  2.0469397351849938e-03  1.8632787170468346e-03
+  1.6821026976564513e-03  1.5033788479984489e-03  1.3270747614446132e-03  1.1531584484569257e-03  9.8159833136179930e-04
+  8.1236323918953968e-04  6.4542240257081662e-04  4.8074544870146951e-04  3.1830239637042901e-04  1.5806365104042985e-04
+  0.                      0.                      0.                      0.                      0.
+  0.                      5.4383329664155645e-05  9.3944898415945083e-04  4.3251847212615047e-03  1.2334244035325348e-02
+  2.7137722173468548e-02  5.0697119791449641e-02  8.4607638668976470e-02  1.3001641279549414e-01  1.8759487452762702e-01
+  2.5754900895683441e-01  3.3965493779430744e-01  4.3331024634064264e-01  5.3759384878832961e-01  6.5132908316254046e-01
+  7.7314622535699939e-01  9.0154178511424377e-01  1.0349328562818201e+00  1.1717054897399350e+00  1.3102565818166738e+00
+  1.4490291582473986e+00  1.5865412121263560e+00  1.7214084470448441e+00  1.8523614026473965e+00  1.9782575145276269e+00
+  2.0980886961566938e+00  2.2109850373516764e+00  2.3162151996095730e+00  2.4131840597491703e+00  2.5014281146549706e+00
+  2.5806091153285706e+00  2.6505063508648590e+00  2.7110079545661563e+00  2.7621015568249447e+00  2.8038645637913220e+00
+  2.8364542979766156e+00  2.8600981973448825e+00  2.8750842333755031e+00  2.8817516761559574e+00  2.8804823057701157e+00
+  2.8716921439699092e+00  2.8558237581894161e+00  2.8333391711552594e+00  2.8047133934346959e+00  2.7704285829676252e+00
+  2.7309688247181469e+00  2.6868155147671331e+00  2.6384433262347358e+00  2.5863167291097398e+00  2.5308870321738226e+00
+  2.4725899125317596e+00  2.4118433966060167e+00  2.3490462556752334e+00  2.2845767789603002e+00  2.2187918877813502e+00
+  2.1520265552815943e+00  2.0845934975626363e+00  2.0167831036919637e+00  1.9488635738636404e+00  1.8810812369508270e+00
+  1.8136610207193371e+00  1.7468070500507196e+00  1.6807033505858371e+00  1.6155146372447149e+00  1.5513871690559142e+00
+  1.4884496536383409e+00  1.4268141864958608e+00  1.3665772120042590e+00  1.3078204945836447e+00  1.2506120900523854e+00
+  1.1950073085502879e+00  1.1410496616995687e+00  1.0887717878420631e+00  1.0381963502565981e+00  9.8933690422003551e-01
+  9.4219872964247031e-01  8.9677962677415124e-01  8.5307067316958651e-01  8.1105694069385592e-01  7.7071817188505065e-01
+  7.3202941544290212e-01  6.9496162100761794e-01  6.5948219372701189e-01  6.2555550939233484e-01  5.9314339115629977e-01
+  5.6220554903693554e-01  5.3269998356387660e-01  5.0458335504023211e-01  4.7781131998032222e-01  4.5233883634534777e-01
+  4.2812043923464138e-01  4.0511048870905242e-01  3.8326339142174781e-01  3.6253379771729577e-01  3.4287677583286325e-01
+  3.2424796479760154e-01  3.0660370758054967e-01  2.8990116598452254e-01  2.7409841872609064e-01  2.5915454407883409e-01
+  2.4502968839369110e-01  2.3168512174254197e-01  2.1908328186436687e-01  2.0718780752542632e-01  1.9596356233750800e-01
+  1.8537665001230508e-01  1.7539442196444632e-01  1.6598547811304609e-01  1.5711966166996927e-01  1.4876804864444715e-01
+  1.4090293273673637e-01  1.3349780623990259e-01  1.2652733751724909e-01  1.1996734557434463e-01  1.1379477219856060e-01
+  1.0798765209582406e-01  1.0252508141368288e-01  9.7387185001678311e-02  9.2555082724584015e-02  8.8010855111109620e-02
+  8.3737508589961873e-02  7.9718940536826377e-02  7.5939904329596963e-02  7.2385974585237101e-02  6.9043512729294765e-02
+  6.5899633029043336e-02  6.2942169202580001e-02  6.0159641699440547e-02  5.7541225732930634e-02  5.5076720130546430e-02
+  5.2756517056398833e-02  5.0571572648238083e-02  4.8513378601664936e-02  4.6573934725081756e-02  4.4745722480991068e-02
+  4.3021679522073253e-02  4.1395175224364866e-02  3.9859987214311721e-02  3.8410278881708670e-02  3.7040577866510604e-02
+  3.5745755503880039e-02  3.4521007208912380e-02  3.3361833779917971e-02  3.2264023597108116e-02  3.1223635691821294e-02
+  3.0236983660070216e-02  2.9300620393215571e-02  2.8411323597772320e-02  2.7566082075896281e-02  2.6762082737777249e-02
+  2.5996698317105604e-02  2.5267475760840985e-02  2.4572125264713973e-02  2.3908509926274246e-02  2.3274635987705516e-02
+  2.2668643641204911e-02  2.2088798370316409e-02  2.1533482801290083e-02  2.1001189039288493e-02  2.0490511464994254e-02
+  2.0000139967999431e-02  1.9528853594166895e-02  1.9075514584991349e-02  1.8639062787818239e-02  1.8218510416650235e-02
+  1.7812937144080498e-02  1.7421485505751177e-02  1.7043356599549031e-02  1.6677806062561751e-02  1.6324140309613155e-02
+  1.5981713017976018e-02  1.5649921843605585e-02  1.5328205354974755e-02  1.5016040171312250e-02  1.4712938292708366e-02
+  1.4418444610242331e-02  1.4132134584901757e-02  1.3853612084676337e-02  1.3582507369821917e-02  1.3318475216818060e-02
+  1.3061193172097418e-02  1.2810359927147186e-02  1.2565693807050415e-02  1.2326931365025051e-02  1.2093826075940506e-02
+  1.1866147122233661e-02  1.1643678266026136e-02  1.1426216801644407e-02  1.1213572583084475e-02  1.1005567121320226e-02
+  1.0802032746662471e-02  1.0602811831688208e-02  1.0407756070544782e-02  1.0216725810699157e-02  1.0029589433467268e-02
+  9.8462227798860602e-03  9.6665086187306404e-03  9.4903361536790021e-03  9.3176005668363371e-03  9.1482025960089031e-03
+  8.9820481433065535e-03  8.8190479128032462e-03  8.6591170751522117e-03  8.5021749571883021e-03  8.3481447546937537e-03
+  8.1969532666261724e-03  8.0485306492223962e-03  7.9028101885199598e-03  7.7597280899136256e-03  7.6192232834934315e-03
+  7.4812372439735375e-03  7.3457138241272979e-03  7.2125991007052359e-03  7.0818412319012813e-03  6.9533903254870300e-03
+  6.8271983168139705e-03  6.7032188559211503e-03  6.5814072030662141e-03  6.4617201320263939e-03  6.3441158405819764e-03
+  6.2285538676237207e-03  6.1149950163802147e-03  6.0034012832899109e-03  5.8937357920846312e-03  5.7859627326801166e-03
+  5.6800473044990030e-03  5.5759556638887986e-03  5.4736548753111791e-03  5.3731128660109428e-03  5.2742983838981461e-03
+  5.1771809583849582e-03  5.0817308639591330e-03  4.9879190862693046e-03  4.8957172905357560e-03  4.8050977921015592e-03
+  4.7160335289582467e-03  4.6284980360953021e-03  4.5424654215287241e-03  4.4579103438822931e-03  4.3748079913988880e-03
+  4.2931340622749670e-03  4.2128647462132407e-03  4.1339767071033873e-03  4.0564470667446839e-03  3.9802533895282599e-03
+  3.9053736680121076e-03  3.8317863093158128e-03  3.7594701222811860e-03  3.6884043053326127e-03  3.6185684349951674e-03
+  3.5499424550168301e-03  3.4825066660512660e-03  3.4162417158645347e-03  3.3511285900229004e-03  3.2871486030347646e-03
+  3.2242833899080170e-03  3.1625148980992668e-03  3.1018253798278661e-03  3.0421973847258310e-03  2.9836137528083811e-03
+  2.9260576077371064e-03  2.8695123503632708e-03  2.8139616525287708e-03  2.7593894511106498e-03  2.7057799422959966e-03
+  2.6531175760685227e-03  2.6013870509009052e-03  2.5505733086344240e-03  2.5006615295404683e-03  2.4516371275501436e-03
+  2.4034857456453340e-03  2.3561932514012535e-03  2.3097457326723414e-03  2.2641294934160616e-03  2.2193310496436136e-03
+  2.1753371254977782e-03  2.1321346494441173e-03  2.0897107505768314e-03  2.0480527550303662e-03  2.0071481824917164e-03
+  1.9669847428123305e-03  1.9275503327108034e-03  1.8888330325659355e-03  1.8508211032951805e-03  1.8135029833145980e-03
+  1.7768672855772646e-03  1.7409027946878666e-03  1.7055984640891586e-03  1.6709434133182904e-03  1.6369269253308227e-03
+  1.6035384438881917e-03  1.5707675710093030e-03  1.5386040644797400e-03  1.5070378354209296e-03  1.4760589459142243e-03
+  1.4456576066784674e-03  1.4158241748004133e-03  1.3865491515145517e-03  1.3578231800324136e-03  1.3296370434173130e-03
+  1.3019816625059188e-03  1.2748480938728074e-03  1.2482275278369870e-03  1.2221112865106742e-03  1.1964908218862064e-03
+  1.1713577139624703e-03  1.1467036689077198e-03  1.1225205172586891e-03  1.0988002121543120e-03  1.0755348276031765e-03
+  1.0527165567835728e-03  1.0303377103750150e-03  1.0083907149206553e-03  9.8686811121878604e-04  9.6576255274356815e-04
+  9.4506680409354657e-04  9.2477373946662708e-04  9.0487634116191706e-04  8.8536769810608137e-04  8.6624100440530968e-04
+  8.4748955791986991e-04  8.2910675886310736e-04  8.1108610842155551e-04  7.9342120739794852e-04  7.7610575487466887e-04
+  7.5913354689786591e-04  7.4249847518158968e-04  7.2619452583109687e-04  7.1021577808524222e-04  6.9455640307671332e-04
+  6.7921066261025093e-04  6.6417290795844214e-04  6.4943757867335500e-04  6.3499920141575628e-04  6.2085238879914031e-04
+  6.0699183824991856e-04  5.9341233088238896e-04  5.8010873038847818e-04  5.6707598194186137e-04  5.5430911111587280e-04
+  5.4180322281523891e-04  5.2955350022104025e-04  5.1755520374872563e-04  5.0580367001857793e-04  4.9429431083891986e-04
+  4.8302261220136561e-04  4.7198413328763435e-04  4.6117450548847222e-04  4.5058943143359842e-04  4.4022468403297037e-04
+  4.3007610552883886e-04  4.2013960655883260e-04  4.1041116522908330e-04  4.0088682619821882e-04  3.9156269977118005e-04
+  3.8243496100300207e-04  3.7349984881274514e-04  3.6475366510662147e-04  3.5619277391102898e-04  3.4781360051482253e-04
+  3.3961263062063513e-04  3.3158640950565685e-04  3.2373154119109092e-04  3.1604468762060252e-04  3.0852256784754707e-04
+  3.0116195723081836e-04  2.9395968663908575e-04  2.8691264166377101e-04  2.8001776184017647e-04  2.7327203987681688e-04
+  2.6667252089326854e-04  2.6021630166557681e-04  2.5390052988028163e-04  2.4772240339593181e-04  2.4167916951265550e-04
+  2.3576812424967210e-04  2.2998661163024531e-04  2.2433202297460642e-04  2.1880179620031078e-04  2.1339341513026532e-04
+  2.0810440880823181e-04  2.0293235082175821e-04  1.9787485863260665e-04  1.9292959291436311e-04  1.8809425689761319e-04
+  1.8336659572205580e-04  1.7874439579616125e-04  1.7422548416372047e-04  1.6980772787763936e-04  1.6548903338088530e-04
+  1.6126734589430591e-04  1.5714064881157744e-04  1.5310696310104604e-04  1.4916434671449329e-04  1.4531089400280153e-04
+  1.4154473513841234e-04  1.3786403554466153e-04  1.3426699533172857e-04  1.3075184873951283e-04  1.2731686358694039e-04
+  1.2396034072819674e-04  1.2068061351527565e-04  1.1747604726729168e-04  1.1434503874632306e-04  1.1128601563955686e-04
+  1.0829743604811193e-04  1.0537778798212988e-04  1.0252558886227753e-04  9.9739385027582898e-05  9.7017751249615057e-05
+  9.4359290252773662e-05  9.1762632240957511e-05  8.9226434430383569e-05  8.6749380588361721e-05  8.4330180578390864e-05
+  8.1967569911181246e-05  7.9660309301724484e-05  7.7407184232279429e-05  7.5207004521348451e-05  7.3058603898526649e-05
+  7.0960839585107720e-05  6.8912591880629977e-05  6.6912763755002085e-05  6.4960280446513426e-05  6.3054089065330086e-05
+  6.1193158202771814e-05  5.9376477546041213e-05  5.7603057498502742e-05  5.5871928805544500e-05  5.4182142185708361e-05
+  5.2532767967318744e-05  5.0922895730446966e-05  4.9351633954125953e-05  4.7818109668823321e-05  4.6321468114150300e-05
+  4.4860872401664663e-05  4.3435503182825573e-05  4.2044558321957873e-05  4.0687252574273750e-05  3.9362817268785450e-05
+  3.8070499996214428e-05  3.6809564301621984e-05  3.5579289382025496e-05  3.4378969788611451e-05  3.3207915133769052e-05
+  3.2065449802711312e-05  3.0950912669766876e-05  2.9863656819185611e-05  2.8803049270468119e-05  2.7768470708167169e-05
+  2.6759315216115260e-05  2.5774990015931323e-05  2.4814915209964844e-05  2.3878523528387922e-05  2.2965260080560611e-05
+  2.2074582110528148e-05  2.1205958756658535e-05  2.0358870815317476e-05  1.9532810508535560e-05  1.8727281255713447e-05
+  1.7941797449145505e-05  1.7175884233475961e-05  1.6429077288930018e-05  1.5700922618341645e-05  1.4990976337865471e-05
+  1.4298804471386687e-05  1.3623982748522034e-05  1.2966096406226424e-05  1.2324739993882115e-05  1.1699517181902770e-05
+  1.1090040573734860e-05  1.0495931521266495e-05  9.9168199435395021e-06  9.3523441487842465e-06  8.8021506596591475e-06
+  8.2658940417265321e-06  7.7432367350197678e-06  7.2338488887770244e-06  6.7374081991923703e-06  6.2535997501888662e-06
+  5.7821158571569505e-06  5.3226559136389283e-06  4.8749262408651290e-06  4.4386399401326240e-06  4.0135167480073166e-06
+  3.5992828942305738e-06  3.1956709623667747e-06  2.8024197531120341e-06  2.4192741502208947e-06  2.0459849890155880e-06
+  1.6823089274468580e-06  1.3280083196495871e-06  9.8285109196557868e-07  6.4661062138351467e-07  3.1906561636122974e-07
+  0.                      0.                      0.                      0.                      0.
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/LICENSE.md
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/LICENSE.md
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/LICENSE.md
@@ -0,0 +1,18 @@
+License (BSD)
+-------------
+
+Copyright (c) 2013, Los Alamos National Security, LLC  
+All rights reserved.
+
+Copyright 2013. Los Alamos National Security, LLC. This software was produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National Laboratory (LANL), which is operated by Los Alamos National Security, LLC for the U.S. Department of Energy. The U.S. Government has rights to use, reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified to produce derivative works, such modified software should be clearly marked, so as not to confuse it with the version available from LANL.
+
+Additionally, redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+* Neither the name of Los Alamos National Security, LLC, Los Alamos National Laboratory, LANL, the U.S. Government, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/Makefile
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/Makefile
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/Makefile
@@ -0,0 +1,8 @@
+LEVEL = ../../../..
+
+PROG     = CoMD
+CPPFLAGS = -std=c99 -DDOUBLE
+LDFLAGS  = -lm
+RUN_OPTIONS = -e -x 10 -y 10 -z 10 --potDir .
+include $(LEVEL)/MultiSource/Makefile.multisrc
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/README.TXT
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/README.TXT
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/README.TXT
@@ -0,0 +1,81 @@
+**************************************************************************
+LLVM Test-suite Note:
+**************************************************************************
+Beyond this paragraph is the original READMD.TXT from the src-mpi
+directory.  A Makefile and CMakeLists.txt have been created for
+integration into the LLVM test-suite.  The test-suite builds a serial
+version of the code with hardware configuration and timing output
+replaced.
+**************************************************************************
+
+This is CoMD version 1.1
+========================
+
+To contact the developers of CoMD send email to exmatex-comd@llnl.gov.
+
+CoMD is a reference implementation of typical classical molecular
+dynamics algorithms and workloads.  It is created and maintained by
+The Exascale Co-Design Center for Materials in Extreme Environments
+(ExMatEx).  http://codesign.lanl.gov/projects/exmatex.  The
+code is intended to serve as a vehicle for co-design by allowing
+others to extend and/or reimplement it as needed to test performance of 
+new architectures, programming models, etc.
+
+The current version of CoMD is available from:
+
+https://github.com/exmatex/CoMD
+
+
+Obtaining Documentation
+=======================
+
+CoMD documentation is produced by doxygen (www.doxygen.org).  
+In order to generate the call graphs, you will also need graphviz 
+installed. Alternatively, you can set 
+
+HAVE_DOT = NO 
+
+in the file 'Doxyfile' in the source directory. 
+
+To build the documentation, navigate to the source directory 
+(src-mpi) and run the command:
+
+$ doxygen
+
+To read the documenation, point a browser at 'html/index.html'.  
+If you are unable to run doxygen on your local system you can download a
+pre-built html directory from
+
+XXXXXXXXXXXXXXXXXXXXXXX
+
+Or, point your browser at
+
+XXXXXXXXXXXXXXXXXXXXXX
+
+
+Building CoMD
+=============
+
+CoMD is written with portability in mind and should compile using
+practically any compiler that implements the C99 standard.  You will
+need to create a Makefile by copying the sample provided with the
+distribution (Makefile.vanilla).
+
+$ cp Makefile.vanilla Makefile
+
+and use the make command to build the code
+
+$ make
+
+The sample Makefile will compile the code on many platforms.  See
+comments in Makefile.vanilla for information about specifying the name
+of the C compiler and/or additional compiler switches that might be
+necessary for your platform.
+
+
+What's New
+==========
+
+For information about what has changed in this version of CoMD, please
+consult the html documentation.
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/cmdLineParser.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/cmdLineParser.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/cmdLineParser.h
@@ -0,0 +1,40 @@
+/// \file 
+/// A parser for command line arguments.
+///
+/// \author Sriram Swaminarayan
+/// \date July 24, 2007
+
+#ifndef CMDLINEPARSER_H_
+#define CMDLINEPARSER_H_
+
+/// Specifies a command line argument that should be accepted by the program.
+/// \param [in]  longOption  The long name of option i.e., --optionname
+/// \param [in]  shortOption The short name of option i.e., -o
+/// \param [in]  has_arg  Whether this option has an argument i.e., -o value.
+///                       If has_arg is 0, then dataPtr must be an integer
+///                       pointer.
+/// \param [in]  type  The type of the argument. Valid values are:
+///                    -  i   integer
+///                    -  f   float
+///                    -  d   double
+///                    -  s   string
+///                    -  c   character
+///
+/// \param [in]  dataPtr  A pointer to where the value will be stored.
+/// \param [in]  dataSize The length of dataPtr, only useful for character
+///                       strings.
+/// \param [in]  help     A short help string, preferably a single line or
+///                       less.
+int addArg(const char *longOption, const char shortOption,
+           int has_arg, const char type, void *dataPtr, int dataSize,
+           const char *help);
+
+/// Call this to process your arguments.
+void processArgs(int argc, char **argv);
+
+/// Prints the arguments to the stdout stream.
+void printArgs(void);
+
+void freeArgs(void);
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/cmdLineParser.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/cmdLineParser.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/cmdLineParser.c
@@ -0,0 +1,228 @@
+/// \file
+/// A parser for command line arguments.
+///
+/// A general purpose command line parser that uses getopt_long() to parse
+/// the command line.
+///
+/// \author Sriram Swaminarayan
+/// \date July 24, 2007
+
+#include "cmdLineParser.h"
+
+#include <stdio.h>
+#include <getopt.h>
+#include <string.h>
+
+#include "mytype.h"
+#include "memUtils.h"
+
+#define nextOption(o) ((MyOption*) o->next)
+
+typedef struct MyOptionSt
+{
+   char* help;
+   char* longArg;
+   unsigned char shortArg[2];
+   int argFlag;
+   char type;
+   int sz;
+   void* ptr;
+   void* next;
+} MyOption;
+
+static int longest = 1;
+static MyOption* myargs=NULL;
+
+static char* dupString(const char* s)
+{
+   char* d;
+   if ( ! s ) s = "";
+   d = (char*)comdCalloc((strlen(s)+1),sizeof(char));
+   strcpy(d, s);
+   return d;
+}
+
+static MyOption* myOptionAlloc(
+   const char* longOption, const char shortOption,
+   int has_arg, const char type, void* dataPtr, int dataSize, const char* help)
+{
+   static int iBase=129;
+   MyOption* o = (MyOption*)comdCalloc(1, sizeof(MyOption));
+   o->help = dupString(help);
+   o->longArg = dupString(longOption);
+   if(shortOption) o->shortArg[0] = (unsigned char)shortOption;
+   else 
+   {
+      o->shortArg[0] = iBase; 
+      iBase++;
+   }
+   o->argFlag = has_arg;
+   o->type = type;
+   o->ptr = dataPtr;
+   o->sz = dataSize;
+   if(longOption) longest = (longest>strlen(longOption)?longest:strlen(longOption));
+   return o;
+}
+
+static MyOption* myOptionFree(MyOption* o)
+{
+   MyOption* r;
+   if(!o) return NULL;
+   r = nextOption(o);
+   if(o->longArg)free(o->longArg);
+   if(o->help)free(o->help);
+   free(o);
+   return r;
+}
+
+static MyOption* lastOption(MyOption* o)
+{
+   if ( ! o) return o;
+   while(nextOption(o)) o = nextOption(o);
+   return o;
+}
+
+static MyOption* findOption(MyOption* o, unsigned char shortArg)
+{
+   while(o)
+   {
+      if (o->shortArg[0] == shortArg) return o;
+      o = nextOption(o);
+   }
+   return o;
+}
+
+
+int addArg(const char* longOption, const char shortOption,
+           int has_arg, const char type, void* dataPtr, int dataSize,
+           const char* help)
+{
+   MyOption* o;
+   MyOption* p;
+   o = myOptionAlloc(longOption,shortOption,has_arg,type,dataPtr,dataSize, help);
+   if ( ! o ) return 1;
+   if ( ! myargs) myargs = o;
+   else 
+   {
+      p = lastOption(myargs);
+      p->next = (void *)o;
+   }
+   return 0;
+}
+
+
+void freeArgs()
+{
+   while(myargs)
+   {
+      myargs = myOptionFree(myargs);
+   } 
+   return;
+}
+
+void printArgs()
+{
+   MyOption* o = myargs;
+   char s[4096];
+   unsigned char *shortArg;
+   fprintf(screenOut,"\n"
+      "  Arguments are: \n");
+   sprintf(s,"   --%%-%ds",longest);
+   while(o)
+   {
+      if(o->shortArg[0]<0xFF) shortArg = o->shortArg;
+      else shortArg = (unsigned char *) "---";
+      fprintf(screenOut,s,o->longArg);
+      fprintf(screenOut," -%c  arg=%1d type=%c  %s\n",shortArg[0],o->argFlag,o->type,o->help);
+      o = nextOption(o);
+
+   }
+   fprintf(screenOut,"\n\n");
+   return;
+}
+
+void processArgs(int argc, char** argv)
+{
+   MyOption* o;
+   int n=0;
+   int i;
+   struct option* opts;
+   char* sArgs;
+   int c;
+
+   if ( ! myargs) return;
+   o = myargs;
+   while(o)
+   {n++,o=nextOption(o);}
+
+   o = myargs;
+   sArgs= (char*)comdCalloc(2*(n+2),sizeof(char));
+   opts = (struct option*)comdCalloc(n,sizeof(struct option));
+   for (i=0; i<n; i++)
+   {
+      opts[i].name = o->longArg;
+      opts[i].has_arg = o->argFlag;
+      opts[i].flag    = 0;
+      opts[i].val     = o->shortArg[0];
+
+      strcat(sArgs,(char*) o->shortArg);
+      if(o->argFlag) strcat(sArgs,":");
+      o = nextOption(o);
+   }
+
+   while(1)
+   {
+
+      int option_index = 0;
+
+      c = getopt_long (argc, argv, sArgs, opts, &option_index);
+      if ( c == -1) break;
+      o = findOption(myargs,c);
+      if ( ! o )
+      {
+         fprintf(screenOut,"\n\n"
+            "    invalid switch : -%c in getopt()\n"
+            "\n\n",
+            c);
+         break;
+      }      
+      if(! o->argFlag)
+      {
+         int* i = (int*)o->ptr;
+         *i = 1;
+      }
+      else 
+      {
+         switch(o->type)
+         {
+            case 'i':
+               sscanf(optarg,"%d",(int*)o->ptr);
+               break;
+            case 'f':
+               sscanf(optarg,"%f",(float*)o->ptr);
+               break;
+            case 'd':
+               sscanf(optarg,"%lf",(double*)o->ptr);
+               break;
+            case 's':
+               strncpy((char*)o->ptr,(char*)optarg,o->sz);
+               ((char*)o->ptr)[o->sz-1] = '\0';
+               break;
+            case 'c':
+               sscanf(optarg,"%c",(char*)o->ptr);
+               break;
+            default:
+               fprintf(screenOut,"\n\n"
+                  "    invalid type : %c in getopt()\n"
+                  "    valid values are 'e', 'z'. 'i','d','f','s', and 'c'\n"
+                  "\n\n",
+                  c);      
+         }
+      }
+   }
+
+   free(opts);
+   free(sArgs);
+
+   return;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/constants.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/constants.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/constants.h
@@ -0,0 +1,41 @@
+/// \file
+/// Contains constants for unit conversions.
+///
+/// The units for this code are:
+///     - Time in femtoseconds (fs)
+///     - Length in Angstroms (Angs)
+///     - Energy in electron Volts (eV)
+///     - Mass read in as Atomic Mass Units (amu) and then converted for
+///       consistency (energy*time^2/length^2)
+/// Values are taken from NIST, http://physics.nist.gov/cuu/Constants/
+
+#ifndef _CONSTANTS_H_
+#define _CONSTANTS_H_
+
+/// 1 amu in kilograms
+#define amuInKilograms  1.660538921e-27
+
+/// 1 fs in seconds
+#define fsInSeconds     1.0e-15
+
+/// 1 Ang in meters
+#define AngsInMeters    1.0e-10
+
+/// 1 eV in Joules
+#define eVInJoules      1.602176565e-19
+
+/// Internal mass units are eV * fs^2 / Ang^2
+static const double amuToInternalMass =
+         amuInKilograms * AngsInMeters * AngsInMeters
+         / (fsInSeconds * fsInSeconds  * eVInJoules);
+
+/// Boltmann constant in eV's
+static const double kB_eV = 8.6173324e-5;  // eV/K
+
+/// Hartrees to eVs
+static const double hartreeToEv = 27.21138505;
+
+/// Bohrs to Angstroms
+static const double bohrToAngs = 0.52917721092;
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.h
@@ -0,0 +1,33 @@
+/// \file
+/// Parallel domain decomposition.
+
+#ifndef _GEOMETRY_H_
+#define _GEOMETRY_H_
+
+#include "mytype.h"
+
+/// Domain decomposition information.
+typedef struct DomainSt
+{
+   // process-layout data
+   int procGrid[3];        //!< number of processors in each dimension
+   int procCoord[3];       //!< i,j,k for this processor
+
+   // global bounds data
+   real3 globalMin;        //!< minimum global coordinate (angstroms)
+   real3 globalMax;        //!< maximum global coordinate (angstroms)
+   real3 globalExtent;     //!< global size: globalMax - globalMin
+
+   // local bounds data
+   real3 localMin;         //!< minimum coordinate on local processor
+   real3 localMax;         //!< maximum coordinate on local processor
+   real3 localExtent;      //!< localMax - localMin
+} Domain;
+
+struct DomainSt* initDecomposition(int xproc, int yproc, int zproc,
+                                   real3 globalExtent);
+
+/// Find the MPI rank of a neighbor domain from a relative coordinate.
+int processorNum(Domain* domain, int dix, int diy, int dik);
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.c
@@ -0,0 +1,66 @@
+/// \file
+/// Parallel domain decomposition.  This version of CoMD uses a simple
+/// spatial Cartesian domain decomposition.  The simulation box is
+/// divided into equal size bricks by a grid that is xproc by yproc by
+/// zproc in size.
+
+#include "decomposition.h"
+
+#include <assert.h>
+
+#include "memUtils.h"
+#include "parallel.h"
+
+/// \param [in] xproc x-size of domain decomposition grid.
+/// \param [in] yproc y-size of domain decomposition grid.
+/// \param [in] zproc z-size of domain decomposition grid.
+/// \param [in] globalExtent Size of the simulation domain (in Angstroms).
+Domain* initDecomposition(int xproc, int yproc, int zproc, real3 globalExtent)
+{
+   assert( xproc * yproc * zproc == getNRanks());
+
+   Domain* dd = comdMalloc(sizeof(Domain));
+   dd->procGrid[0] = xproc;
+   dd->procGrid[1] = yproc;
+   dd->procGrid[2] = zproc;
+   // calculate grid coordinates i,j,k for this processor
+   int myRank = getMyRank();
+   dd->procCoord[0] = myRank % dd->procGrid[0];
+   myRank /= dd->procGrid[0];
+   dd->procCoord[1] = myRank % dd->procGrid[1];
+   dd->procCoord[2] = myRank / dd->procGrid[1];
+
+   // initialialize global bounds
+   for (int i = 0; i < 3; i++)
+   {
+      dd->globalMin[i] = 0;
+      dd->globalMax[i] = globalExtent[i];
+      dd->globalExtent[i] = dd->globalMax[i] - dd->globalMin[i];
+   }
+   
+   // initialize local bounds on this processor
+   for (int i = 0; i < 3; i++)
+   {
+      dd->localExtent[i] = dd->globalExtent[i] / dd->procGrid[i];
+      dd->localMin[i] = dd->globalMin[i] +  dd->procCoord[i]    * dd->localExtent[i];
+      dd->localMax[i] = dd->globalMin[i] + (dd->procCoord[i]+1) * dd->localExtent[i];
+   }
+
+   return dd;
+}
+
+/// \details
+/// Calculates the rank of the processor with grid coordinates
+/// (ix+dix, iy+diy, iz+diz) where (ix, iy, iz) are the grid coordinates
+/// of the local rank.  Assumes periodic boundary conditions.  The
+/// deltas cannot be smaller than -procGrid[ii].
+int processorNum(Domain* domain, int dix, int diy, int diz)
+{
+   const int* procCoord = domain->procCoord; // alias
+   const int* procGrid  = domain->procGrid;  // alias
+   int ix = (procCoord[0] + dix + procGrid[0]) % procGrid[0];
+   int iy = (procCoord[1] + diy + procGrid[1]) % procGrid[1];
+   int iz = (procCoord[2] + diz + procGrid[2]) % procGrid[2];
+
+   return ix + procGrid[0] *(iy + procGrid[1]*iz);
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.h
@@ -0,0 +1,23 @@
+/// \file
+/// Compute forces for the Embedded Atom Model (EAM).
+
+#ifndef __EAM_H
+#define __EAM_H
+
+#include "mytype.h"
+
+struct BasePotentialSt;
+struct LinkCellSt;
+
+/// Pointers to the data that is needed in the load and unload functions
+/// for the force halo exchange.
+/// \see loadForceBuffer
+/// \see unloadForceBuffer
+typedef struct ForceExchangeDataSt
+{
+   real_t* dfEmbed; //<! derivative of embedding energy
+   struct LinkCellSt* boxes;
+}ForceExchangeData;
+
+struct BasePotentialSt* initEamPot(const char* dir, const char* file, const char* type);
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.c
@@ -0,0 +1,839 @@
+/// \file
+/// Compute forces for the Embedded Atom Model (EAM).
+///
+/// The Embedded Atom Model (EAM) is a widely used model of atomic
+/// interactions in simple metals.
+/// 
+/// http://en.wikipedia.org/wiki/Embedded_atom_model
+///
+/// In the EAM, the total potential energy is written as a sum of a pair
+/// potential and the embedding energy, F:
+///
+/// \f[
+///   U = \sum_{ij} \varphi(r_{ij}) + \sum_i F({\bar\rho_i})
+/// \f]
+///
+/// The pair potential \f$\varphi_{ij}\f$ is a two-body inter-atomic
+/// potential, similar to the Lennard-Jones potential, and
+/// \f$F(\bar\rho)\f$ is interpreted as the energy required to embed an
+/// atom in an electron field with density \f$\bar\rho\f$.  The local
+/// electon density at site i is calulated by summing the "effective
+/// electron density" due to all neighbors of atom i:
+///
+/// \f[
+/// \bar\rho_i = \sum_j \rho_j(r_{ij})
+/// \f]
+///
+/// The force on atom i, \f${\bf F}_i\f$ is given by
+///
+/// \f{eqnarray*}{
+///   {\bf F}_i & = & -\nabla_i \sum_{jk} U(r_{jk})\\
+///       & = & - \sum_j\left\{
+///                  \varphi'(r_{ij}) +
+///                  [F'(\bar\rho_i) + F'(\bar\rho_j)]\rho'(r_{ij})
+///                \right\} \hat{r}_{ij}
+/// \f}
+///
+/// where primes indicate the derivative of a function with respect to
+/// its argument and \f$\hat{r}_{ij}\f$ is a unit vector in the
+/// direction from atom i to atom j.
+///
+/// The form of this force expression has two significant consequences.
+/// First, unlike with a simple pair potential, it is not possible to
+/// compute the potential energy and the forces on the atoms in a single
+/// loop over the pairs.  The terms involving \f$ F'(\bar\rho) \f$
+/// cannot be calculated until \f$ \bar\rho \f$ is known, but
+/// calculating \f$ \bar\rho \f$ requires a loop over the pairs.  Hence
+/// the EAM force routine contains three loops.
+///
+///   -# Loop over all pairs, compute the two-body
+///   interaction and the electron density at each atom
+///   -# Loop over all atoms, compute the embedding energy and its
+///   derivative for each atom
+///   -# Loop over all pairs, compute the embedding
+///   energy contribution to the force and add to the two-body force
+///
+/// The second loop over pairs doubles the data motion requirement
+/// relative to a simple pair potential.
+///
+/// The second consequence of the force expression is that computing the
+/// forces on all atoms requires additional communication beyond the
+/// coordinates of all remote atoms within the cutoff distance.  This is
+/// again because of the terms involving \f$ F'(\bar\rho_j) \f$.  If
+/// atom j is a remote atom, the local task cannot compute \f$
+/// \bar\rho_j \f$.  (Such a calculation would require all the neighbors
+/// of atom j, some of which can be up to 2 times the cutoff distance
+/// away from a local atom---outside the typical halo exchange range.)
+///
+/// To obtain the needed remote density we introduce a second halo
+/// exchange after loop number 2 to communicate \f$ F'(\bar\rho) \f$ for
+/// remote atoms.  This provides the data we need to complete the third
+/// loop, but at the cost of introducing a communication operation in
+/// the middle of the force routine.
+///
+/// At least two alternate methods can be used to deal with the remote
+/// density problem.  One possibility is to extend the halo exchange
+/// radius for the atom exchange to twice the potential cutoff distance.
+/// This is likely undesirable due to large increase in communication
+/// volume.  The other possibility is to accumulate partial force terms
+/// on the tasks where they can be computed.  In this method, tasks will
+/// compute force contributions for remote atoms, then communicate the
+/// partial forces at the end of the halo exchange.  This method has the
+/// advantage that the communication is deffered until after the force
+/// loops, but the disadvantage that three times as much data needs to
+/// be set (three components of the force vector instead of a single
+/// scalar \f$ F'(\bar\rho) \f$.
+
+
+#include "eam.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include "constants.h"
+#include "memUtils.h"
+#include "parallel.h"
+#include "linkCells.h"
+#include "CoMDTypes.h"
+#include "performanceTimers.h"
+#include "haloExchange.h"
+
+#define MAX(A,B) ((A) > (B) ? (A) : (B))
+
+/// Handles interpolation of tabular data.
+///
+/// \see initInterpolationObject
+/// \see interpolate
+typedef struct InterpolationObjectSt 
+{
+   int n;          //!< the number of values in the table
+   real_t x0;      //!< the starting ordinate range
+   real_t invDx;   //!< the inverse of the table spacing
+   real_t* values; //!< the abscissa values
+} InterpolationObject;
+
+/// Derived struct for an EAM potential.
+/// Uses table lookups for function evaluation.
+/// Polymorphic with BasePotential.
+/// \see BasePotential
+typedef struct EamPotentialSt 
+{
+   real_t cutoff;          //!< potential cutoff distance in Angstroms
+   real_t mass;            //!< mass of atoms in intenal units
+   real_t lat;             //!< lattice spacing (angs) of unit cell
+   char latticeType[8];    //!< lattice type, e.g. FCC, BCC, etc.
+   char  name[3];	   //!< element name
+   int	 atomicNo;	   //!< atomic number  
+   int  (*force)(SimFlat* s); //!< function pointer to force routine
+   void (*print)(FILE* file, BasePotential* pot);
+   void (*destroy)(BasePotential** pot); //!< destruction of the potential
+   InterpolationObject* phi;  //!< Pair energy
+   InterpolationObject* rho;  //!< Electron Density
+   InterpolationObject* f;    //!< Embedding Energy
+
+   real_t* rhobar;        //!< per atom storage for rhobar
+   real_t* dfEmbed;       //!< per atom storage for derivative of Embedding
+   HaloExchange* forceExchange;
+   ForceExchangeData* forceExchangeData;
+} EamPotential;
+
+// EAM functionality
+static int eamForce(SimFlat* s);
+static void eamPrint(FILE* file, BasePotential* pot);
+static void eamDestroy(BasePotential** pot); 
+static void eamBcastPotential(EamPotential* pot);
+
+
+// Table interpolation functionality
+static InterpolationObject* initInterpolationObject(
+   int n, real_t x0, real_t dx, real_t* data);
+static void destroyInterpolationObject(InterpolationObject** table);
+static void interpolate(InterpolationObject* table, real_t r, real_t* f, real_t* df);
+static void bcastInterpolationObject(InterpolationObject** table);
+static void printTableData(InterpolationObject* table, const char* fileName);
+
+
+// Read potential tables from files.
+static void eamReadSetfl(EamPotential* pot, const char* dir, const char* potName);
+static void eamReadFuncfl(EamPotential* pot, const char* dir, const char* potName);
+static void fileNotFound(const char* callSite, const char* filename);
+static void notAlloyReady(const char* callSite);
+static void typeNotSupported(const char* callSite, const char* type);
+
+
+/// Allocate and initialize the EAM potential data structure.
+///
+/// \param [in] dir   The directory in which potential table files are found.
+/// \param [in] file  The name of the potential table file.
+/// \param [in] type  The file format of the potential file (setfl or funcfl).
+BasePotential* initEamPot(const char* dir, const char* file, const char* type)
+{
+   EamPotential* pot = comdMalloc(sizeof(EamPotential));
+   assert(pot);
+   pot->force = eamForce;
+   pot->print = eamPrint;
+   pot->destroy = eamDestroy;
+   pot->phi = NULL;
+   pot->rho = NULL;
+   pot->f   = NULL;
+
+   // Initialization of the next three items requires information about
+   // the parallel decomposition and link cells that isn't available
+   // with the potential is initialized.  Hence, we defer their
+   // initialization until the first time we call the force routine.
+   pot->dfEmbed = NULL;
+   pot->rhobar  = NULL;
+   pot->forceExchange = NULL;
+
+   if (getMyRank() == 0)
+   {
+      if (strcmp(type, "setfl" ) == 0)
+         eamReadSetfl(pot, dir, file);
+      else if (strcmp(type,"funcfl") == 0)
+         eamReadFuncfl(pot, dir, file);
+      else
+         typeNotSupported("initEamPot", type);
+   }
+   eamBcastPotential(pot);
+   
+   return (BasePotential*) pot;
+}
+
+/// Calculate potential energy and forces for the EAM potential.
+///
+/// Three steps are required:
+///
+///   -# Loop over all atoms and their neighbors, compute the two-body
+///   interaction and the electron density at each atom
+///   -# Loop over all atoms, compute the embedding energy and its
+///   derivative for each atom
+///   -# Loop over all atoms and their neighbors, compute the embedding
+///   energy contribution to the force and add to the two-body force
+/// 
+int eamForce(SimFlat* s)
+{
+   EamPotential* pot = (EamPotential*) s->pot;
+   assert(pot);
+
+   // set up halo exchange and internal storage on first call to forces.
+   if (pot->forceExchange == NULL)
+   {
+      int maxTotalAtoms = MAXATOMS*s->boxes->nTotalBoxes;
+      pot->dfEmbed = comdMalloc(maxTotalAtoms*sizeof(real_t));
+      pot->rhobar  = comdMalloc(maxTotalAtoms*sizeof(real_t));
+      pot->forceExchange = initForceHaloExchange(s->domain, s->boxes);
+      pot->forceExchangeData = comdMalloc(sizeof(ForceExchangeData));
+      pot->forceExchangeData->dfEmbed = pot->dfEmbed;
+      pot->forceExchangeData->boxes = s->boxes;
+   }
+   
+   real_t rCut2 = pot->cutoff*pot->cutoff;
+
+   // zero forces / energy / rho /rhoprime
+   real_t etot = 0.0;
+   memset(s->atoms->f,  0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real3));
+   memset(s->atoms->U,  0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real_t));
+   memset(pot->dfEmbed, 0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real_t));
+   memset(pot->rhobar,  0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real_t));
+
+   int nbrBoxes[27];
+   // loop over local boxes
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++)
+   {
+      int nIBox = s->boxes->nAtoms[iBox];
+      int nNbrBoxes = getNeighborBoxes(s->boxes, iBox, nbrBoxes);
+      // loop over neighbor boxes of iBox (some may be halo boxes)
+      for (int jTmp=0; jTmp<nNbrBoxes; jTmp++)
+      {
+         int jBox = nbrBoxes[jTmp];
+         if (jBox < iBox ) continue;
+
+         int nJBox = s->boxes->nAtoms[jBox];
+         // loop over atoms in iBox
+         for (int iOff=MAXATOMS*iBox,ii=0; ii<nIBox; ii++,iOff++)
+         {
+            // loop over atoms in jBox
+            for (int jOff=MAXATOMS*jBox,ij=0; ij<nJBox; ij++,jOff++)
+            {
+               if ( (iBox==jBox) &&(ij <= ii) ) continue;
+
+               double r2 = 0.0;
+               real3 dr;
+               for (int k=0; k<3; k++)
+               {
+                  dr[k]=s->atoms->r[iOff][k]-s->atoms->r[jOff][k];
+                  r2+=dr[k]*dr[k];
+               }
+               if(r2>rCut2) continue;
+
+               double r = sqrt(r2);
+
+               real_t phiTmp, dPhi, rhoTmp, dRho;
+               interpolate(pot->phi, r, &phiTmp, &dPhi);
+               interpolate(pot->rho, r, &rhoTmp, &dRho);
+
+               for (int k=0; k<3; k++)
+               {
+                  s->atoms->f[iOff][k] -= dPhi*dr[k]/r;
+                  s->atoms->f[jOff][k] += dPhi*dr[k]/r;
+               }
+
+               // update energy terms
+               // calculate energy contribution based on whether
+               // the neighbor box is local or remote
+               if (jBox < s->boxes->nLocalBoxes)
+                  etot += phiTmp;
+               else
+                  etot += 0.5*phiTmp;
+
+               s->atoms->U[iOff] += 0.5*phiTmp;
+               s->atoms->U[jOff] += 0.5*phiTmp;
+
+               // accumulate rhobar for each atom
+               pot->rhobar[iOff] += rhoTmp;
+               pot->rhobar[jOff] += rhoTmp;
+
+            } // loop over atoms in jBox
+         } // loop over atoms in iBox
+      } // loop over neighbor boxes
+   } // loop over local boxes
+
+   // Compute Embedding Energy
+   // loop over all local boxes
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++)
+   {
+      int iOff;
+      int nIBox =  s->boxes->nAtoms[iBox];
+
+      // loop over atoms in iBox
+      for (int iOff=MAXATOMS*iBox,ii=0; ii<nIBox; ii++,iOff++)
+      {
+         real_t fEmbed, dfEmbed;
+         interpolate(pot->f, pot->rhobar[iOff], &fEmbed, &dfEmbed);
+         pot->dfEmbed[iOff] = dfEmbed; // save derivative for halo exchange
+         etot += fEmbed; 
+         s->atoms->U[iOff] += fEmbed;
+      }
+   }
+
+   // exchange derivative of the embedding energy with repsect to rhobar
+   startTimer(eamHaloTimer);
+   haloExchange(pot->forceExchange, pot->forceExchangeData);
+   stopTimer(eamHaloTimer);
+
+   // third pass
+   // loop over local boxes
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++)
+   {
+      int nIBox =  s->boxes->nAtoms[iBox];
+      int nNbrBoxes = getNeighborBoxes(s->boxes, iBox, nbrBoxes);
+      // loop over neighbor boxes of iBox (some may be halo boxes)
+      for (int jTmp=0; jTmp<nNbrBoxes; jTmp++)
+      {
+         int jBox = nbrBoxes[jTmp];
+         if(jBox < iBox) continue;
+
+         int nJBox = s->boxes->nAtoms[jBox];
+         // loop over atoms in iBox
+         for (int iOff=MAXATOMS*iBox,ii=0; ii<nIBox; ii++,iOff++)
+         {
+            // loop over atoms in jBox
+            for (int jOff=MAXATOMS*jBox,ij=0; ij<nJBox; ij++,jOff++)
+            { 
+               if ((iBox==jBox) && (ij <= ii))  continue;
+
+               double r2 = 0.0;
+               real3 dr;
+               for (int k=0; k<3; k++)
+               {
+                  dr[k]=s->atoms->r[iOff][k]-s->atoms->r[jOff][k];
+                  r2+=dr[k]*dr[k];
+               }
+               if(r2>=rCut2) continue;
+
+               real_t r = sqrt(r2);
+
+               real_t rhoTmp, dRho;
+               interpolate(pot->rho, r, &rhoTmp, &dRho);
+
+               for (int k=0; k<3; k++)
+               {
+                  s->atoms->f[iOff][k] -= (pot->dfEmbed[iOff]+pot->dfEmbed[jOff])*dRho*dr[k]/r;
+                  s->atoms->f[jOff][k] += (pot->dfEmbed[iOff]+pot->dfEmbed[jOff])*dRho*dr[k]/r;
+               }
+
+            } // loop over atoms in jBox
+         } // loop over atoms in iBox
+      } // loop over neighbor boxes
+   } // loop over local boxes
+
+   s->ePotential = (real_t) etot;
+
+   return 0;
+}
+
+void eamPrint(FILE* file, BasePotential* pot)
+{
+   EamPotential *eamPot = (EamPotential*) pot;
+   fprintf(file, "  Potential type  : EAM\n");
+   fprintf(file, "  Species name    : %s\n", eamPot->name);
+   fprintf(file, "  Atomic number   : %d\n", eamPot->atomicNo);
+   fprintf(file, "  Mass            : "FMT1" amu\n", eamPot->mass/amuToInternalMass); // print in amu
+   fprintf(file, "  Lattice type    : %s\n", eamPot->latticeType);
+   fprintf(file, "  Lattice spacing : "FMT1" Angstroms\n", eamPot->lat);
+   fprintf(file, "  Cutoff          : "FMT1" Angstroms\n", eamPot->cutoff);
+}
+
+void eamDestroy(BasePotential** pPot)
+{
+   if ( ! pPot ) return;
+   EamPotential* pot = *(EamPotential**)pPot;
+   if ( ! pot ) return;
+   destroyInterpolationObject(&(pot->phi));
+   destroyInterpolationObject(&(pot->rho));
+   destroyInterpolationObject(&(pot->f));
+   destroyHaloExchange(&(pot->forceExchange));
+   comdFree(pot);
+   *pPot = NULL;
+
+   return;
+}
+
+/// Broadcasts an EamPotential from rank 0 to all other ranks.
+/// If the table coefficients are read from a file only rank 0 does the
+/// read.  Hence we need to broadcast the potential to all other ranks.
+void eamBcastPotential(EamPotential* pot)
+{
+   assert(pot);
+   
+   struct 
+   {
+      real_t cutoff, mass, lat;
+      char latticeType[8];
+      char name[3];
+      int atomicNo;
+   } buf;
+   if (getMyRank() == 0)
+   {
+      buf.cutoff   = pot->cutoff;
+      buf.mass     = pot->mass;
+      buf.lat      = pot->lat;
+      buf.atomicNo = pot->atomicNo;
+      strcpy(buf.latticeType, pot->latticeType);
+      strcpy(buf.name, pot->name);
+   }
+   bcastParallel(&buf, sizeof(buf), 0);
+   pot->cutoff   = buf.cutoff;
+   pot->mass     = buf.mass;
+   pot->lat      = buf.lat;
+   pot->atomicNo = buf.atomicNo;
+   strcpy(pot->latticeType, buf.latticeType);
+   strcpy(pot->name, buf.name);
+
+   bcastInterpolationObject(&pot->phi);
+   bcastInterpolationObject(&pot->rho);
+   bcastInterpolationObject(&pot->f);
+}
+
+/// Builds a structure to store interpolation data for a tabular
+/// function.  Interpolation must be supported on the range
+/// \f$[x_0, x_n]\f$, where \f$x_n = n*dx\f$.
+///
+/// \see interpolate
+/// \see bcastInterpolationObject
+/// \see destroyInterpolationObject
+///
+/// \param [in] n    number of values in the table.
+/// \param [in] x0   minimum ordinate value of the table.
+/// \param [in] dx   spacing of the ordinate values.
+/// \param [in] data abscissa values.  An array of size n. 
+InterpolationObject* initInterpolationObject(
+   int n, real_t x0, real_t dx, real_t* data)
+{
+   InterpolationObject* table =
+      (InterpolationObject *)comdMalloc(sizeof(InterpolationObject)) ;
+   assert(table);
+
+   table->values = (real_t*)comdCalloc(1, (n+3)*sizeof(real_t));
+   assert(table->values);
+
+   table->values++; 
+   table->n = n;
+   table->invDx = 1.0/dx;
+   table->x0 = x0;
+
+   for (int ii=0; ii<n; ++ii)
+      table->values[ii] = data[ii];
+   
+   table->values[-1] = table->values[0];
+   table->values[n+1] = table->values[n] = table->values[n-1];
+
+   return table;
+}
+
+void destroyInterpolationObject(InterpolationObject** a)
+{
+   if ( ! a ) return;
+   if ( ! *a ) return;
+   if ( (*a)->values)
+   {
+      (*a)->values--;
+      comdFree((*a)->values);
+   }
+   comdFree(*a);
+   *a = NULL;
+
+   return;
+}
+
+/// Interpolate a table to determine f(r) and its derivative f'(r).
+///
+/// The forces on the particle are much more sensitive to the derivative
+/// of the potential than on the potential itself.  It is therefore
+/// absolutely essential that the interpolated derivatives are smooth
+/// and continuous.  This function uses simple quadratic interpolation
+/// to find f(r).  Since quadric interpolants don't have smooth
+/// derivatives, f'(r) is computed using a 4 point finite difference
+/// stencil.
+///
+/// Interpolation is used heavily by the EAM force routine so this
+/// function is a potential performance hot spot.  Feel free to
+/// reimplement this function (and initInterpolationObject if necessay)
+/// with any higher performing implementation of interpolation, as long
+/// as the alternate implmentation that has the required smoothness
+/// properties.  Cubic splines are one common alternate choice.
+///
+/// \param [in] table Interpolation table.
+/// \param [in] r Point where function value is needed.
+/// \param [out] f The interpolated value of f(r).
+/// \param [out] df The interpolated value of df(r)/dr.
+void interpolate(InterpolationObject* table, real_t r, real_t* f, real_t* df)
+{
+   const real_t* tt = table->values; // alias
+   
+   if ( r < table->x0 ) r = table->x0;
+
+   r = (r-table->x0)*(table->invDx) ;
+   int ii = (int)floor(r);
+   if (ii > table->n)
+   {
+      ii = table->n;
+      r = table->n / table->invDx;
+   }
+   // reset r to fractional distance
+   r = r - floor(r);
+
+   real_t g1 = tt[ii+1] - tt[ii-1];
+   real_t g2 = tt[ii+2] - tt[ii];
+
+   *f = tt[ii] + 0.5*r*(g1 + r*(tt[ii+1] + tt[ii-1] - 2.0*tt[ii]) );
+
+   *df = 0.5*(g1 + r*(g2-g1))*table->invDx;
+}
+
+/// Broadcasts an InterpolationObject from rank 0 to all other ranks.
+///
+/// It is commonly the case that the data needed to create the
+/// interpolation table is available on only one task (for example, only
+/// one task has read the data from a file).  Broadcasting the table
+/// eliminates the need to put broadcast code in multiple table readers.
+///
+/// \see eamBcastPotential
+void bcastInterpolationObject(InterpolationObject** table)
+{
+   struct
+   {
+      int n;
+      real_t x0, invDx;
+   } buf;
+
+   if (getMyRank() == 0)
+   {
+      buf.n     = (*table)->n;
+      buf.x0    = (*table)->x0;
+      buf.invDx = (*table)->invDx;
+   }
+   bcastParallel(&buf, sizeof(buf), 0);
+
+   if (getMyRank() != 0)
+   {
+      assert(*table == NULL);
+      *table = comdMalloc(sizeof(InterpolationObject));
+      (*table)->n      = buf.n;
+      (*table)->x0     = buf.x0;
+      (*table)->invDx  = buf.invDx;
+      (*table)->values = comdMalloc(sizeof(real_t) * (buf.n+3) );
+      (*table)->values++;
+   }
+   
+   int valuesSize = sizeof(real_t) * ((*table)->n+3);
+   bcastParallel((*table)->values-1, valuesSize, 0);
+}
+
+void printTableData(InterpolationObject* table, const char* fileName)
+{
+   if (!printRank()) return;
+
+   FILE* potData;
+   potData = fopen(fileName,"w");
+   real_t dR = 1.0/table->invDx;
+   for (int i = 0; i<table->n; i++)
+   {
+      real_t r = table->x0+i*dR;
+      fprintf(potData, "%d %e %e\n", i, r, table->values[i]);
+   }
+   fclose(potData);
+}
+   
+/// Reads potential data from a setfl file and populates
+/// corresponding members and InterpolationObjects in an EamPotential.
+///
+/// setfl is a file format for tabulated potential functions used by
+/// the original EAM code DYNAMO.  A setfl file contains EAM
+/// potentials for multiple elements.
+///
+/// The contents of a setfl file are:
+///
+/// | Line Num | Description
+/// | :------: | :----------
+/// | 1 - 3    | comments
+/// | 4        | ntypes type1 type2 ... typen
+/// | 5        | nrho     drho     nr   dr   rcutoff
+/// | F, rho   | Following line 5 there is a block for each atom type with F, and rho.
+/// | b1       | ielem(i)   amass(i)     latConst(i)    latType(i)
+/// | b2       | embedding function values F(rhobar) starting at rhobar=0
+/// |   ...    | (nrho values. Multiple values per line allowed.)
+/// | bn       | electron density, starting at r=0
+/// |   ...    | (nr values. Multiple values per line allowed.)
+/// | repeat   | Return to b1 for each atom type.
+/// | phi      | phi_ij for (1,1), (2,1), (2,2), (3,1), (3,2), (3,3), (4,1), ..., 
+/// | p1       | pair potential between type i and type j, starting at r=0
+/// |   ...    | (nr values. Multiple values per line allowed.)
+/// | repeat   | Return to p1 for each phi_ij
+///
+/// Where:
+///    -  ntypes        :      number of element types in the potential  
+///    -  nrho          :      number of points the embedding energy F(rhobar)
+///    -  drho          :      table spacing for rhobar 
+///    -  nr            :      number of points for rho(r) and phi(r)
+///    -  dr            :      table spacing for r in Angstroms
+///    -  rcutoff       :      cut-off distance in Angstroms
+///    -  ielem(i)      :      atomic number for element(i)
+///    -  amass(i)      :      atomic mass for element(i) in AMU
+///    -  latConst(i)   :      lattice constant for element(i) in Angstroms
+///    -  latType(i)    :      lattice type for element(i)  
+///
+/// setfl format stores r*phi(r), so we need to converted to the pair
+/// potential phi(r).  In the file, phi(r)*r is in eV*Angstroms.
+/// NB: phi is not defined for r = 0
+///
+/// F(rhobar) is in eV.
+///
+void eamReadSetfl(EamPotential* pot, const char* dir, const char* potName)
+{
+   char tmp[4096];
+   sprintf(tmp, "%s/%s", dir, potName);
+
+   FILE* potFile = fopen(tmp, "r");
+   if (potFile == NULL)
+      fileNotFound("eamReadSetfl", tmp);
+   
+   // read the first 3 lines (comments)
+   fgets(tmp, sizeof(tmp), potFile);
+   fgets(tmp, sizeof(tmp), potFile);
+   fgets(tmp, sizeof(tmp), potFile);
+
+   // line 4
+   fgets(tmp, sizeof(tmp), potFile);
+   int nElems;
+   sscanf(tmp, "%d", &nElems);
+   if( nElems != 1 )
+      notAlloyReady("eamReadSetfl");
+
+   //line 5
+   int nRho, nR;
+   double dRho, dR, cutoff;
+   //  The same cutoff is used by all alloys, NB: cutoff = nR * dR is redundant
+   fgets(tmp, sizeof(tmp), potFile);
+   sscanf(tmp, "%d %le %d %le %le", &nRho, &dRho, &nR, &dR, &cutoff);
+   pot->cutoff = cutoff;
+
+   // **** THIS CODE IS RESTRICTED TO ONE ELEMENT
+   // Per-atom header 
+   fgets(tmp, sizeof(tmp), potFile);
+   int nAtomic;
+   double mass, lat;
+   char latticeType[8];
+   sscanf(tmp, "%d %le %le %s", &nAtomic, &mass, &lat, latticeType);
+   pot->atomicNo = nAtomic;
+   pot->lat = lat;
+   pot->mass = mass * amuToInternalMass;  // file has mass in AMU.
+   strcpy(pot->latticeType, latticeType);
+   
+   // allocate read buffer
+   int bufSize = MAX(nRho, nR);
+   real_t* buf = comdMalloc(bufSize * sizeof(real_t));
+   real_t x0 = 0.0;
+
+   // Read embedding energy F(rhobar)
+   for (int ii=0; ii<nRho; ++ii)
+      fscanf(potFile, FMT1, buf+ii);
+   pot->f = initInterpolationObject(nRho, x0, dRho, buf);
+
+   // Read electron density rho(r)
+   for (int ii=0; ii<nR; ++ii)
+      fscanf(potFile, FMT1, buf+ii);
+   pot->rho = initInterpolationObject(nR, x0, dR, buf);
+
+   // Read phi(r)*r and convert to phi(r)
+   for (int ii=0; ii<nR; ++ii)
+      fscanf(potFile, FMT1, buf+ii);
+   for (int ii=1; ii<nR; ++ii)
+   {
+      real_t r = x0 + ii*dR;
+      buf[ii] /= r;
+   }
+   buf[0] = buf[1] + (buf[1] - buf[2]); // Linear interpolation to get phi[0].
+   pot->phi = initInterpolationObject(nR, x0, dR, buf);
+
+   comdFree(buf);
+
+   // write to text file for comparison, currently commented out
+/*    printPot(pot->f, "SetflDataF.txt"); */
+/*    printPot(pot->rho, "SetflDataRho.txt"); */
+/*    printPot(pot->phi, "SetflDataPhi.txt");  */
+}
+
+/// Reads potential data from a funcfl file and populates
+/// corresponding members and InterpolationObjects in an EamPotential.
+/// 
+/// funcfl is a file format for tabulated potential functions used by
+/// the original EAM code DYNAMO.  A funcfl file contains an EAM
+/// potential for a single element.
+/// 
+/// The contents of a funcfl file are:
+///
+/// | Line Num | Description
+/// | :------: | :----------
+/// | 1        | comments
+/// | 2        | elem amass latConstant latType
+/// | 3        | nrho   drho   nr   dr    rcutoff
+/// | 4        | embedding function values F(rhobar) starting at rhobar=0
+/// |    ...   | (nrho values. Multiple values per line allowed.)
+/// | x'       | electrostatic interation Z(r) starting at r=0
+/// |    ...   | (nr values. Multiple values per line allowed.)
+/// | y'       | electron density values rho(r) starting at r=0
+/// |    ...   | (nr values. Multiple values per line allowed.)
+///
+/// Where:
+///    -  elem          :   atomic number for this element
+///    -  amass         :   atomic mass for this element in AMU
+///    -  latConstant   :   lattice constant for this elemnent in Angstroms
+///    -  lattticeType  :   lattice type for this element (e.g. FCC) 
+///    -  nrho          :   number of values for the embedding function, F(rhobar)
+///    -  drho          :   table spacing for rhobar
+///    -  nr            :   number of values for Z(r) and rho(r)
+///    -  dr            :   table spacing for r in Angstroms
+///    -  rcutoff       :   potential cut-off distance in Angstroms
+///
+/// funcfl format stores the "electrostatic interation" Z(r).  This needs to
+/// be converted to the pair potential phi(r).
+/// using the formula 
+/// \f[phi = Z(r) * Z(r) / r\f]
+/// NB: phi is not defined for r = 0
+///
+/// Z(r) is in atomic units (i.e., sqrt[Hartree * bohr]) so it is
+/// necesary to convert to eV.
+///
+/// F(rhobar) is in eV.
+///
+void eamReadFuncfl(EamPotential* pot, const char* dir, const char* potName)
+{
+   char tmp[4096];
+
+   sprintf(tmp, "%s/%s", dir, potName);
+   FILE* potFile = fopen(tmp, "r");
+   if (potFile == NULL)
+      fileNotFound("eamReadFuncfl", tmp);
+
+   // line 1
+   fgets(tmp, sizeof(tmp), potFile);
+   char name[3];
+   sscanf(tmp, "%s", name);
+   strcpy(pot->name, name);
+
+   // line 2
+   int nAtomic;
+   double mass, lat;
+   char latticeType[8];
+   fgets(tmp,sizeof(tmp),potFile);
+   sscanf(tmp, "%d %le %le %s", &nAtomic, &mass, &lat, latticeType);
+   pot->atomicNo = nAtomic;
+   pot->lat = lat;
+   pot->mass = mass*amuToInternalMass; // file has mass in AMU.
+   strcpy(pot->latticeType, latticeType);
+
+   // line 3
+   int nRho, nR;
+   double dRho, dR, cutoff;
+   fgets(tmp,sizeof(tmp),potFile);
+   sscanf(tmp, "%d %le %d %le %le", &nRho, &dRho, &nR, &dR, &cutoff);
+   pot->cutoff = cutoff;
+   real_t x0 = 0.0; // tables start at zero.
+
+   // allocate read buffer
+   int bufSize = MAX(nRho, nR);
+   real_t* buf = comdMalloc(bufSize * sizeof(real_t));
+
+   // read embedding energy
+   for (int ii=0; ii<nRho; ++ii)
+      fscanf(potFile, FMT1, buf+ii);
+   pot->f = initInterpolationObject(nRho, x0, dRho, buf);
+
+   // read Z(r) and convert to phi(r)
+   for (int ii=0; ii<nR; ++ii)
+      fscanf(potFile, FMT1, buf+ii);
+   for (int ii=1; ii<nR; ++ii)
+   {
+      real_t r = x0 + ii*dR;
+      buf[ii] *= buf[ii] / r;
+      buf[ii] *= hartreeToEv * bohrToAngs; // convert to eV
+   }
+   buf[0] = buf[1] + (buf[1] - buf[2]); // linear interpolation to get phi[0].
+   pot->phi = initInterpolationObject(nR, x0, dR, buf);
+
+   // read electron density rho
+   for (int ii=0; ii<nR; ++ii)
+      fscanf(potFile, FMT1, buf+ii);
+   pot->rho = initInterpolationObject(nR, x0, dR, buf);
+
+   comdFree(buf);
+   
+/*    printPot(pot->f,   "funcflDataF.txt"); */
+/*    printPot(pot->rho, "funcflDataRho.txt"); */
+/*    printPot(pot->phi, "funcflDataPhi.txt"); */
+}
+
+void fileNotFound(const char* callSite, const char* filename)
+{
+   fprintf(screenOut,
+           "%s: Can't open file %s.  Fatal Error.\n", callSite, filename);
+   exit(-1);
+}
+
+void notAlloyReady(const char* callSite)
+{
+   fprintf(screenOut,
+          "%s: CoMD 1.1 does not support alloys and cannot\n"
+           "   read setfl files with multiple species.  Fatal Error.\n", callSite);
+   exit(-1);
+}
+
+void typeNotSupported(const char* callSite, const char* type)
+{
+   fprintf(screenOut,
+          "%s: Potential type %s not supported. Fatal Error.\n", callSite, type);
+   exit(-1);
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.h
@@ -0,0 +1,86 @@
+/// \file
+/// Communicate halo data such as "ghost" atoms with neighboring tasks.
+
+#ifndef __HALO_EXCHANGE_
+#define __HALO_EXCHANGE_
+
+#include "mytype.h"
+struct AtomsSt;
+struct LinkCellSt;
+struct DomainSt;
+
+/// A polymorphic structure to store information about a halo exchange.
+/// This structure can be thought of as an abstract base class that
+/// specifies the interface and implements the communication patterns of
+/// a halo exchange.  Concrete sub-classes supply actual implementations
+/// of the loadBuffer, unloadBuffer, and destroy functions, that are
+/// specific to the actual data being exchanged.  If the subclass needs
+/// additional data members, these can be stored in a structure that is
+/// pointed to by parms.
+///
+/// Designing the structure this way allows us to re-use the
+/// communication code for both atom data and partial force data.
+///
+/// \see eamForce
+/// \see redistributeAtoms
+typedef struct HaloExchangeSt
+{
+   /// The MPI ranks of the six face neighbors of the local domain.
+   /// Ranks are stored in the order specified in HaloFaceOrder.
+   int nbrRank[6];
+   /// The maximum send/recv buffer size (in bytes) that will be needed
+   /// for this halo exchange.
+   int bufCapacity;
+   /// Pointer to a sub-class specific function to load the send buffer.
+   /// \param [in] parms The parms member of the structure.  This is a
+   ///                   pointer to a sub-class specific structure that can
+   ///                   be used by the load and unload functions to store
+   ///                   sub-class specific data.
+   /// \param [in] data  A pointer to a structure that the contains the data
+   ///                   that is needed by the loadBuffer function.  The
+   ///                   loadBuffer function will cast the pointer to a
+   ///                   concrete type that is appropriate for the data
+   ///                   being exchanged.
+   /// \param [in] face  Specifies the face across which data is being sent.
+   /// \param [in] buf   The send buffer to be loaded
+   /// \return The number of bytes loaded into the send buffer.
+   int  (*loadBuffer)(void* parms, void* data, int face, char* buf);
+   /// Pointer to a sub-class specific function to unload the recv buffer.
+   /// \param [in] parms The parms member of the structure.  This is a
+   ///                   pointer to a sub-class specific structure that can
+   ///                   be used by the load and unload functions to store
+   ///                   sub-class specific data.
+   /// \param [out] data A pointer to a structure that the contains the data
+   ///                   that is needed by the unloadBuffer function.  The
+   ///                   unloadBuffer function will cast the pointer to a
+   ///                   concrete type that is appropriate for the data
+   ///                   being exchanged.
+   /// \param [in] face  Specifies the face across which data is being sent.
+   /// \param [in] bufSize The number of bytes in the recv buffer.
+   /// \param [in] buf   The recv buffer to be unloaded.
+   void (*unloadBuffer)(void* parms, void* data, int face, int bufSize, char* buf);
+   /// Pointer to a function to deallocate any memory used by the
+   /// sub-class parms.  Essentially this is a virtual destructor.
+   void (*destroy)(void* parms);
+   /// A pointer to a sub-class specific structure that contains
+   /// additional data members needed by the sub-class.
+   void* parms;
+} 
+HaloExchange;
+
+/// Create a HaloExchange for atom data.
+HaloExchange* initAtomHaloExchange(struct DomainSt* domain, struct LinkCellSt* boxes);
+
+/// Create a HaloExchange for force data.
+HaloExchange* initForceHaloExchange(struct DomainSt* domain, struct LinkCellSt* boxes);
+
+/// HaloExchange destructor.
+void destroyHaloExchange(HaloExchange** haloExchange);
+
+/// Execute a halo exchange.
+void haloExchange(HaloExchange* haloExchange, void* data);
+
+/// Sort the atoms by gid in the specified link cell.
+void sortAtomsInCell(struct AtomsSt* atoms, struct LinkCellSt* boxes, int iBox);
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.c
@@ -0,0 +1,662 @@
+/// \file
+/// Communicate halo data such as "ghost" atoms with neighboring tasks.
+/// In addition to ghost atoms, the EAM potential also needs to exchange
+/// some force information.  Hence this file implements both an atom
+/// exchange and a force exchange, each with slightly different
+/// properties due to their different roles.
+/// 
+/// The halo exchange in CoMD 1.1 takes advantage of the Cartesian domain
+/// decomposition as well as the link cell structure to quickly
+/// determine what data needs to be sent.  
+///
+/// This halo exchange implementation is able to send data to all 26
+/// neighboring tasks using only 6 messages.  This is accomplished by
+/// sending data across the x-faces, then the y-faces, and finally
+/// across the z-faces.  Some of the data that was received from the
+/// x-faces is included in the y-face sends and so on.  This
+/// accumulation of data allows data to reach edge neighbors and corner
+/// neighbors by a two or three step process.
+///
+/// The advantage of this type of structured halo exchange is that it
+/// minimizes the number of MPI messages to send, and maximizes the size
+/// of those messages.
+///
+/// The disadvantage of this halo exchange is that it serializes message
+/// traffic.  Only two messages can be in flight at once. The x-axis
+/// messages must be received and processed before the y-axis messages
+/// can begin.  Architectures with low message latency and many off node
+/// network links would likely benefit from alternate halo exchange
+/// strategies that send independent messages to each neighbor task.
+
+#include "haloExchange.h"
+
+#include <assert.h>
+
+#include "CoMDTypes.h"
+#include "decomposition.h"
+#include "parallel.h"
+#include "linkCells.h"
+#include "eam.h"
+#include "memUtils.h"
+#include "performanceTimers.h"
+
+#define MAX(A,B) ((A) > (B) ? (A) : (B))
+
+/// Don't change the order of the faces in this enum.
+enum HaloFaceOrder {HALO_X_MINUS, HALO_X_PLUS,
+                    HALO_Y_MINUS, HALO_Y_PLUS,
+                    HALO_Z_MINUS, HALO_Z_PLUS};
+
+/// Don't change the order of the axes in this enum.
+enum HaloAxisOrder {HALO_X_AXIS, HALO_Y_AXIS, HALO_Z_AXIS};
+
+/// Extra data members that are needed for the exchange of atom data.
+/// For an atom exchange, the HaloExchangeSt::parms will point to a
+/// structure of this type.
+typedef struct AtomExchangeParmsSt
+{
+   int nCells[6];        //!< Number of cells in cellList for each face.
+   int* cellList[6];     //!< List of link cells from which to load data for each face.
+   real_t* pbcFactor[6]; //!< Whether this face is a periodic boundary.
+}
+AtomExchangeParms;
+
+/// Extra data members that are needed for the exchange of force data.
+/// For an force exchange, the HaloExchangeSt::parms will point to a
+/// structure of this type.
+typedef struct ForceExchangeParmsSt
+{
+   int nCells[6];     //!< Number of cells to send/recv for each face.
+   int* sendCells[6]; //!< List of link cells to send for each face.
+   int* recvCells[6]; //!< List of link cells to recv for each face.
+}
+ForceExchangeParms;
+
+/// A structure to package data for a single atom to pack into a
+/// send/recv buffer.  Also used for sorting atoms within link cells.
+typedef struct AtomMsgSt
+{
+   int gid;
+   int type;
+   real_t rx, ry, rz;
+   real_t px, py, pz;
+}
+AtomMsg;
+
+/// Package data for the force exchange.
+typedef struct ForceMsgSt
+{
+   real_t dfEmbed;
+}
+ForceMsg;
+
+static HaloExchange* initHaloExchange(Domain* domain);
+static void exchangeData(HaloExchange* haloExchange, void* data, int iAxis);
+
+static int* mkAtomCellList(LinkCell* boxes, enum HaloFaceOrder iFace, const int nCells);
+static int loadAtomsBuffer(void* vparms, void* data, int face, char* charBuf);
+static void unloadAtomsBuffer(void* vparms, void* data, int face, int bufSize, char* charBuf);
+static void destroyAtomsExchange(void* vparms);
+
+static int* mkForceSendCellList(LinkCell* boxes, int face, int nCells);
+static int* mkForceRecvCellList(LinkCell* boxes, int face, int nCells);
+static int loadForceBuffer(void* vparms, void* data, int face, char* charBuf);
+static void unloadForceBuffer(void* vparms, void* data, int face, int bufSize, char* charBuf);
+static void destroyForceExchange(void* vparms);
+static int sortAtomsById(const void* a, const void* b);
+
+/// \details
+/// When called in proper sequence by redistributeAtoms, the atom halo
+/// exchange helps serve three purposes:
+/// - Send ghost atom data to neighbor tasks.
+/// - Shift atom coordinates by the global simulation size when they cross
+///   periodic boundaries.  This shift is performed in loadAtomsBuffer.
+/// - Transfer ownership of atoms between tasks as the atoms move across
+///   spatial domain boundaries.  This transfer of ownership occurs in
+///   two places.  The former owner gives up ownership when
+///   updateLinkCells moves a formerly local atom into a halo link cell.
+///   The new owner accepts ownership when unloadAtomsBuffer calls
+///   putAtomInBox to place a received atom into a local link cell.
+///
+/// This constructor does the following:
+///
+/// - Sets the bufCapacity to hold the largest possible number of atoms
+///   that can be sent across a face.
+/// - Initialize function pointers to the atom-specific versions
+/// - Sets the number of link cells to send across each face.
+/// - Builds the list of link cells to send across each face.  As
+///   explained in the comments for mkAtomCellList, this list must
+///   include any link cell, local or halo, that could possibly contain
+///   an atom that needs to be sent across the face.  Atoms that need to
+///   be sent include "ghost atoms" that are located in local link
+///   cells that correspond to halo link cells on receiving tasks as well as
+///   formerly local atoms that have just moved into halo link cells and
+///   need to be sent to the rank that owns the spatial domain the atom
+///   has moved into.
+/// - Sets a coordinate shift factor for each face to account for
+///   periodic boundary conditions.  For most faces the factor is zero.
+///   For faces on the +x, +y, or +z face of the simulation domain
+///   the factor is -1.0 (to shift the coordinates by -1 times the
+///   simulation domain size).  For -x, -y, and -z faces of the
+///   simulation domain, the factor is +1.0.
+///
+/// \see redistributeAtoms
+HaloExchange* initAtomHaloExchange(Domain* domain, LinkCell* boxes)
+{
+   HaloExchange* hh = initHaloExchange(domain);
+   
+   int size0 = (boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
+   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
+   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
+   int maxSize = MAX(size0, size1);
+   maxSize = MAX(size1, size2);
+   hh->bufCapacity = maxSize*2*MAXATOMS*sizeof(AtomMsg);
+   
+   hh->loadBuffer = loadAtomsBuffer;
+   hh->unloadBuffer = unloadAtomsBuffer;
+   hh->destroy = destroyAtomsExchange;
+
+   AtomExchangeParms* parms = comdMalloc(sizeof(AtomExchangeParms));
+
+   parms->nCells[HALO_X_MINUS] = 2*(boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
+   parms->nCells[HALO_Y_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
+   parms->nCells[HALO_Z_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
+   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
+   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
+   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];
+
+   for (int ii=0; ii<6; ++ii)
+      parms->cellList[ii] = mkAtomCellList(boxes, ii, parms->nCells[ii]);
+
+   for (int ii=0; ii<6; ++ii)
+   {
+      parms->pbcFactor[ii] = comdMalloc(3*sizeof(real_t));
+      for (int jj=0; jj<3; ++jj)
+         parms->pbcFactor[ii][jj] = 0.0;
+   }
+   int* procCoord = domain->procCoord; //alias
+   int* procGrid  = domain->procGrid; //alias
+   if (procCoord[HALO_X_AXIS] == 0)                       parms->pbcFactor[HALO_X_MINUS][HALO_X_AXIS] = +1.0;
+   if (procCoord[HALO_X_AXIS] == procGrid[HALO_X_AXIS]-1) parms->pbcFactor[HALO_X_PLUS][HALO_X_AXIS]  = -1.0;
+   if (procCoord[HALO_Y_AXIS] == 0)                       parms->pbcFactor[HALO_Y_MINUS][HALO_Y_AXIS] = +1.0;
+   if (procCoord[HALO_Y_AXIS] == procGrid[HALO_Y_AXIS]-1) parms->pbcFactor[HALO_Y_PLUS][HALO_Y_AXIS]  = -1.0;
+   if (procCoord[HALO_Z_AXIS] == 0)                       parms->pbcFactor[HALO_Z_MINUS][HALO_Z_AXIS] = +1.0;
+   if (procCoord[HALO_Z_AXIS] == procGrid[HALO_Z_AXIS]-1) parms->pbcFactor[HALO_Z_PLUS][HALO_Z_AXIS]  = -1.0;
+   
+   hh->parms = parms;
+   return hh;
+}
+
+/// The force exchange is considerably simpler than the atom exchange.
+/// In the force case we only need to exchange data that is needed to
+/// complete the force calculation.  Since the atoms have not moved we
+/// only need to send data from local link cells and we are guaranteed
+/// that the same atoms exist in the same order in corresponding halo
+/// cells on remote tasks.  The only tricky part is the size of the
+/// plane of local cells that needs to be sent grows in each direction.
+/// This is because the y-axis send must send some of the data that was
+/// received from the x-axis send, and the z-axis must send some data
+/// from the y-axis send.  This accumulation of data to send is
+/// responsible for data reaching neighbor cells that share only edges
+/// or corners.
+///
+/// \see eam.c for an explanation of the requirement to exchange
+/// force data.
+HaloExchange* initForceHaloExchange(Domain* domain, LinkCell* boxes)
+{
+   HaloExchange* hh = initHaloExchange(domain);
+
+   hh->loadBuffer = loadForceBuffer;
+   hh->unloadBuffer = unloadForceBuffer;
+   hh->destroy = destroyForceExchange;
+
+   int size0 = (boxes->gridSize[1])*(boxes->gridSize[2]);
+   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]);
+   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
+   int maxSize = MAX(size0, size1);
+   maxSize = MAX(size1, size2);
+   hh->bufCapacity = (maxSize)*MAXATOMS*sizeof(ForceMsg);
+
+   ForceExchangeParms* parms = comdMalloc(sizeof(ForceExchangeParms));
+
+   parms->nCells[HALO_X_MINUS] = (boxes->gridSize[1]  )*(boxes->gridSize[2]  );
+   parms->nCells[HALO_Y_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[2]  );
+   parms->nCells[HALO_Z_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
+   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
+   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
+   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];
+
+   for (int ii=0; ii<6; ++ii)
+   {
+      parms->sendCells[ii] = mkForceSendCellList(boxes, ii, parms->nCells[ii]);
+      parms->recvCells[ii] = mkForceRecvCellList(boxes, ii, parms->nCells[ii]);
+   }
+   
+   hh->parms = parms;
+   return hh;
+}
+
+void destroyHaloExchange(HaloExchange** haloExchange)
+{
+   (*haloExchange)->destroy((*haloExchange)->parms);
+   comdFree((*haloExchange)->parms);
+   comdFree(*haloExchange);
+   *haloExchange = NULL;
+}
+
+void haloExchange(HaloExchange* haloExchange, void* data)
+{
+   for (int iAxis=0; iAxis<3; ++iAxis)
+      exchangeData(haloExchange, data, iAxis);
+}
+
+/// Base class constructor.
+HaloExchange* initHaloExchange(Domain* domain)
+{
+   HaloExchange* hh = comdMalloc(sizeof(HaloExchange));
+
+   // Rank of neighbor task for each face.
+   hh->nbrRank[HALO_X_MINUS] = processorNum(domain, -1,  0,  0);
+   hh->nbrRank[HALO_X_PLUS]  = processorNum(domain, +1,  0,  0);
+   hh->nbrRank[HALO_Y_MINUS] = processorNum(domain,  0, -1,  0);
+   hh->nbrRank[HALO_Y_PLUS]  = processorNum(domain,  0, +1,  0);
+   hh->nbrRank[HALO_Z_MINUS] = processorNum(domain,  0,  0, -1);
+   hh->nbrRank[HALO_Z_PLUS]  = processorNum(domain,  0,  0, +1);
+   hh->bufCapacity = 0; // will be set by sub-class.
+
+   return hh;
+}
+
+/// This is the function that does the heavy lifting for the
+/// communication of halo data.  It is called once for each axis and
+/// sends and receives two message.  Loading and unloading of the
+/// buffers is in the hands of the sub-class virtual functions.
+///
+/// \param [in] iAxis     Axis index.
+/// \param [in, out] data Pointer to data that will be passed to the load and
+///                       unload functions
+void exchangeData(HaloExchange* haloExchange, void* data, int iAxis)
+{
+   enum HaloFaceOrder faceM = 2*iAxis;
+   enum HaloFaceOrder faceP = faceM+1;
+
+   char* sendBufM = comdMalloc(haloExchange->bufCapacity);
+   char* sendBufP = comdMalloc(haloExchange->bufCapacity);
+   char* recvBufM = comdMalloc(haloExchange->bufCapacity);
+   char* recvBufP = comdMalloc(haloExchange->bufCapacity);
+
+   int nSendM = haloExchange->loadBuffer(haloExchange->parms, data, faceM, sendBufM);
+   int nSendP = haloExchange->loadBuffer(haloExchange->parms, data, faceP, sendBufP);
+
+   int nbrRankM = haloExchange->nbrRank[faceM];
+   int nbrRankP = haloExchange->nbrRank[faceP];
+
+   int nRecvM, nRecvP;
+
+   startTimer(commHaloTimer);
+   nRecvP = sendReceiveParallel(sendBufM, nSendM, nbrRankM, recvBufP, haloExchange->bufCapacity, nbrRankP);
+   nRecvM = sendReceiveParallel(sendBufP, nSendP, nbrRankP, recvBufM, haloExchange->bufCapacity, nbrRankM);
+   stopTimer(commHaloTimer);
+   
+   haloExchange->unloadBuffer(haloExchange->parms, data, faceM, nRecvM, recvBufM);
+   haloExchange->unloadBuffer(haloExchange->parms, data, faceP, nRecvP, recvBufP);
+   comdFree(recvBufP);
+   comdFree(recvBufM);
+   comdFree(sendBufP);
+   comdFree(sendBufM);
+}
+
+/// Make a list of link cells that need to be sent across the specified
+/// face.  For each face, the list must include all cells, local and
+/// halo, in the first two planes of link cells.  Halo cells must be
+/// included in the list of link cells to send since local atoms may
+/// have moved from local cells into halo cells on this time step.
+/// (Actual remote atoms should have been deleted, so the halo cells
+/// should contain only these few atoms that have just crossed.)
+/// Sending these atoms will allow them to be reassigned to the task
+/// that covers the spatial domain they have moved into.
+///
+/// Note that link cell grid coordinates range from -1 to gridSize[iAxis].
+/// \see initLinkCells for an explanation link cell grid coordinates.
+///
+/// \param [in] boxes  Link cell information.
+/// \param [in] iFace  Index of the face data will be sent across.
+/// \param [in] nCells Number of cells to send.  This is used for a
+///                    consistency check.
+/// \return The list of cells to send.  Caller is responsible to free
+/// the list.
+int* mkAtomCellList(LinkCell* boxes, enum HaloFaceOrder iFace, const int nCells)
+{
+   int* list = comdMalloc(nCells*sizeof(int));
+   int xBegin = -1;
+   int xEnd   = boxes->gridSize[0]+1;
+   int yBegin = -1;
+   int yEnd   = boxes->gridSize[1]+1;
+   int zBegin = -1;
+   int zEnd   = boxes->gridSize[2]+1;
+
+   if (iFace == HALO_X_MINUS) xEnd = xBegin+2;
+   if (iFace == HALO_X_PLUS)  xBegin = xEnd-2;
+   if (iFace == HALO_Y_MINUS) yEnd = yBegin+2;
+   if (iFace == HALO_Y_PLUS)  yBegin = yEnd-2;
+   if (iFace == HALO_Z_MINUS) zEnd = zBegin+2;
+   if (iFace == HALO_Z_PLUS)  zBegin = zEnd-2;
+
+   int count = 0;
+   for (int ix=xBegin; ix<xEnd; ++ix)
+      for (int iy=yBegin; iy<yEnd; ++iy)
+         for (int iz=zBegin; iz<zEnd; ++iz)
+            list[count++] = getBoxFromTuple(boxes, ix, iy, iz);
+   assert(count == nCells);
+   return list;
+}
+
+/// The loadBuffer function for a halo exchange of atom data.  Iterates
+/// link cells in the cellList and load any atoms into the send buffer.
+/// This function also shifts coordinates of the atoms by an appropriate
+/// factor if they are being sent across a periodic boundary.
+///
+/// \see HaloExchangeSt::loadBuffer for an explanation of the loadBuffer
+/// parameters.
+int loadAtomsBuffer(void* vparms, void* data, int face, char* charBuf)
+{
+   AtomExchangeParms* parms = (AtomExchangeParms*) vparms;
+   SimFlat* s = (SimFlat*) data;
+   AtomMsg* buf = (AtomMsg*) charBuf;
+   
+   real_t* pbcFactor = parms->pbcFactor[face];
+   real3 shift;
+   shift[0] = pbcFactor[0] * s->domain->globalExtent[0];
+   shift[1] = pbcFactor[1] * s->domain->globalExtent[1];
+   shift[2] = pbcFactor[2] * s->domain->globalExtent[2];
+   
+   int nCells = parms->nCells[face];
+   int* cellList = parms->cellList[face];
+   int nBuf = 0;
+   for (int iCell=0; iCell<nCells; ++iCell)
+   {
+      int iBox = cellList[iCell];
+      int iOff = iBox*MAXATOMS;
+      for (int ii=iOff; ii<iOff+s->boxes->nAtoms[iBox]; ++ii)
+      {
+         buf[nBuf].gid  = s->atoms->gid[ii];
+         buf[nBuf].type = s->atoms->iSpecies[ii];
+         buf[nBuf].rx = s->atoms->r[ii][0] + shift[0];
+         buf[nBuf].ry = s->atoms->r[ii][1] + shift[1];
+         buf[nBuf].rz = s->atoms->r[ii][2] + shift[2];
+         buf[nBuf].px = s->atoms->p[ii][0];
+         buf[nBuf].py = s->atoms->p[ii][1];
+         buf[nBuf].pz = s->atoms->p[ii][2];
+         ++nBuf;
+      }
+   }
+   return nBuf*sizeof(AtomMsg);
+}
+
+/// The unloadBuffer function for a halo exchange of atom data.
+/// Iterates the receive buffer and places each atom that was received
+/// into the link cell that corresponds to the atom coordinate.  Note
+/// that this naturally accomplishes transfer of ownership of atoms that
+/// have moved from one spatial domain to another.  Atoms with
+/// coordinates in local link cells automatically become local
+/// particles.  Atoms that are owned by other ranks are automatically
+/// placed in halo kink cells.
+/// \see HaloExchangeSt::unloadBuffer for an explanation of the
+/// unloadBuffer parameters.
+void unloadAtomsBuffer(void* vparms, void* data, int face, int bufSize, char* charBuf)
+{
+   AtomExchangeParms* parms = (AtomExchangeParms*) vparms;
+   SimFlat* s = (SimFlat*) data;
+   AtomMsg* buf = (AtomMsg*) charBuf;
+   int nBuf = bufSize / sizeof(AtomMsg);
+   assert(bufSize % sizeof(AtomMsg) == 0);
+   
+   for (int ii=0; ii<nBuf; ++ii)
+   {
+      int gid   = buf[ii].gid;
+      int type  = buf[ii].type;
+      real_t rx = buf[ii].rx;
+      real_t ry = buf[ii].ry;
+      real_t rz = buf[ii].rz;
+      real_t px = buf[ii].px;
+      real_t py = buf[ii].py;
+      real_t pz = buf[ii].pz;
+      putAtomInBox(s->boxes, s->atoms, gid, type, rx, ry, rz, px, py, pz);
+   }
+}
+
+void destroyAtomsExchange(void* vparms)
+{
+   AtomExchangeParms* parms = (AtomExchangeParms*) vparms;
+
+   for (int ii=0; ii<6; ++ii)
+   {
+      comdFree(parms->pbcFactor[ii]);
+      comdFree(parms->cellList[ii]);
+   }
+}
+
+/// Make a list of link cells that need to send data across the
+/// specified face.  Note that this list must be compatible with the
+/// corresponding recv list to ensure that the data goes to the correct
+/// atoms.
+///
+/// \see initLinkCells for information about the conventions for grid
+/// coordinates of link cells.
+int* mkForceSendCellList(LinkCell* boxes, int face, int nCells)
+{
+   int* list = comdMalloc(nCells*sizeof(int));
+   int xBegin, xEnd, yBegin, yEnd, zBegin, zEnd;
+
+   int nx = boxes->gridSize[0];
+   int ny = boxes->gridSize[1];
+   int nz = boxes->gridSize[2];
+   switch(face)
+   {
+     case HALO_X_MINUS:
+      xBegin=0;    xEnd=1;    yBegin=0;    yEnd=ny;   zBegin=0;    zEnd=nz;
+      break;
+     case HALO_X_PLUS:
+      xBegin=nx-1; xEnd=nx;   yBegin=0;    yEnd=ny;   zBegin=0;    zEnd=nz;
+      break;
+     case HALO_Y_MINUS:
+      xBegin=-1;   xEnd=nx+1; yBegin=0;    yEnd=1;    zBegin=0;    zEnd=nz;
+      break;
+     case HALO_Y_PLUS:
+      xBegin=-1;   xEnd=nx+1; yBegin=ny-1; yEnd=ny;   zBegin=0;    zEnd=nz;
+      break;
+     case HALO_Z_MINUS:
+      xBegin=-1;   xEnd=nx+1; yBegin=-1;   yEnd=ny+1; zBegin=0;    zEnd=1;
+      break;
+     case HALO_Z_PLUS:
+      xBegin=-1;   xEnd=nx+1; yBegin=-1;   yEnd=ny+1; zBegin=nz-1; zEnd=nz;
+      break;
+     default:
+      assert(1==0);
+   }
+   
+   int count = 0;
+   for (int ix=xBegin; ix<xEnd; ++ix)
+      for (int iy=yBegin; iy<yEnd; ++iy)
+         for (int iz=zBegin; iz<zEnd; ++iz)
+            list[count++] = getBoxFromTuple(boxes, ix, iy, iz);
+   
+   assert(count == nCells);
+   return list;
+}
+
+/// Make a list of link cells that need to receive data across the
+/// specified face.  Note that this list must be compatible with the
+/// corresponding send list to ensure that the data goes to the correct
+/// atoms.
+///
+/// \see initLinkCells for information about the conventions for grid
+/// coordinates of link cells.
+int* mkForceRecvCellList(LinkCell* boxes, int face, int nCells)
+{
+   int* list = comdMalloc(nCells*sizeof(int));
+   int xBegin, xEnd, yBegin, yEnd, zBegin, zEnd;
+
+   int nx = boxes->gridSize[0];
+   int ny = boxes->gridSize[1];
+   int nz = boxes->gridSize[2];
+   switch(face)
+   {
+     case HALO_X_MINUS:
+      xBegin=-1; xEnd=0;    yBegin=0;  yEnd=ny;   zBegin=0;  zEnd=nz;
+      break;
+     case HALO_X_PLUS:
+      xBegin=nx; xEnd=nx+1; yBegin=0;  yEnd=ny;   zBegin=0;  zEnd=nz;
+      break;
+     case HALO_Y_MINUS:
+      xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=0;    zBegin=0;  zEnd=nz;
+      break;
+     case HALO_Y_PLUS:
+      xBegin=-1; xEnd=nx+1; yBegin=ny; yEnd=ny+1; zBegin=0;  zEnd=nz;
+      break;
+     case HALO_Z_MINUS:
+      xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=ny+1; zBegin=-1; zEnd=0;
+      break;
+     case HALO_Z_PLUS:
+      xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=ny+1; zBegin=nz; zEnd=nz+1;
+      break;
+     default:
+      assert(1==0);
+   }
+   
+   int count = 0;
+   for (int ix=xBegin; ix<xEnd; ++ix)
+      for (int iy=yBegin; iy<yEnd; ++iy)
+         for (int iz=zBegin; iz<zEnd; ++iz)
+            list[count++] = getBoxFromTuple(boxes, ix, iy, iz);
+   
+   assert(count == nCells);
+   return list;
+}
+
+/// The loadBuffer function for a force exchange.
+/// Iterate the send list and load the derivative of the embedding
+/// energy with respect to the local density into the send buffer.
+///
+/// \see HaloExchangeSt::loadBuffer for an explanation of the loadBuffer
+/// parameters.
+int loadForceBuffer(void* vparms, void* vdata, int face, char* charBuf)
+{
+   ForceExchangeParms* parms = (ForceExchangeParms*) vparms;
+   ForceExchangeData* data = (ForceExchangeData*) vdata;
+   ForceMsg* buf = (ForceMsg*) charBuf;
+   
+   int nCells = parms->nCells[face];
+   int* cellList = parms->sendCells[face];
+   int nBuf = 0;
+   for (int iCell=0; iCell<nCells; ++iCell)
+   {
+      int iBox = cellList[iCell];
+      int iOff = iBox*MAXATOMS;
+      for (int ii=iOff; ii<iOff+data->boxes->nAtoms[iBox]; ++ii)
+      {
+         buf[nBuf].dfEmbed = data->dfEmbed[ii];
+         ++nBuf;
+      }
+   }
+   return nBuf*sizeof(ForceMsg);
+}
+
+/// The unloadBuffer function for a force exchange.
+/// Data is received in an order that naturally aligns with the atom
+/// storage so it is simple to put the data where it belongs.
+///
+/// \see HaloExchangeSt::unloadBuffer for an explanation of the
+/// unloadBuffer parameters.
+void unloadForceBuffer(void* vparms, void* vdata, int face, int bufSize, char* charBuf)
+{
+   ForceExchangeParms* parms = (ForceExchangeParms*) vparms;
+   ForceExchangeData* data = (ForceExchangeData*) vdata;
+   ForceMsg* buf = (ForceMsg*) charBuf;
+   assert(bufSize % sizeof(ForceMsg) == 0);
+   
+   int nCells = parms->nCells[face];
+   int* cellList = parms->recvCells[face];
+   int iBuf = 0;
+   for (int iCell=0; iCell<nCells; ++iCell)
+   {
+      int iBox = cellList[iCell];
+      int iOff = iBox*MAXATOMS;
+      for (int ii=iOff; ii<iOff+data->boxes->nAtoms[iBox]; ++ii)
+      {
+         data->dfEmbed[ii] = buf[iBuf].dfEmbed;
+         ++iBuf;
+      }
+   }
+   assert(iBuf == bufSize/ sizeof(ForceMsg));
+}
+
+void destroyForceExchange(void* vparms)
+{
+   ForceExchangeParms* parms = (ForceExchangeParms*) vparms;
+
+   for (int ii=0; ii<6; ++ii)
+   {
+      comdFree(parms->sendCells[ii]);
+      comdFree(parms->recvCells[ii]);
+   }
+}
+
+/// \details
+/// The force exchange assumes that the atoms are in the same order in
+/// both a given local link cell and the corresponding remote cell(s).
+/// However, the atom exchange does not guarantee this property,
+/// especially when atoms cross a domain decomposition boundary and move
+/// from one task to another.  Trying to maintain the atom order during
+/// the atom exchange would immensely complicate that code.  Instead, we
+/// just sort the atoms after the atom exchange.
+void sortAtomsInCell(Atoms* atoms, LinkCell* boxes, int iBox)
+{
+   int nAtoms = boxes->nAtoms[iBox];
+
+   AtomMsg tmp[nAtoms];
+
+   int begin = iBox*MAXATOMS;
+   int end = begin + nAtoms;
+   for (int ii=begin, iTmp=0; ii<end; ++ii, ++iTmp)
+   {
+      tmp[iTmp].gid  = atoms->gid[ii];
+      tmp[iTmp].type = atoms->iSpecies[ii];
+      tmp[iTmp].rx =   atoms->r[ii][0];
+      tmp[iTmp].ry =   atoms->r[ii][1];
+      tmp[iTmp].rz =   atoms->r[ii][2];
+      tmp[iTmp].px =   atoms->p[ii][0];
+      tmp[iTmp].py =   atoms->p[ii][1];
+      tmp[iTmp].pz =   atoms->p[ii][2];
+   }
+   qsort(&tmp, nAtoms, sizeof(AtomMsg), sortAtomsById);
+   for (int ii=begin, iTmp=0; ii<end; ++ii, ++iTmp)
+   {
+      atoms->gid[ii]   = tmp[iTmp].gid;
+      atoms->iSpecies[ii] = tmp[iTmp].type;
+      atoms->r[ii][0]  = tmp[iTmp].rx;
+      atoms->r[ii][1]  = tmp[iTmp].ry;
+      atoms->r[ii][2]  = tmp[iTmp].rz;
+      atoms->p[ii][0]  = tmp[iTmp].px;
+      atoms->p[ii][1]  = tmp[iTmp].py;
+      atoms->p[ii][2]  = tmp[iTmp].pz;
+   }
+   
+}
+
+///  A function suitable for passing to qsort to sort atoms by gid.
+///  Because every atom in the simulation is supposed to have a unique
+///  id, this function checks that the atoms have different gids.  If
+///  that assertion ever fails it is a sign that something has gone
+///  wrong elsewhere in the code.
+int sortAtomsById(const void* a, const void* b)
+{
+   int aId = ((AtomMsg*) a)->gid;
+   int bId = ((AtomMsg*) b)->gid;
+   assert(aId != bId);
+
+   if (aId < bId)
+      return -1;
+   return 1;
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.h
@@ -0,0 +1,38 @@
+/// \file
+/// Initialize the atom configuration.
+
+#ifndef __INIT_ATOMS_H
+#define __INIT_ATOMS_H
+
+#include "mytype.h"
+
+struct SimFlatSt;
+struct LinkCellSt;
+
+/// Atom data
+typedef struct AtomsSt
+{
+   // atom-specific data
+   int nLocal;    //!< total number of atoms on this processor
+   int nGlobal;   //!< total number of atoms in simulation
+
+   int* gid;      //!< A globally unique id for each atom
+   int* iSpecies; //!< the species index of the atom
+
+   real3*  r;     //!< positions
+   real3*  p;     //!< momenta of atoms
+   real3*  f;     //!< forces 
+   real_t* U;     //!< potential energy per atom
+} Atoms;
+
+
+/// Allocates memory to store atom data.
+Atoms* initAtoms(struct LinkCellSt* boxes);
+void destroyAtoms(struct AtomsSt* atoms);
+
+void createFccLattice(int nx, int ny, int nz, real_t lat, struct SimFlatSt* s);
+
+void setVcm(struct SimFlatSt* s, real_t vcm[3]);
+void setTemperature(struct SimFlatSt* s, real_t temperature);
+void randomDisplacements(struct SimFlatSt* s, real_t delta);
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.c
@@ -0,0 +1,232 @@
+/// \file
+/// Initialize the atom configuration.
+
+#include "initAtoms.h"
+
+#include <math.h>
+#include <assert.h>
+
+#include "constants.h"
+#include "decomposition.h"
+#include "parallel.h"
+#include "random.h"
+#include "linkCells.h"
+#include "timestep.h"
+#include "memUtils.h"
+#include "performanceTimers.h"
+
+static void computeVcm(SimFlat* s, real_t vcm[3]);
+
+/// \details
+/// Call functions such as createFccLattice and setTemperature to set up
+/// initial atom positions and momenta.
+Atoms* initAtoms(LinkCell* boxes)
+{
+   Atoms* atoms = comdMalloc(sizeof(Atoms));
+
+   int maxTotalAtoms = MAXATOMS*boxes->nTotalBoxes;
+
+   atoms->gid =      (int*)   comdMalloc(maxTotalAtoms*sizeof(int));
+   atoms->iSpecies = (int*)   comdMalloc(maxTotalAtoms*sizeof(int));
+   atoms->r =        (real3*) comdMalloc(maxTotalAtoms*sizeof(real3));
+   atoms->p =        (real3*) comdMalloc(maxTotalAtoms*sizeof(real3));
+   atoms->f =        (real3*) comdMalloc(maxTotalAtoms*sizeof(real3));
+   atoms->U =        (real_t*)comdMalloc(maxTotalAtoms*sizeof(real_t));
+
+   atoms->nLocal = 0;
+   atoms->nGlobal = 0;
+
+   for (int iOff = 0; iOff < maxTotalAtoms; iOff++)
+   {
+      atoms->gid[iOff] = 0;
+      atoms->iSpecies[iOff] = 0;
+      zeroReal3(atoms->r[iOff]);
+      zeroReal3(atoms->p[iOff]);
+      zeroReal3(atoms->f[iOff]);
+      atoms->U[iOff] = 0.;
+   }
+
+   return atoms;
+}
+
+void destroyAtoms(Atoms *atoms)
+{
+   freeMe(atoms,gid);
+   freeMe(atoms,iSpecies);
+   freeMe(atoms,r);
+   freeMe(atoms,p);
+   freeMe(atoms,f);
+   freeMe(atoms,U);
+   comdFree(atoms);
+}
+
+/// Creates atom positions on a face centered cubic (FCC) lattice with
+/// nx * ny * nz unit cells and lattice constant lat.
+/// Set momenta to zero.
+void createFccLattice(int nx, int ny, int nz, real_t lat, SimFlat* s)
+{
+   const real_t* localMin = s->domain->localMin; // alias
+   const real_t* localMax = s->domain->localMax; // alias
+   
+   int nb = 4; // number of atoms in the basis
+   real3 basis[4] = { {0.25, 0.25, 0.25},
+      {0.25, 0.75, 0.75},
+      {0.75, 0.25, 0.75},
+      {0.75, 0.75, 0.25} };
+
+   // create and place atoms
+   int begin[3];
+   int end[3];
+   for (int ii=0; ii<3; ++ii)
+   {
+      begin[ii] = floor(localMin[ii]/lat);
+      end[ii]   = ceil (localMax[ii]/lat);
+   }
+
+   real_t px,py,pz;
+   px=py=pz=0.0;
+   for (int ix=begin[0]; ix<end[0]; ++ix)
+      for (int iy=begin[1]; iy<end[1]; ++iy)
+         for (int iz=begin[2]; iz<end[2]; ++iz)
+            for (int ib=0; ib<nb; ++ib)
+            {
+               real_t rx = (ix+basis[ib][0]) * lat;
+               real_t ry = (iy+basis[ib][1]) * lat;
+               real_t rz = (iz+basis[ib][2]) * lat;
+               if (rx < localMin[0] || rx >= localMax[0]) continue;
+               if (ry < localMin[1] || ry >= localMax[1]) continue;
+               if (rz < localMin[2] || rz >= localMax[2]) continue;
+               int id = ib+nb*(iz+nz*(iy+ny*(ix)));
+               putAtomInBox(s->boxes, s->atoms, id, 0, rx, ry, rz, px, py, pz);
+            }
+
+   // set total atoms in simulation
+   startTimer(commReduceTimer);
+   addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1);
+   stopTimer(commReduceTimer);
+
+   assert(s->atoms->nGlobal == nb*nx*ny*nz);
+}
+
+/// Sets the center of mass velocity of the system.
+/// \param [in] newVcm The desired center of mass velocity.
+void setVcm(SimFlat* s, real_t newVcm[3])
+{
+   real_t oldVcm[3];
+   computeVcm(s, oldVcm);
+
+   real_t vShift[3];
+   vShift[0] = (newVcm[0] - oldVcm[0]);
+   vShift[1] = (newVcm[1] - oldVcm[1]);
+   vShift[2] = (newVcm[2] - oldVcm[2]);
+
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; ++iBox)
+   {
+      for (int iOff=MAXATOMS*iBox, ii=0; ii<s->boxes->nAtoms[iBox]; ++ii, ++iOff)
+      {
+         int iSpecies = s->atoms->iSpecies[iOff];
+         real_t mass = s->species[iSpecies].mass;
+
+         s->atoms->p[iOff][0] += mass * vShift[0];
+         s->atoms->p[iOff][1] += mass * vShift[1];
+         s->atoms->p[iOff][2] += mass * vShift[2];
+      }
+   }
+}
+
+/// Sets the temperature of system.
+///
+/// Selects atom velocities randomly from a boltzmann (equilibrium)
+/// distribution that corresponds to the specified temperature.  This
+/// random process will typically result in a small, but non zero center
+/// of mass velocity and a small difference from the specified
+/// temperature.  For typical MD runs these small differences are
+/// unimportant, However, to avoid possible confusion, we set the center
+/// of mass velocity to zero and scale the velocities to exactly match
+/// the input temperature.
+void setTemperature(SimFlat* s, real_t temperature)
+{
+   // set initial velocities for the distribution
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; ++iBox)
+   {
+      for (int iOff=MAXATOMS*iBox, ii=0; ii<s->boxes->nAtoms[iBox]; ++ii, ++iOff)
+      {
+         int iType = s->atoms->iSpecies[iOff];
+         real_t mass = s->species[iType].mass;
+         real_t sigma = sqrt(kB_eV * temperature/mass);
+         uint64_t seed = mkSeed(s->atoms->gid[iOff], 123);
+         s->atoms->p[iOff][0] = mass * sigma * gasdev(&seed);
+         s->atoms->p[iOff][1] = mass * sigma * gasdev(&seed);
+         s->atoms->p[iOff][2] = mass * sigma * gasdev(&seed);
+      }
+   }
+   // compute the resulting temperature
+   // kinetic energy  = 3/2 kB * Temperature 
+   if (temperature == 0.0) return;
+   real_t vZero[3] = {0., 0., 0.};
+   setVcm(s, vZero);
+   kineticEnergy(s);
+   real_t temp = (s->eKinetic/s->atoms->nGlobal)/kB_eV/1.5;
+   // scale the velocities to achieve the target temperature
+   real_t scaleFactor = sqrt(temperature/temp);
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; ++iBox)
+   {
+      for (int iOff=MAXATOMS*iBox, ii=0; ii<s->boxes->nAtoms[iBox]; ++ii, ++iOff)
+      {
+         s->atoms->p[iOff][0] *= scaleFactor;
+         s->atoms->p[iOff][1] *= scaleFactor;
+         s->atoms->p[iOff][2] *= scaleFactor;
+      }
+   }
+   kineticEnergy(s);
+   temp = s->eKinetic/s->atoms->nGlobal/kB_eV/1.5;
+}
+
+/// Add a random displacement to the atom positions.
+/// Atoms are displaced by a random distance in the range
+/// [-delta, +delta] along each axis.
+/// \param [in] delta The maximum displacement (along each axis).
+void randomDisplacements(SimFlat* s, real_t delta)
+{
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; ++iBox)
+   {
+      for (int iOff=MAXATOMS*iBox, ii=0; ii<s->boxes->nAtoms[iBox]; ++ii, ++iOff)
+      {
+         uint64_t seed = mkSeed(s->atoms->gid[iOff], 457);
+         s->atoms->r[iOff][0] += (2.0*lcg61(&seed)-1.0) * delta;
+         s->atoms->r[iOff][1] += (2.0*lcg61(&seed)-1.0) * delta;
+         s->atoms->r[iOff][2] += (2.0*lcg61(&seed)-1.0) * delta;
+      }
+   }
+}
+
+/// Computes the center of mass velocity of the system.
+void computeVcm(SimFlat* s, real_t vcm[3])
+{
+   real_t vcmLocal[4] = {0., 0., 0., 0.};
+   real_t vcmSum[4] = {0., 0., 0., 0.};
+
+   // sum the momenta and particle masses 
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; ++iBox)
+   {
+      for (int iOff=MAXATOMS*iBox, ii=0; ii<s->boxes->nAtoms[iBox]; ++ii, ++iOff)
+      {
+         vcmLocal[0] += s->atoms->p[iOff][0];
+         vcmLocal[1] += s->atoms->p[iOff][1];
+         vcmLocal[2] += s->atoms->p[iOff][2];
+
+         int iSpecies = s->atoms->iSpecies[iOff];
+         vcmLocal[3] += s->species[iSpecies].mass;
+      }
+   }
+
+   startTimer(commReduceTimer);
+   addRealParallel(vcmLocal, vcmSum, 4);
+   stopTimer(commReduceTimer);
+
+   real_t totalMass = vcmSum[3];
+   vcm[0] = vcmSum[0]/totalMass;
+   vcm[1] = vcmSum[1]/totalMass;
+   vcm[2] = vcmSum[2]/totalMass;
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.h
@@ -0,0 +1,50 @@
+/// \file
+/// Functions to maintain link cell structures for fast pair finding.
+
+#ifndef __LINK_CELLS_H_
+#define __LINK_CELLS_H_
+
+#include "mytype.h"
+
+/// The maximum number of atoms that can be stored in a link cell.
+#define MAXATOMS 64 
+
+struct DomainSt;
+struct AtomsSt;
+
+/// Link cell data.  For convenience, we keep a copy of the localMin and
+/// localMax coordinates that are also found in the DomainsSt.
+typedef struct LinkCellSt
+{
+   int gridSize[3];     //!< number of boxes in each dimension on processor
+   int nLocalBoxes;     //!< total number of local boxes on processor
+   int nHaloBoxes;      //!< total number of remote halo/ghost boxes on processor
+   int nTotalBoxes;     //!< total number of boxes on processor
+                        //!< nLocalBoxes + nHaloBoxes
+   real3 localMin;      //!< minimum local bounds on processor
+   real3 localMax;      //!< maximum local bounds on processor
+   real3 boxSize;       //!< size of box in each dimension
+   real3 invBoxSize;    //!< inverse size of box in each dimension
+
+   int* nAtoms;         //!< total number of atoms in each box
+} LinkCell;
+
+LinkCell* initLinkCells(const struct DomainSt* domain, real_t cutoff);
+void destroyLinkCells(LinkCell** boxes);
+
+int getNeighborBoxes(LinkCell* boxes, int iBox, int* nbrBoxes);
+void putAtomInBox(LinkCell* boxes, struct AtomsSt* atoms,
+                  const int gid, const int iType,
+                  const real_t x,  const real_t y,  const real_t z,
+                  const real_t px, const real_t py, const real_t pz);
+int getBoxFromTuple(LinkCell* boxes, int x, int y, int z);
+
+void moveAtom(LinkCell* boxes, struct AtomsSt* atoms, int iId, int iBox, int jBox);
+
+/// Update link cell data structures when the atoms have moved.
+void updateLinkCells(LinkCell* boxes, struct AtomsSt* atoms);
+
+int maxOccupancy(LinkCell* boxes);
+
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.c
@@ -0,0 +1,457 @@
+/// \file
+/// Functions to maintain link cell structures for fast pair finding.
+///
+/// In CoMD 1.1, atoms are stored in link cells.  Link cells are widely
+/// used in classical MD to avoid an O(N^2) search for atoms that
+/// interact.  Link cells are formed by subdividing the local spatial
+/// domain with a Cartesian grid where the grid spacing in each
+/// direction is at least as big as he potential's cutoff distance.
+/// Because atoms don't interact beyond the potential cutoff, for an
+/// atom iAtom in any given link cell, we can be certain that all atoms
+/// that interact with iAtom are contained in the same link cell, or one
+/// of the 26 neighboring link cells.
+/// 
+/// CoMD chooses the link cell size (boxSize) on each axis to be the
+/// shortest possible distance, longer than cutoff, such that the local
+/// domain size divided by boxSize is an integer.  I.e., the link cells
+/// are commensurate with with the local domain size.  While this does
+/// not result in the smallest possible link cells, it does allow us to
+/// keep a strict separation between the link cells that are entirely
+/// inside the local domain and those that represent halo regions.
+///
+/// The number of local link cells in each direction is stored in
+/// gridSize.  Local link cells have 3D grid coordinates (ix, iy, iz)
+/// where ix, iy, and iz can range from 0 to gridSize[iAxis]-1,
+/// whiere iAxis is 0 for x, 1 for y and 2 for the z direction.  The
+/// number of local link cells is thus nLocalBoxes =
+/// gridSize[0]*gridSize[1]*gridSize[2].
+///
+/// The local link cells are surrounded by one complete shell of halo
+/// link cells.  The halo cells provide temporary storage for halo or
+/// "ghost" atoms that belong to other tasks, but whose coordinates are
+/// needed locally to complete the force calculation.  Halo link cells
+/// have at least one coordinate with a value of either -1 or
+/// gridSize[iAxis].
+///
+/// Because CoMD stores data in ordinary 1D C arrays, a mapping is
+/// needed from the 3D grid coords to a 1D array index.  For the local
+/// cells we use the conventional mapping ix + iy*nx + iz*nx*ny.  This
+/// keeps all of the local cells in a contiguous region of memory
+/// starting from the beginning of any relevant array and makes it easy
+/// to iterate the local cells in a single loop.  Halo cells are mapped
+/// differently.  After the local cells, the two planes of link cells
+/// that are face neighbors with local cells across the -x or +x axis
+/// are next.  These are followed by face neighbors across the -y and +y
+/// axis (including cells that are y-face neighbors with an x-plane of
+/// halo cells), followed by all remaining cells in the -z and +z planes
+/// of halo cells.  The total number of link cells (on each rank) is
+/// nTotalBoxes.
+///
+/// Data storage arrays that are used in association with link cells
+/// should be allocated to store nTotalBoxes*MAXATOMS items.  Data for
+/// the first atom in linkCell iBox is stored at index iBox*MAXATOMS.
+/// Data for subsequent atoms in the same link cell are stored
+/// sequentially, and the number of atoms in link cell iBox is
+/// nAtoms[iBox].
+///
+/// \see getBoxFromTuple is the 3D->1D mapping for link cell indices.
+/// \see getTuple is the 1D->3D mapping
+///
+/// \param [in] cutoff The cutoff distance of the potential.
+
+#include "linkCells.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+#include "parallel.h"
+#include "memUtils.h"
+#include "decomposition.h"
+#include "performanceTimers.h"
+#include "CoMDTypes.h"
+
+#define   MIN(A,B) ((A) < (B) ? (A) : (B))
+#define   MAX(A,B) ((A) > (B) ? (A) : (B))
+
+static void copyAtom(LinkCell* boxes, Atoms* atoms, int iAtom, int iBox, int jAtom, int jBox);
+static int getBoxFromCoord(LinkCell* boxes, real_t rr[3]);
+static void emptyHaloCells(LinkCell* boxes);
+static void getTuple(LinkCell* boxes, int iBox, int* ixp, int* iyp, int* izp);
+
+LinkCell* initLinkCells(const Domain* domain, real_t cutoff)
+{
+   assert(domain);
+   LinkCell* ll = comdMalloc(sizeof(LinkCell));
+
+   for (int i = 0; i < 3; i++)
+   {
+      ll->localMin[i] = domain->localMin[i];
+      ll->localMax[i] = domain->localMax[i];
+      ll->gridSize[i] = domain->localExtent[i] / cutoff; // local number of boxes
+      ll->boxSize[i] = domain->localExtent[i] / ((real_t) ll->gridSize[i]);
+      ll->invBoxSize[i] = 1.0/ll->boxSize[i];
+   }
+
+   ll->nLocalBoxes = ll->gridSize[0] * ll->gridSize[1] * ll->gridSize[2];
+   
+   ll->nHaloBoxes = 2 * ((ll->gridSize[0] + 2) *
+                         (ll->gridSize[1] + ll->gridSize[2] + 2) +
+                         (ll->gridSize[1] * ll->gridSize[2]));
+
+   ll->nTotalBoxes = ll->nLocalBoxes + ll->nHaloBoxes;
+   
+   ll->nAtoms = comdMalloc(ll->nTotalBoxes*sizeof(int));
+   for (int iBox=0; iBox<ll->nTotalBoxes; ++iBox)
+      ll->nAtoms[iBox] = 0;
+
+   assert ( (ll->gridSize[0] >= 2) && (ll->gridSize[1] >= 2) && (ll->gridSize[2] >= 2) );
+   return ll;
+}
+
+void destroyLinkCells(LinkCell** boxes)
+{
+   if (! boxes) return;
+   if (! *boxes) return;
+
+   comdFree((*boxes)->nAtoms);
+   comdFree(*boxes);
+   *boxes = NULL;
+
+   return;
+}
+
+/// \details
+/// Populates the nbrBoxes array with the 27 boxes that are adjacent to
+/// iBox.  The count is 27 instead of 26 because iBox is included in the
+/// list (as neighbor 13).  Caller is responsible to alloc and free
+/// nbrBoxes.
+/// \return The number of nbr boxes (always 27 in this implementation).
+int getNeighborBoxes(LinkCell* boxes, int iBox, int* nbrBoxes)
+{
+   int ix, iy, iz;
+   getTuple(boxes, iBox, &ix, &iy, &iz);
+   
+   int count = 0;
+   for (int i=ix-1; i<=ix+1; i++)
+      for (int j=iy-1; j<=iy+1; j++)
+         for (int k=iz-1; k<=iz+1; k++)
+            nbrBoxes[count++] = getBoxFromTuple(boxes,i,j,k);
+   
+   return count;
+}
+
+/// \details
+/// Finds the appropriate link cell for an atom based on the spatial
+/// coordinates and stores data in that link cell.
+/// \param [in] gid   The global of the atom.
+/// \param [in] iType The species index of the atom.
+/// \param [in] x     The x-coordinate of the atom.
+/// \param [in] y     The y-coordinate of the atom.
+/// \param [in] z     The z-coordinate of the atom.
+/// \param [in] px    The x-component of the atom's momentum.
+/// \param [in] py    The y-component of the atom's momentum.
+/// \param [in] pz    The z-component of the atom's momentum.
+void putAtomInBox(LinkCell* boxes, Atoms* atoms,
+                  const int gid, const int iType,
+                  const real_t x,  const real_t y,  const real_t z,
+                  const real_t px, const real_t py, const real_t pz)
+{
+   real_t xyz[3] = {x,y,z};
+   
+   // Find correct box.
+   int iBox = getBoxFromCoord(boxes, xyz);
+   int iOff = iBox*MAXATOMS;
+   iOff += boxes->nAtoms[iBox];
+   
+   // assign values to array elements
+   if (iBox < boxes->nLocalBoxes)
+      atoms->nLocal++;
+   boxes->nAtoms[iBox]++;
+   atoms->gid[iOff] = gid;
+   atoms->iSpecies[iOff] = iType;
+   
+   atoms->r[iOff][0] = x;
+   atoms->r[iOff][1] = y;
+   atoms->r[iOff][2] = z;
+   
+   atoms->p[iOff][0] = px;
+   atoms->p[iOff][1] = py;
+   atoms->p[iOff][2] = pz;
+}
+
+/// Calculates the link cell index from the grid coords.  The valid
+/// coordinate range in direction ii is [-1, gridSize[ii]].  Any
+/// coordinate that involves a -1 or gridSize[ii] is a halo link cell.
+/// Because of the order in which the local and halo link cells are
+/// stored the indices of the halo cells are special cases.
+/// \see initLinkCells for an explanation of storage order.
+int getBoxFromTuple(LinkCell* boxes, int ix, int iy, int iz)
+{
+   int iBox = 0;
+   const int* gridSize = boxes->gridSize; // alias
+   
+   // Halo in Z+
+   if (iz == gridSize[2])
+   {
+      iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + 2*gridSize[2]*(gridSize[0]+2) +
+         (gridSize[0]+2)*(gridSize[1]+2) + (gridSize[0]+2)*(iy+1) + (ix+1);
+   }
+   // Halo in Z-
+   else if (iz == -1)
+   {
+      iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + 2*gridSize[2]*(gridSize[0]+2) +
+         (gridSize[0]+2)*(iy+1) + (ix+1);
+   }
+   // Halo in Y+
+   else if (iy == gridSize[1])
+   {
+      iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + gridSize[2]*(gridSize[0]+2) +
+         (gridSize[0]+2)*iz + (ix+1);
+   }
+   // Halo in Y-
+   else if (iy == -1)
+   {
+      iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + iz*(gridSize[0]+2) + (ix+1);
+   }
+   // Halo in X+
+   else if (ix == gridSize[0])
+   {
+      iBox = boxes->nLocalBoxes + gridSize[1]*gridSize[2] + iz*gridSize[1] + iy;
+   }
+   // Halo in X-
+   else if (ix == -1)
+   {
+      iBox = boxes->nLocalBoxes + iz*gridSize[1] + iy;
+   }
+   // local link celll.
+   else
+   {
+      iBox = ix + gridSize[0]*iy + gridSize[0]*gridSize[1]*iz;
+   }
+   assert(iBox >= 0);
+   assert(iBox < boxes->nTotalBoxes);
+
+   return iBox;
+}
+
+/// Move an atom from one link cell to another.
+/// \param iId [in]  The index with box iBox of the atom to be moved.
+/// \param iBox [in] The index of the link cell the particle is moving from.
+/// \param jBox [in] The index of the link cell the particle is moving to.
+void moveAtom(LinkCell* boxes, Atoms* atoms, int iId, int iBox, int jBox)
+{
+   int nj = boxes->nAtoms[jBox];
+   copyAtom(boxes, atoms, iId, iBox, nj, jBox);
+   boxes->nAtoms[jBox]++;
+
+   assert(boxes->nAtoms[jBox] < MAXATOMS);
+
+   boxes->nAtoms[iBox]--;
+   int ni = boxes->nAtoms[iBox];
+   if (ni) copyAtom(boxes, atoms, ni, iBox, iId, iBox);
+
+   if (jBox > boxes->nLocalBoxes)
+      --atoms->nLocal;
+   
+   return;
+}
+
+/// \details
+/// This is the first step in returning data structures to a consistent
+/// state after the atoms move each time step.  First we discard all
+/// atoms in the halo link cells.  These are all atoms that are
+/// currently stored on other ranks and so any information we have about
+/// them is stale.  Next, we move any atoms that have crossed link cell
+/// boundaries into their new link cells.  It is likely that some atoms
+/// will be moved into halo link cells.  Since we have deleted halo
+/// atoms from other tasks, it is clear that any atoms that are in halo
+/// cells at the end of this routine have just transitioned from local
+/// to halo atoms.  Such atom must be sent to other tasks by a halo
+/// exchange to avoid being lost.
+/// \see redistributeAtoms
+void updateLinkCells(LinkCell* boxes, Atoms* atoms)
+{
+   emptyHaloCells(boxes);
+   
+   for (int iBox=0; iBox<boxes->nLocalBoxes; ++iBox)
+   {
+      int iOff = iBox*MAXATOMS;
+      int ii=0;
+      while (ii < boxes->nAtoms[iBox])
+      {
+         int jBox = getBoxFromCoord(boxes, atoms->r[iOff+ii]);
+         if (jBox != iBox)
+            moveAtom(boxes, atoms, ii, iBox, jBox);
+         else
+            ++ii;
+      }
+   }
+}
+
+/// \return The largest number of atoms in any link cell.
+int maxOccupancy(LinkCell* boxes)
+{
+   int localMax = 0;
+   for (int ii=0; ii<boxes->nLocalBoxes; ++ii)
+      localMax = MAX(localMax, boxes->nAtoms[ii]);
+
+   int globalMax;
+
+   startTimer(commReduceTimer);
+   maxIntParallel(&localMax, &globalMax, 1);
+   stopTimer(commReduceTimer);
+
+   return globalMax;
+}
+
+/// Copy atom iAtom in link cell iBox to atom jAtom in link cell jBox.
+/// Any data at jAtom, jBox is overwritten.  This routine can be used to
+/// re-order atoms within a link cell.
+void copyAtom(LinkCell* boxes, Atoms* atoms, int iAtom, int iBox, int jAtom, int jBox)
+{
+   const int iOff = MAXATOMS*iBox+iAtom;
+   const int jOff = MAXATOMS*jBox+jAtom;
+   atoms->gid[jOff] = atoms->gid[iOff];
+   atoms->iSpecies[jOff] = atoms->iSpecies[iOff];
+   memcpy(atoms->r[jOff], atoms->r[iOff], sizeof(real3));
+   memcpy(atoms->p[jOff], atoms->p[iOff], sizeof(real3));
+   memcpy(atoms->f[jOff], atoms->f[iOff], sizeof(real3));
+   memcpy(atoms->U+jOff,  atoms->U+iOff,  sizeof(real_t));
+}
+
+/// Get the index of the link cell that contains the specified
+/// coordinate.  This can be either a halo or a local link cell.
+///
+/// Because the rank ownership of an atom is strictly determined by the
+/// atom's position, we need to take care that all ranks will agree which
+/// rank owns an atom.  The conditionals at the end of this function are
+/// special care to ensure that all ranks make compatible link cell
+/// assignments for atoms that are near a link cell boundaries.  If no
+/// ranks claim an atom in a local cell it will be lost.  If multiple
+/// ranks claim an atom it will be duplicated.
+int getBoxFromCoord(LinkCell* boxes, real_t rr[3])
+{
+   const real_t* localMin = boxes->localMin; // alias
+   const real_t* localMax = boxes->localMax; // alias
+   const int*    gridSize = boxes->gridSize; // alias
+   int ix = (int)(floor((rr[0] - localMin[0])*boxes->invBoxSize[0]));
+   int iy = (int)(floor((rr[1] - localMin[1])*boxes->invBoxSize[1]));
+   int iz = (int)(floor((rr[2] - localMin[2])*boxes->invBoxSize[2]));
+
+
+   // For each axis, if we are inside the local domain, make sure we get
+   // a local link cell.  Otherwise, make sure we get a halo link cell.
+   if (rr[0] < localMax[0]) 
+   {
+      if (ix == gridSize[0]) ix = gridSize[0] - 1;
+   }
+   else
+      ix = gridSize[0]; // assign to halo cell
+   if (rr[1] < localMax[1])
+   {
+      if (iy == gridSize[1]) iy = gridSize[1] - 1;
+   }
+   else
+      iy = gridSize[1];
+   if (rr[2] < localMax[2])
+   {
+      if (iz == gridSize[2]) iz = gridSize[2] - 1;
+   }
+   else
+      iz = gridSize[2];
+   
+   return getBoxFromTuple(boxes, ix, iy, iz);
+}
+
+/// Set the number of atoms to zero in all halo link cells.
+void emptyHaloCells(LinkCell* boxes)
+{
+   for (int ii=boxes->nLocalBoxes; ii<boxes->nTotalBoxes; ++ii)
+      boxes->nAtoms[ii] = 0;
+}
+
+/// Get the grid coordinates of the link cell with index iBox.  Local
+/// cells are easy as they use a standard 1D->3D mapping.  Halo cell are
+/// special cases.
+/// \see initLinkCells for information on link cell order.
+/// \param [in]  iBox Index to link cell for which tuple is needed.
+/// \param [out] ixp  x grid coord of link cell.
+/// \param [out] iyp  y grid coord of link cell.
+/// \param [out] izp  z grid coord of link cell.
+void getTuple(LinkCell* boxes, int iBox, int* ixp, int* iyp, int* izp)
+{
+   int ix, iy, iz;
+   const int* gridSize = boxes->gridSize; // alias
+   
+   // If a local box
+   if( iBox < boxes->nLocalBoxes)
+   {
+      ix = iBox % gridSize[0];
+      iBox /= gridSize[0];
+      iy = iBox % gridSize[1];
+      iz = iBox / gridSize[1];
+   }
+   // It's a halo box
+   else 
+   {
+      int ink;
+      ink = iBox - boxes->nLocalBoxes;
+      if (ink < 2*gridSize[1]*gridSize[2])
+      {
+         if (ink < gridSize[1]*gridSize[2]) 
+         {
+            ix = 0;
+         }
+         else 
+         {
+            ink -= gridSize[1]*gridSize[2];
+            ix = gridSize[0] + 1;
+         }
+         iy = 1 + ink % gridSize[1];
+         iz = 1 + ink / gridSize[1];
+      }
+      else if (ink < (2 * gridSize[2] * (gridSize[1] + gridSize[0] + 2))) 
+      {
+         ink -= 2 * gridSize[2] * gridSize[1];
+         if (ink < ((gridSize[0] + 2) *gridSize[2])) 
+         {
+            iy = 0;
+         }
+         else 
+         {
+            ink -= (gridSize[0] + 2) * gridSize[2];
+            iy = gridSize[1] + 1;
+         }
+         ix = ink % (gridSize[0] + 2);
+         iz = 1 + ink / (gridSize[0] + 2);
+      }
+      else 
+      {
+         ink -= 2 * gridSize[2] * (gridSize[1] + gridSize[0] + 2);
+         if (ink < ((gridSize[0] + 2) * (gridSize[1] + 2))) 
+         {
+            iz = 0;
+         }
+         else 
+         {
+            ink -= (gridSize[0] + 2) * (gridSize[1] + 2);
+            iz = gridSize[2] + 1;
+         }
+         ix = ink % (gridSize[0] + 2);
+         iy = ink / (gridSize[0] + 2);
+      }
+      
+      // Calculated as off by 1
+      ix--;
+      iy--;
+      iz--;
+   }
+   
+   *ixp = ix;
+   *iyp = iy;
+   *izp = iz;
+}
+
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.h
@@ -0,0 +1,11 @@
+/// \file
+/// Computes forces for the 12-6 Lennard Jones (LJ) potential.
+
+#ifndef _LJTYPES_H_
+#define _LJTYPES_H_
+
+struct BasePotentialSt;
+struct BasePotentialSt* initLjPot(void);
+
+#endif
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.c
@@ -0,0 +1,235 @@
+/// \file
+/// Computes forces for the 12-6 Lennard Jones (LJ) potential.
+///
+/// The Lennard-Jones model is not a good representation for the
+/// bonding in copper, its use has been limited to constant volume
+/// simulations where the embedding energy contribution to the cohesive
+/// energy is not included in the two-body potential
+///
+/// The parameters here are taken from Wolf and Phillpot and fit to the
+/// room temperature lattice constant and the bulk melt temperature
+/// Ref: D. Wolf and S.Yip eds. Materials Interfaces (Chapman & Hall
+///      1992) Page 230.
+///
+/// Notes on LJ:
+///
+/// http://en.wikipedia.org/wiki/Lennard_Jones_potential
+///
+/// The total inter-atomic potential energy in the LJ model is:
+///
+/// \f[
+///   E_{tot} = \sum_{ij} U_{LJ}(r_{ij})
+/// \f]
+/// \f[
+///   U_{LJ}(r_{ij}) = 4 \epsilon
+///           \left\{ \left(\frac{\sigma}{r_{ij}}\right)^{12}
+///           - \left(\frac{\sigma}{r_{ij}}\right)^6 \right\}
+/// \f]
+///
+/// where \f$\epsilon\f$ and \f$\sigma\f$ are the material parameters in the potential.
+///    - \f$\epsilon\f$ = well depth
+///    - \f$\sigma\f$   = hard sphere diameter
+///
+///  To limit the interation range, the LJ potential is typically
+///  truncated to zero at some cutoff distance. A common choice for the
+///  cutoff distance is 2.5 * \f$\sigma\f$.
+///  This implementation can optionally shift the potential slightly
+///  upward so the value of the potential is zero at the cuotff
+///  distance.  This shift has no effect on the particle dynamics.
+///
+///
+/// The force on atom i is given by
+///
+/// \f[
+///   F_i = -\nabla_i \sum_{jk} U_{LJ}(r_{jk})
+/// \f]
+///
+/// where the subsrcipt i on the gradient operator indicates that the
+/// derivatives are taken with respect to the coordinates of atom i.
+/// Liberal use of the chain rule leads to the expression
+///
+/// \f{eqnarray*}{
+///   F_i &=& - \sum_j U'_{LJ}(r_{ij})\hat{r}_{ij}\\
+///       &=& \sum_j 24 \frac{\epsilon}{r_{ij}} \left\{ 2 \left(\frac{\sigma}{r_{ij}}\right)^{12}
+///               - \left(\frac{\sigma}{r_{ij}}\right)^6 \right\} \hat{r}_{ij}
+/// \f}
+///
+/// where \f$\hat{r}_{ij}\f$ is a unit vector in the direction from atom
+/// i to atom j.
+/// 
+///
+
+#include "ljForce.h"
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+#include "constants.h"
+#include "mytype.h"
+#include "parallel.h"
+#include "linkCells.h"
+#include "memUtils.h"
+#include "CoMDTypes.h"
+
+#define POT_SHIFT 1.0
+
+/// Derived struct for a Lennard Jones potential.
+/// Polymorphic with BasePotential.
+/// \see BasePotential
+typedef struct LjPotentialSt
+{
+   real_t cutoff;          //!< potential cutoff distance in Angstroms
+   real_t mass;            //!< mass of atoms in intenal units
+   real_t lat;             //!< lattice spacing (angs) of unit cell
+   char latticeType[8];    //!< lattice type, e.g. FCC, BCC, etc.
+   char  name[3];	   //!< element name
+   int	 atomicNo;	   //!< atomic number  
+   int  (*force)(SimFlat* s); //!< function pointer to force routine
+   void (*print)(FILE* file, BasePotential* pot);
+   void (*destroy)(BasePotential** pot); //!< destruction of the potential
+   real_t sigma;
+   real_t epsilon;
+} LjPotential;
+
+static int ljForce(SimFlat* s);
+static void ljPrint(FILE* file, BasePotential* pot);
+
+void ljDestroy(BasePotential** inppot)
+{
+   if ( ! inppot ) return;
+   LjPotential* pot = (LjPotential*)(*inppot);
+   if ( ! pot ) return;
+   comdFree(pot);
+   *inppot = NULL;
+
+   return;
+}
+
+/// Initialize an Lennard Jones potential for Copper.
+BasePotential* initLjPot(void)
+{
+   LjPotential *pot = (LjPotential*)comdMalloc(sizeof(LjPotential));
+   pot->force = ljForce;
+   pot->print = ljPrint;
+   pot->destroy = ljDestroy;
+   pot->sigma = 2.315;	                  // Angstrom
+   pot->epsilon = 0.167;                  // eV
+   pot->mass = 63.55 * amuToInternalMass; // Atomic Mass Units (amu)
+
+   pot->lat = 3.615;                      // Equilibrium lattice const in Angs
+   strcpy(pot->latticeType, "FCC");       // lattice type, i.e. FCC, BCC, etc.
+   pot->cutoff = 2.5*pot->sigma;          // Potential cutoff in Angs
+
+   strcpy(pot->name, "Cu");
+   pot->atomicNo = 29;
+
+   return (BasePotential*) pot;
+}
+
+void ljPrint(FILE* file, BasePotential* pot)
+{
+   LjPotential* ljPot = (LjPotential*) pot;
+   fprintf(file, "  Potential type   : Lennard-Jones\n");
+   fprintf(file, "  Species name     : %s\n", ljPot->name);
+   fprintf(file, "  Atomic number    : %d\n", ljPot->atomicNo);
+   fprintf(file, "  Mass             : "FMT1" amu\n", ljPot->mass / amuToInternalMass); // print in amu
+   fprintf(file, "  Lattice Type     : %s\n", ljPot->latticeType);
+   fprintf(file, "  Lattice spacing  : "FMT1" Angstroms\n", ljPot->lat);
+   fprintf(file, "  Cutoff           : "FMT1" Angstroms\n", ljPot->cutoff);
+   fprintf(file, "  Epsilon          : "FMT1" eV\n", ljPot->epsilon);
+   fprintf(file, "  Sigma            : "FMT1" Angstroms\n", ljPot->sigma);
+}
+
+int ljForce(SimFlat* s)
+{
+   LjPotential* pot = (LjPotential *) s->pot;
+   real_t sigma = pot->sigma;
+   real_t epsilon = pot->epsilon;
+   real_t rCut = pot->cutoff;
+   real_t rCut2 = rCut*rCut;
+
+   // zero forces and energy
+   real_t ePot = 0.0;
+   s->ePotential = 0.0;
+   int fSize = s->boxes->nTotalBoxes*MAXATOMS;
+   for (int ii=0; ii<fSize; ++ii)
+   {
+      zeroReal3(s->atoms->f[ii]);
+      s->atoms->U[ii] = 0.;
+   }
+   
+   real_t s6 = sigma*sigma*sigma*sigma*sigma*sigma;
+
+   real_t rCut6 = s6 / (rCut2*rCut2*rCut2);
+   real_t eShift = POT_SHIFT * rCut6 * (rCut6 - 1.0);
+
+   int nbrBoxes[27];
+   // loop over local boxes
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++)
+   {
+      int nIBox = s->boxes->nAtoms[iBox];
+      if ( nIBox == 0 ) continue;
+      int nNbrBoxes = getNeighborBoxes(s->boxes, iBox, nbrBoxes);
+      // loop over neighbors of iBox
+      for (int jTmp=0; jTmp<nNbrBoxes; jTmp++)
+      {
+         int jBox = nbrBoxes[jTmp];
+         
+         assert(jBox>=0);
+         
+         int nJBox = s->boxes->nAtoms[jBox];
+         if ( nJBox == 0 ) continue;
+         
+         // loop over atoms in iBox
+         for (int iOff=iBox*MAXATOMS,ii=0; ii<nIBox; ii++,iOff++)
+         {
+            int iId = s->atoms->gid[iOff];
+            // loop over atoms in jBox
+            for (int jOff=MAXATOMS*jBox,ij=0; ij<nJBox; ij++,jOff++)
+            {
+               real_t dr[3];
+               int jId = s->atoms->gid[jOff];  
+               if (jBox < s->boxes->nLocalBoxes && jId <= iId )
+                  continue; // don't double count local-local pairs.
+               real_t r2 = 0.0;
+               for (int m=0; m<3; m++)
+               {
+                  dr[m] = s->atoms->r[iOff][m]-s->atoms->r[jOff][m];
+                  r2+=dr[m]*dr[m];
+               }
+
+               if ( r2 > rCut2) continue;
+
+               // Important note:
+               // from this point on r actually refers to 1.0/r
+               r2 = 1.0/r2;
+               real_t r6 = s6 * (r2*r2*r2);
+               real_t eLocal = r6 * (r6 - 1.0) - eShift;
+               s->atoms->U[iOff] += 0.5*eLocal;
+               s->atoms->U[jOff] += 0.5*eLocal;
+
+               // calculate energy contribution based on whether
+               // the neighbor box is local or remote
+               if (jBox < s->boxes->nLocalBoxes)
+                  ePot += eLocal;
+               else
+                  ePot += 0.5 * eLocal;
+
+               // different formulation to avoid sqrt computation
+               real_t fr = - 4.0*epsilon*r6*r2*(12.0*r6 - 6.0);
+               for (int m=0; m<3; m++)
+               {
+                  s->atoms->f[iOff][m] -= dr[m]*fr;
+                  s->atoms->f[jOff][m] += dr[m]*fr;
+               }
+            } // loop over atoms in jBox
+         } // loop over atoms in iBox
+      } // loop over neighbor boxes
+   } // loop over local boxes in system
+
+   ePot = ePot*4.0*epsilon;
+   s->ePotential = ePot;
+
+   return 0;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/memUtils.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/memUtils.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/memUtils.h
@@ -0,0 +1,30 @@
+/// \file
+/// Wrappers for memory allocation.
+
+#ifndef _MEMUTILS_H_
+#define _MEMUTILS_H_
+
+#include <stdlib.h>
+
+#define freeMe(s,element) {if(s->element) comdFree(s->element);  s->element = NULL;}
+
+static void* comdMalloc(size_t iSize)
+{
+   return malloc(iSize);
+}
+
+static void* comdCalloc(size_t num, size_t iSize)
+{
+   return calloc(num, iSize);
+}
+
+static void* comdRealloc(void* ptr, size_t iSize)
+{
+   return realloc(ptr, iSize);
+}
+
+static void comdFree(void *ptr)
+{
+   free(ptr);
+}
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.h
@@ -0,0 +1,37 @@
+/// \file
+/// Handle command line arguments.
+
+#ifndef MYCOMMAND_H
+#define MYCOMMAND_H
+
+#include <stdio.h>
+
+/// A structure to hold the value of every run-time parameter that can
+/// be read from the command line.
+typedef struct CommandSt
+{
+   char potDir[1024];  //!< the directory where EAM potentials reside
+   char potName[1024]; //!< the name of the potential
+   char potType[1024]; //!< the type of the potential (funcfl or setfl)
+   int doeam;          //!< a flag to determine whether we're running EAM potentials
+   int nx;             //!< number of unit cells in x
+   int ny;             //!< number of unit cells in y
+   int nz;             //!< number of unit cells in z
+   int xproc;          //!< number of processors in x direction
+   int yproc;          //!< number of processors in y direction
+   int zproc;          //!< number of processors in z direction
+   int nSteps;         //!< number of time steps to run
+   int printRate;      //!< number of steps between output
+   double dt;          //!< time step (in femtoseconds)
+   double lat;         //!< lattice constant (in Angstroms)
+   double temperature; //!< simulation initial temperature (in Kelvin)
+   double initialDelta; //!< magnitude of initial displacement from lattice (in Angstroms)
+} Command;
+
+/// Process command line arguments into an easy to handle structure.
+Command parseCommandLine(int argc, char** argv);
+
+/// Print run parameters in yaml format on the supplied output stream.
+void printCmdYaml(FILE* file, Command* cmd);
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.c
@@ -0,0 +1,290 @@
+/// \file
+/// Handle command line arguments.
+
+#include "mycommand.h"
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "cmdLineParser.h"
+#include "parallel.h"
+#include "mytype.h"
+
+/// \page pg_running_comd Running CoMD
+///
+/// \section sec_command_line_options Command Line Options
+///
+/// CoMD accepts a number of command line options to set the parameters
+/// of the simulation.  Every option has both a long form and a short
+/// form.  The long and short form of the arguments are entirely
+/// interchangeable and may be mixed. All the arguments are independent
+/// with the exception of the \--potDir, \--potName, and \--potType,
+/// (short forms -d, -n, and -t) arguments which are only relevant when
+/// used in conjunction with \--doeam, (-e).
+///
+/// Supported options are:
+///
+/// | Long  Form    | Short Form  | Default Value | Description
+/// | :------------ | :---------: | :-----------: | :----------
+/// | \--help       | -h          | N/A           | print this message
+/// | \--potDir     | -d          | pots          | potential directory
+/// | \--potName    | -p          | Cu_u6.eam     | potential name
+/// | \--potType    | -t          | funcfl        | potential type (funcfl or setfl)
+/// | \--doeam      | -e          | N/A           | compute eam potentials (default is LJ)
+/// | \--nx         | -x          | 20            | number of unit cells in x
+/// | \--ny         | -y          | 20            | number of unit cells in y
+/// | \--nz         | -z          | 20            | number of unit cells in z
+/// | \--xproc      | -i          | 1             | number of ranks in x direction
+/// | \--yproc      | -j          | 1             | number of ranks in y direction
+/// | \--zproc      | -k          | 1             | number of ranks in z direction
+/// | \--nSteps     | -N          | 100           | total number of time steps
+/// | \--printRate  | -n          | 10            | number of steps between output
+/// | \--dt         | -D          | 1             | time step (in fs)
+/// | \--lat        | -l          | -1            | lattice parameter (Angstroms)
+/// | \--temp       | -T          | 600           | initial temperature (K)
+/// | \--delta      | -r          | 0             | initial delta (Angstroms)
+///
+/// Notes: 
+/// 
+/// The negative value for the lattice parameter (such as the default
+/// value, -1) is interpreted as a flag to indicate that the lattice
+/// parameter should be set from the potential. All supplied potentials
+/// are for copper and have a lattice constant of 3.615
+/// Angstroms. Setting the lattice parameter to any positive value will
+/// override the values provided in the potential files.
+///
+/// The default potential name for the funcfl potential type is
+/// Cu_u6.eam (Adams potential).  For the setfl type the default
+/// potential name is Cu01.eam.alloy (Mishin potential).  Although these
+/// will yield similar dynamics, the table have a very different number
+/// of entries (500 vs. 10,000 points, respectively) This may give very
+/// different performance, depending on the hardware.
+///
+/// The default temperature is 600K.  However, when using a perfect
+/// lattice the system will rapidly cool to 300K due to equipartition of
+/// energy.
+///
+/// 
+/// \subsection ssec_example_command_lines Examples
+///
+/// All of the examples below assume:
+/// - The current working directory contains a copy of the pots dir (or
+///   a link to it).
+/// - The CoMD bin directory is located in ../bin
+///
+/// Running in the examples directory will satisfy these requirements.
+///
+/// ------------------------------
+///
+/// The canonical base simulation, is 
+///
+///     $ mpirun -np 1 ../bin/CoMD-mpi 
+/// 
+/// Or, if the code was built without MPI:
+///
+///     $ ../bin/CoMD-serial
+///
+/// ------------------------------
+///
+/// \subsubsection cmd_examples_potential Changing Potentials
+///
+/// To run with the default (Adams) EAM potential, specify -e:
+///
+///     $ ../bin/CoMD-mpi -e 
+///
+/// ------------------------------
+///
+/// To run using the Mishin EAM potential contained in the setfl file
+/// Cu01.eam.alloy. This potential uses much larger tables (10,000
+/// entries vs. 500 for the Adams potential).
+///
+///     $ ../bin/CoMD-mpi -e -t setfl 
+///
+/// ------------------------------
+///
+/// Selecting the name of a setfl file without setting the appropriate
+/// potential type
+///
+///     $ ../bin/CoMD-mpi -e -p Cu01.eam.alloy 
+/// 
+/// will result in an error message:
+///
+/// Only FCC Lattice type supported, not . Fatal Error.
+/// 
+/// Instead use:
+///
+///     $ ../bin/CoMD-mpi -e -t setfl -p Cu01.eam.alloy 
+///
+/// ------------------------------
+///
+/// \subsubsection cmd_example_struct Initial Structure Modifications
+///
+/// To change the lattice constant and run with an expanded or
+/// compressed lattice:
+///
+///     $ ../bin/CoMD-mpi -l 3.5 
+///
+/// This can be useful to test that the potential is being correctly
+/// evaluated as a function of interatomic spacing (the cold
+/// curve). However, due to the high degree of symmetry of a perfect
+/// lattice, this type of test is unlikely to detect errors in the force
+/// computation.
+///
+/// ------------------------------
+///
+/// Initialize with zero temperature (zero instantaneous particle
+/// velocity) but with a random displacements of the atoms (in this
+/// case the maximum displacement is 0.1 Angstrom along each axis).  
+/// 
+///      $ ../bin/CoMD-mpi --delta 0.1 -T 0
+///
+/// Typical values of delta are in the range of 0.1 to 0.5 Angstroms.
+/// Larger values of delta correspond to higher initial potential energy
+/// which in turn produce higer temperatures as the structure
+/// equilibrates.
+///
+/// ------------------------------
+///
+///
+/// \subsubsection cmd_examples_scaling Scaling Examples
+///
+/// Simple shell scripts that demonstrate weak and strong scaling
+/// studies are provided in the examples directory.
+///
+/// ------------------------------
+///
+/// Run the default global simulation size (32,000 atoms) distributed
+/// over 8 cubic subdomains, an example of strong scaling.  If the
+/// number of processors does not equal (i*j*k) the run will abort.
+/// Notice that spaces are optional between short form options and their
+/// arguments.
+/// 
+///     $ mpirun -np 8 ../bin/CoMD-mpi -i2 -j2 -k2
+/// 
+/// ------------------------------
+///
+/// Run a weak scaling example: the simulation is doubled in each
+/// dimension from the default 20 x 20 x 20 and the number of subdomains
+/// in each direction is also doubled.
+///
+///     $ mpirun -np 8 ../bin/CoMD-mpi -i2 -j2 -k2 -x 40 -y 40 -z 40
+///
+/// ------------------------------
+///
+/// The same weak scaling run, but for 10,000 timesteps, with output
+/// only every 100 steps.
+///
+///     $ mpirun -np 8 ../bin/CoMD-mpi -i2 -j2 -k2 -x 40 -y 40 -z 40 -N 10000 -n 100
+///
+
+/// \details Initialize a Command structure with default values, then
+/// parse any command line arguments that were supplied to overwrite
+/// defaults.
+///
+/// \param [in] argc the number of command line arguments
+/// \param [in] argv the command line arguments array
+Command parseCommandLine(int argc, char** argv)
+{
+   Command cmd;
+
+   memset(cmd.potDir, 0, 1024);
+   memset(cmd.potName, 0, 1024);
+   memset(cmd.potType, 0, 1024);
+   strcpy(cmd.potDir,  "pots");
+   strcpy(cmd.potName, "\0"); // default depends on potType
+   strcpy(cmd.potType, "funcfl");
+   cmd.doeam = 0;
+   cmd.nx = 20;
+   cmd.ny = 20;
+   cmd.nz = 20;
+   cmd.xproc = 1;
+   cmd.yproc = 1;
+   cmd.zproc = 1;
+   cmd.nSteps = 100;
+   cmd.printRate = 10;
+   cmd.dt = 1.0;
+   cmd.lat = -1.0;
+   cmd.temperature = 600.0;
+   cmd.initialDelta = 0.0;
+
+   int help=0;
+   // add arguments for processing.  Please update the html documentation too!
+   addArg("help",       'h', 0, 'i',  &(help),             0,             "print this message");
+   addArg("potDir",     'd', 1, 's',  cmd.potDir,    sizeof(cmd.potDir),  "potential directory");
+   addArg("potName",    'p', 1, 's',  cmd.potName,   sizeof(cmd.potName), "potential name");
+   addArg("potType",    't', 1, 's',  cmd.potType,   sizeof(cmd.potType), "potential type (funcfl or setfl)");
+   addArg("doeam",      'e', 0, 'i',  &(cmd.doeam),        0,             "compute eam potentials");
+   addArg("nx",         'x', 1, 'i',  &(cmd.nx),           0,             "number of unit cells in x");
+   addArg("ny",         'y', 1, 'i',  &(cmd.ny),           0,             "number of unit cells in y");
+   addArg("nz",         'z', 1, 'i',  &(cmd.nz),           0,             "number of unit cells in z");
+   addArg("xproc",      'i', 1, 'i',  &(cmd.xproc),        0,             "processors in x direction");
+   addArg("yproc",      'j', 1, 'i',  &(cmd.yproc),        0,             "processors in y direction");
+   addArg("zproc",      'k', 1, 'i',  &(cmd.zproc),        0,             "processors in z direction");
+   addArg("nSteps",     'N', 1, 'i',  &(cmd.nSteps),       0,             "number of time steps");
+   addArg("printRate",  'n', 1, 'i',  &(cmd.printRate),    0,             "number of steps between output");
+   addArg("dt",         'D', 1, 'd',  &(cmd.dt),           0,             "time step (in fs)");
+   addArg("lat",        'l', 1, 'd',  &(cmd.lat),          0,             "lattice parameter (Angstroms)");
+   addArg("temp",       'T', 1, 'd',  &(cmd.temperature),  0,             "initial temperature (K)");
+   addArg("delta",      'r', 1, 'd',  &(cmd.initialDelta), 0,             "initial delta (Angstroms)");
+
+   processArgs(argc,argv);
+
+   // If user didn't set potName, set type dependent default.
+   if (strlen(cmd.potName) == 0) 
+   {
+      if (strcmp(cmd.potType, "setfl" ) == 0)
+         strcpy(cmd.potName, "Cu01.eam.alloy");
+      if (strcmp(cmd.potType, "funcfl") == 0)
+         strcpy(cmd.potName, "Cu_u6.eam");
+   }
+      
+   if (help)
+   {
+      printArgs();
+      freeArgs();
+      exit(2);
+   }
+   freeArgs();
+
+   return cmd;
+}
+
+void printCmdYaml(FILE* file, Command* cmd)
+{
+   if (! printRank())
+      return;
+   fprintf(file,
+           "Command Line Parameters:\n"
+           "  doeam: %d\n"
+           "  potDir: %s\n"
+           "  potName: %s\n"
+           "  potType: %s\n"
+           "  nx: %d\n"
+           "  ny: %d\n"
+           "  nz: %d\n"
+           "  xproc: %d\n"
+           "  yproc: %d\n"
+           "  zproc: %d\n"
+           "  Lattice constant: %g Angstroms\n"
+           "  nSteps: %d\n"
+           "  printRate: %d\n"
+           "  Time step: %g fs\n"
+           "  Initial Temperature: %g K\n"
+           "  Initial Delta: %g Angstroms\n"
+           "\n",
+           cmd->doeam,
+           cmd->potDir,
+           cmd->potName,
+           cmd->potType,
+           cmd->nx, cmd->ny, cmd->nz,
+           cmd->xproc, cmd->yproc, cmd->zproc,
+           cmd->lat,
+           cmd->nSteps,
+           cmd->printRate,
+           cmd->dt,
+           cmd->temperature,
+           cmd->initialDelta
+   );
+   fflush(file);
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mytype.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mytype.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mytype.h
@@ -0,0 +1,29 @@
+/// \file
+/// Frequently needed typedefs.
+
+#ifndef __MYTYPE_H_
+#define __MYTYPE_H_
+
+/// \def SINGLE determines whether single or double precision is built
+#ifdef SINGLE
+typedef float real_t;  //!< define native type for CoMD as single precision
+  #define FMT1 "%g"    //!< /def format argument for floats 
+  #define EMT1 "%e"    //!< /def format argument for eng floats
+#else
+typedef double real_t; //!< define native type for CoMD as double precision
+  #define FMT1 "%lg"   //!< \def format argument for doubles 
+  #define EMT1 "%le"   //!< \def format argument for eng doubles 
+#endif
+
+typedef real_t real3[3]; //!< a convenience vector with three real_t 
+
+static void zeroReal3(real3 a)
+{
+   a[0] = 0.0;
+   a[1] = 0.0;
+   a[2] = 0.0;
+}
+
+#define screenOut stdout
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.h
@@ -0,0 +1,66 @@
+/// \file
+/// Wrappers for MPI functions.
+
+#ifndef _PARALLEL_H_
+#define _PARALLEL_H_
+
+#include "mytype.h"
+
+/// Structure for use with MPI_MINLOC and MPI_MAXLOC operations.
+typedef struct RankReduceDataSt
+{
+   double val;
+   int rank;
+} RankReduceData;
+
+/// Return total number of processors.
+int getNRanks(void);
+
+/// Return local rank.
+int getMyRank(void);
+
+/// Return non-zero if printing occurs from this rank.
+int printRank(void);
+
+/// Print a timestamp and message when all tasks arrive.
+void timestampBarrier(const char* msg);
+
+/// Wrapper for MPI_Init.
+void initParallel(int *argc, char ***argv);
+
+/// Wrapper for MPI_Finalize.
+void destroyParallel(void);
+
+/// Wrapper for MPI_Barrier(MPI_COMM_WORLD).
+void barrierParallel(void);
+
+/// Wrapper for MPI_Sendrecv.
+int sendReceiveParallel(void* sendBuf, int sendLen, int dest,
+                        void* recvBuf, int recvLen, int source);
+
+/// Wrapper for MPI_Allreduce integer sum.
+void addIntParallel(int* sendBuf, int* recvBuf, int count);
+
+/// Wrapper for MPI_Allreduce real sum.
+void addRealParallel(real_t* sendBuf, real_t* recvBuf, int count);
+
+/// Wrapper for MPI_Allreduce double sum.
+void addDoubleParallel(double* sendBuf, double* recvBuf, int count);
+
+/// Wrapper for MPI_Allreduce integer max.
+void maxIntParallel(int* sendBuf, int* recvBuf, int count);
+
+/// Wrapper for MPI_Allreduce double min with rank.
+void minRankDoubleParallel(RankReduceData* sendBuf, RankReduceData* recvBuf, int count);
+
+/// Wrapper for MPI_Allreduce double max with rank.
+void maxRankDoubleParallel(RankReduceData* sendBuf, RankReduceData* recvBuf, int count);
+
+/// Wrapper for MPI_Bcast
+void bcastParallel(void* buf, int len, int root);
+
+///  Return non-zero if code was built with MPI active.
+int builtWithMpi(void);
+
+#endif
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.c
@@ -0,0 +1,201 @@
+/// \file
+/// Wrappers for MPI functions.  This should be the only compilation 
+/// unit in the code that directly calls MPI functions.  To build a pure
+/// serial version of the code with no MPI, do not define DO_MPI.  If
+/// DO_MPI is not defined then all MPI functionality is replaced with
+/// equivalent single task behavior.
+
+#include "parallel.h"
+
+#ifdef DO_MPI
+#include <mpi.h>
+#endif
+
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+#include <assert.h>
+
+static int myRank = 0;
+static int nRanks = 1;
+
+#ifdef DO_MPI
+#ifdef SINGLE
+#define REAL_MPI_TYPE MPI_FLOAT
+#else
+#define REAL_MPI_TYPE MPI_DOUBLE
+#endif
+
+#endif
+
+int getNRanks()
+{
+   return nRanks;
+}
+
+int getMyRank()   
+{
+   return myRank;
+}
+
+/// \details
+/// For now this is just a check for rank 0 but in principle it could be
+/// more complex.  It is also possible to suppress practically all
+/// output by causing this function to return 0 for all ranks.
+int printRank()
+{
+   if (myRank == 0) return 1;
+   return 0;
+}
+
+void timestampBarrier(const char* msg)
+{
+   barrierParallel();
+   if (! printRank())
+      return;
+   time_t t= time(NULL);
+   char* timeString = ctime(&t);
+   timeString[24] = '\0'; // clobber newline
+#ifndef PRINT_TIMING
+   timeString[0] = '\0';
+#endif
+
+   fprintf(screenOut, "%s: %s\n", timeString, msg);
+   fflush(screenOut);
+}
+
+void initParallel(int* argc, char*** argv)
+{
+#ifdef DO_MPI
+   MPI_Init(argc, argv);
+   MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
+   MPI_Comm_size(MPI_COMM_WORLD, &nRanks);
+#endif
+}
+
+void destroyParallel()
+{
+#ifdef DO_MPI
+   MPI_Finalize();
+#endif
+}
+
+void barrierParallel()
+{
+#ifdef DO_MPI
+   MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+/// \param [in]  sendBuf Data to send.
+/// \param [in]  sendLen Number of bytes to send.
+/// \param [in]  dest    Rank in MPI_COMM_WORLD where data will be sent.
+/// \param [out] recvBuf Received data.
+/// \param [in]  recvLen Maximum number of bytes to receive.
+/// \param [in]  source  Rank in MPI_COMM_WORLD from which to receive.
+/// \return Number of bytes received.
+int sendReceiveParallel(void* sendBuf, int sendLen, int dest,
+                        void* recvBuf, int recvLen, int source)
+{
+#ifdef DO_MPI
+   int bytesReceived;
+   MPI_Status status;
+   MPI_Sendrecv(sendBuf, sendLen, MPI_BYTE, dest,   0,
+                recvBuf, recvLen, MPI_BYTE, source, 0,
+                MPI_COMM_WORLD, &status);
+   MPI_Get_count(&status, MPI_BYTE, &bytesReceived);
+
+   return bytesReceived;
+#else
+   assert(source == dest);
+   memcpy(recvBuf, sendBuf, sendLen);
+
+   return sendLen;
+#endif
+}
+
+void addIntParallel(int* sendBuf, int* recvBuf, int count)
+{
+#ifdef DO_MPI
+   MPI_Allreduce(sendBuf, recvBuf, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+#else
+   for (int ii=0; ii<count; ++ii)
+      recvBuf[ii] = sendBuf[ii];
+#endif
+}
+
+void addRealParallel(real_t* sendBuf, real_t* recvBuf, int count)
+{
+#ifdef DO_MPI
+   MPI_Allreduce(sendBuf, recvBuf, count, REAL_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
+#else
+   for (int ii=0; ii<count; ++ii)
+      recvBuf[ii] = sendBuf[ii];
+#endif
+}
+
+void addDoubleParallel(double* sendBuf, double* recvBuf, int count)
+{
+#ifdef DO_MPI
+   MPI_Allreduce(sendBuf, recvBuf, count, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+   for (int ii=0; ii<count; ++ii)
+      recvBuf[ii] = sendBuf[ii];
+#endif
+}
+
+void maxIntParallel(int* sendBuf, int* recvBuf, int count)
+{
+#ifdef DO_MPI
+   MPI_Allreduce(sendBuf, recvBuf, count, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+#else
+   for (int ii=0; ii<count; ++ii)
+      recvBuf[ii] = sendBuf[ii];
+#endif
+}
+
+
+void minRankDoubleParallel(RankReduceData* sendBuf, RankReduceData* recvBuf, int count)
+{
+#ifdef DO_MPI
+   MPI_Allreduce(sendBuf, recvBuf, count, MPI_DOUBLE_INT, MPI_MINLOC, MPI_COMM_WORLD);
+#else
+   for (int ii=0; ii<count; ++ii)
+   {
+      recvBuf[ii].val = sendBuf[ii].val;
+      recvBuf[ii].rank = sendBuf[ii].rank;
+   }
+#endif
+}
+
+void maxRankDoubleParallel(RankReduceData* sendBuf, RankReduceData* recvBuf, int count)
+{
+#ifdef DO_MPI
+   MPI_Allreduce(sendBuf, recvBuf, count, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);
+#else
+   for (int ii=0; ii<count; ++ii)
+   {
+      recvBuf[ii].val = sendBuf[ii].val;
+      recvBuf[ii].rank = sendBuf[ii].rank;
+   }
+#endif
+}
+
+/// \param [in] count Length of buf in bytes.
+void bcastParallel(void* buf, int count, int root)
+{
+#ifdef DO_MPI
+   MPI_Bcast(buf, count, MPI_BYTE, root, MPI_COMM_WORLD);
+#endif
+}
+
+int builtWithMpi(void)
+{
+#ifdef DO_MPI
+   return 1;
+#else
+   return 0;
+#endif
+}
+
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.h
@@ -0,0 +1,68 @@
+/// \file
+/// Performance timer functions.
+#ifndef __PERFORMANCE_TIMERS_H_
+#define __PERFORMANCE_TIMERS_H_
+
+#include <stdio.h>
+
+/// Timer handles
+enum TimerHandle{
+   totalTimer, 
+   loopTimer, 
+   timestepTimer, 
+   positionTimer, 
+   velocityTimer,  
+   redistributeTimer, 
+   atomHaloTimer, 
+   computeForceTimer, 
+   eamHaloTimer, 
+   commHaloTimer, 
+   commReduceTimer, 
+   numberOfTimers};
+
+/// Use the startTimer and stopTimer macros for timers in code regions
+/// that may be performance sensitive.  These can be compiled away by
+/// defining NTIMING.  If you are placing a timer anywere outside of a
+/// tight loop, consider calling profileStart and profileStop instead.
+///
+/// Place calls as follows to collect time for code pieces.
+/// Time is collected everytime this portion of code is executed. 
+///
+///     ...
+///     startTimer(computeForceTimer);
+///     computeForce(sim);
+///     stopTimer(computeForceTimer);
+///     ...
+///
+#ifndef NTIMING
+#define startTimer(handle)    \
+   do                         \
+{                          \
+   profileStart(handle);   \
+} while(0)
+#define stopTimer(handle)     \
+   do                         \
+{                          \
+   profileStop(handle);    \
+} while(0)
+#else
+#define startTimer(handle)
+#define stopTimer(handle)
+#endif
+
+/// Use profileStart and profileStop only for timers that should *never*
+/// be turned off.  Typically this means they are outside the main
+/// simulation loop.  If the timer is inside the main loop use
+/// startTimer and stopTimer instead.
+void profileStart(const enum TimerHandle handle);
+void profileStop(const enum TimerHandle handle);
+
+/// Use to get elapsed time (lap timer).
+double getElapsedTime(const enum TimerHandle handle);
+
+/// Print timing results.
+void printPerformanceResults(int nGlobalAtoms, int printRate);
+
+/// Print timing results to Yaml file
+void printPerformanceResultsYaml(FILE* file);
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.c
@@ -0,0 +1,309 @@
+/// \file
+/// Performance timer functions.
+///
+/// Use the timer functionality to collect timing and number of calls
+/// information for chosen computations (such as force calls) and
+/// communication (such as sends, receives, reductions).  Timing results
+/// are reported at the end of the run showing overall timings and
+/// statistics of timings across ranks.
+///
+/// A new timer can be added as follows:
+/// -# add new handle to the TimerHandle in performanceTimers.h
+/// -# provide a corresponding name in timerName
+///
+/// Note that the order of the handles and names must be the
+/// same. This order also determines the order in which the timers are
+/// printed. Names can contain leading spaces to show a hierarchical
+/// ordering.  Timers with zero calls are omitted from the report.
+///
+/// Raw timer data is obtained from the getTime() and getTick()
+/// functions.  The supplied portable versions of these functions can be
+/// replaced with platform specific versions for improved accuracy or
+/// lower latency.
+/// \see TimerHandle
+/// \see getTime
+/// \see getTick
+///
+
+
+#include "performanceTimers.h"
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <math.h>
+
+#include "performanceTimers.h"
+#include "mytype.h"
+#include "parallel.h"
+#include "yamlOutput.h"
+
+static uint64_t getTime(void);
+static double getTick(void);
+static void timerStats(void);
+
+/// You must add timer name in same order as enum in .h file.
+/// Leading spaces can be specified to show a hierarchy of timers.
+char* timerName[numberOfTimers] = {
+   "total",
+   "loop",
+   "timestep",
+   "  position",
+   "  velocity",
+   "  redistribute",
+   "    atomHalo",
+   "  force",
+   "    eamHalo",
+   "commHalo",
+   "commReduce"
+};
+
+/// Timer data collected.  Also facilitates computing averages and
+/// statistics.
+typedef struct TimersSt
+{
+   uint64_t start;     //!< call start time
+   uint64_t total;     //!< current total time
+   uint64_t count;     //!< current call count
+   uint64_t elapsed;   //!< lap time
+ 
+   int minRank;        //!< rank with min value
+   int maxRank;        //!< rank with max value
+
+   double minValue;    //!< min over ranks
+   double maxValue;    //!< max over ranks
+   double average;     //!< average over ranks
+   double stdev;       //!< stdev across ranks
+} Timers;
+
+/// Global timing data collected.
+typedef struct TimerGlobalSt
+{
+   double atomRate;       //!< average time (us) per atom per rank 
+   double atomAllRate;    //!< average time (us) per atom
+   double atomsPerUSec;   //!< average atoms per time (us)
+} TimerGlobal;
+
+static Timers perfTimer[numberOfTimers];
+static TimerGlobal perfGlobal;
+
+void profileStart(const enum TimerHandle handle)
+{
+   perfTimer[handle].start = getTime();
+}
+
+void profileStop(const enum TimerHandle handle)
+{
+   perfTimer[handle].count += 1;
+   uint64_t delta = getTime() - perfTimer[handle].start;
+   perfTimer[handle].total += delta;
+   perfTimer[handle].elapsed += delta;
+}
+
+/// \details
+/// Return elapsed time (in seconds) since last call with this handle
+/// and clear for next lap.
+double getElapsedTime(const enum TimerHandle handle)
+{
+   double etime = getTick() * (double)perfTimer[handle].elapsed;
+   perfTimer[handle].elapsed = 0;
+
+   return etime;
+}
+
+/// \details
+/// The report contains two blocks.  The upper block is performance
+/// information for the printRank.  The lower block is statistical
+/// information over all ranks.
+void printPerformanceResults(int nGlobalAtoms, int printRate)
+{
+   // Collect timer statistics overall and across ranks
+   timerStats();
+
+   if (!printRank())
+      return;
+
+   // only print timers with non-zero values.
+   double tick = getTick();
+   double loopTime = perfTimer[loopTimer].total*tick;
+   
+   fprintf(screenOut, "\n\nTimings for Rank %d\n", getMyRank());
+   fprintf(screenOut, "        Timer        # Calls    Avg/Call (s)   Total (s)    %% Loop\n");
+   fprintf(screenOut, "___________________________________________________________________\n");
+   for (int ii=0; ii<numberOfTimers; ++ii)
+   {
+      double totalTime = perfTimer[ii].total*tick;
+      if (perfTimer[ii].count > 0)
+         fprintf(screenOut, "%-16s%12"PRIu64"     %8.4f      %8.4f    %8.2f\n", 
+                 timerName[ii],
+                 perfTimer[ii].count,
+                 totalTime/(double)perfTimer[ii].count,
+                 totalTime,
+                 totalTime/loopTime*100.0);
+   }
+
+   fprintf(screenOut, "\nTiming Statistics Across %d Ranks:\n", getNRanks());
+   fprintf(screenOut, "        Timer        Rank: Min(s)       Rank: Max(s)      Avg(s)    Stdev(s)\n");
+   fprintf(screenOut, "_____________________________________________________________________________\n");
+
+   for (int ii = 0; ii < numberOfTimers; ++ii)
+   {
+      if (perfTimer[ii].count > 0)
+         fprintf(screenOut, "%-16s%6d:%10.4f  %6d:%10.4f  %10.4f  %10.4f\n", 
+            timerName[ii], 
+            perfTimer[ii].minRank, perfTimer[ii].minValue*tick,
+            perfTimer[ii].maxRank, perfTimer[ii].maxValue*tick,
+            perfTimer[ii].average*tick, perfTimer[ii].stdev*tick);
+   }
+   double atomsPerTask = nGlobalAtoms/(real_t)getNRanks();
+   perfGlobal.atomRate = perfTimer[timestepTimer].average * tick * 1e6 /
+      (atomsPerTask * perfTimer[timestepTimer].count * printRate);
+   perfGlobal.atomAllRate = perfTimer[timestepTimer].average * tick * 1e6 /
+      (nGlobalAtoms * perfTimer[timestepTimer].count * printRate);
+   perfGlobal.atomsPerUSec = 1.0 / perfGlobal.atomAllRate;
+
+   fprintf(screenOut, "\n---------------------------------------------------\n");
+   fprintf(screenOut, " Average atom update rate:     %6.2f us/atom/task\n", perfGlobal.atomRate);
+   fprintf(screenOut, "---------------------------------------------------\n\n");
+
+   fprintf(screenOut, "\n---------------------------------------------------\n");
+   fprintf(screenOut, " Average all atom update rate: %6.2f us/atom\n", perfGlobal.atomAllRate);
+   fprintf(screenOut, "---------------------------------------------------\n\n");
+
+   fprintf(screenOut, "\n---------------------------------------------------\n");
+   fprintf(screenOut, " Average atom rate:            %6.2f atoms/us\n", perfGlobal.atomsPerUSec);
+   fprintf(screenOut, "---------------------------------------------------\n\n");
+}
+
+void printPerformanceResultsYaml(FILE* file)
+{
+   if (! printRank())
+      return;
+
+   double tick = getTick();
+   double loopTime = perfTimer[loopTimer].total*tick;
+
+   fprintf(file,"\nPerformance Results:\n");
+   fprintf(file, "  TotalRanks: %d\n", getNRanks());
+   fprintf(file, "  ReportingTimeUnits: seconds\n");
+   fprintf(file, "Performance Results For Rank %d:\n", getMyRank());
+   for (int ii = 0; ii < numberOfTimers; ii++)
+   {
+      if (perfTimer[ii].count > 0)
+      {
+         double totalTime = perfTimer[ii].total*tick;
+         fprintf(file, "  Timer: %s\n", timerName[ii]);
+         fprintf(file, "    CallCount:  %"PRIu64"\n", perfTimer[ii].count); 
+         fprintf(file, "    AvgPerCall: %8.4f\n", totalTime/(double)perfTimer[ii].count);
+         fprintf(file, "    Total:      %8.4f\n", totalTime);
+         fprintf(file, "    PercentLoop: %8.2f\n", totalTime/loopTime*100);
+      }
+   }
+
+   fprintf(file, "Performance Results Across Ranks:\n");
+   for (int ii = 0; ii < numberOfTimers; ii++)
+   {
+      if (perfTimer[ii].count > 0)
+      {
+         fprintf(file, "  Timer: %s\n", timerName[ii]);
+         fprintf(file, "    MinRank: %d\n", perfTimer[ii].minRank);
+         fprintf(file, "    MinTime: %8.4f\n", perfTimer[ii].minValue*tick);     
+         fprintf(file, "    MaxRank: %d\n", perfTimer[ii].maxRank);
+         fprintf(file, "    MaxTime: %8.4f\n", perfTimer[ii].maxValue*tick);
+         fprintf(file, "    AvgTime: %8.4f\n", perfTimer[ii].average*tick);
+         fprintf(file, "    StdevTime: %8.4f\n", perfTimer[ii].stdev*tick);
+      }
+   }
+
+   fprintf(file,"Performance Global Update Rates:\n");
+   fprintf(file, "  AtomUpdateRate:\n");
+   fprintf(file, "    AverageRate: %6.2f\n", perfGlobal.atomRate);
+   fprintf(file, "    Units: us/atom/task\n");
+   fprintf(file, "  AllAtomUpdateRate:\n");
+   fprintf(file, "    AverageRate: %6.2f\n", perfGlobal.atomAllRate);
+   fprintf(file, "    Units: us/atom\n");
+   fprintf(file, "  AtomRate:\n");
+   fprintf(file, "    AverageRate: %6.2f\n", perfGlobal.atomsPerUSec);
+   fprintf(file, "    Units: atoms/us\n");
+ 
+   fprintf(file, "\n");
+}
+
+/// Returns current time as a 64-bit integer.  This portable version
+/// returns the number of microseconds since mindight, Jamuary 1, 1970.
+/// Hence, timing data will have a resolution of 1 microsecond.
+/// Platforms with access to calls with lower latency or higher
+/// resolution (such as a cycle counter) may wish to replace this
+/// implementation and change the conversion factor in getTick as
+/// appropriate.
+/// \see getTick for the conversion factor between the integer time
+/// units of this function and seconds.
+static uint64_t getTime(void)
+{
+   struct timeval ptime;
+   uint64_t t = 0;
+   gettimeofday(&ptime, (struct timezone *)NULL);
+   t = ((uint64_t)1000000)*(uint64_t)ptime.tv_sec + (uint64_t)ptime.tv_usec;
+
+   return t; 
+}
+
+/// Returns the factor for converting the integer time reported by
+/// getTime into seconds.  The portable getTime returns values in units
+/// of microseconds so the conversion is simply 1e-6.
+/// \see getTime
+static double getTick(void)
+{
+   double seconds_per_cycle = 1.0e-6;
+   return seconds_per_cycle; 
+}
+
+/// Collect timer statistics across ranks.
+void timerStats(void)
+{
+   double sendBuf[numberOfTimers], recvBuf[numberOfTimers];
+   
+   // Determine average of each timer across ranks
+   for (int ii = 0; ii < numberOfTimers; ii++)
+      sendBuf[ii] = (double)perfTimer[ii].total;
+   addDoubleParallel(sendBuf, recvBuf, numberOfTimers);
+
+   for (int ii = 0; ii < numberOfTimers; ii++)
+      perfTimer[ii].average = recvBuf[ii] / (double)getNRanks();
+
+
+   // Determine min and max across ranks and which rank
+   RankReduceData reduceSendBuf[numberOfTimers], reduceRecvBuf[numberOfTimers];
+   for (int ii = 0; ii < numberOfTimers; ii++)
+   {
+      reduceSendBuf[ii].val = (double)perfTimer[ii].total;
+      reduceSendBuf[ii].rank = getMyRank();
+   }
+   minRankDoubleParallel(reduceSendBuf, reduceRecvBuf, numberOfTimers);   
+   for (int ii = 0; ii < numberOfTimers; ii++)
+   {
+      perfTimer[ii].minValue = reduceRecvBuf[ii].val;
+      perfTimer[ii].minRank = reduceRecvBuf[ii].rank;
+   }
+   maxRankDoubleParallel(reduceSendBuf, reduceRecvBuf, numberOfTimers);   
+   for (int ii = 0; ii < numberOfTimers; ii++)
+   {
+      perfTimer[ii].maxValue = reduceRecvBuf[ii].val;
+      perfTimer[ii].maxRank = reduceRecvBuf[ii].rank;
+   }
+   
+   // Determine standard deviation
+   for (int ii = 0; ii < numberOfTimers; ii++)
+   {
+      double temp = (double)perfTimer[ii].total - perfTimer[ii].average;
+      sendBuf[ii] = temp * temp;
+   }
+   addDoubleParallel(sendBuf, recvBuf, numberOfTimers);
+   for (int ii = 0; ii < numberOfTimers; ii++)
+   {
+      perfTimer[ii].stdev = sqrt(recvBuf[ii] / (double) getNRanks());
+   }
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.h
@@ -0,0 +1,20 @@
+/// \file
+/// Simple random number generators for uniform and gaussian
+/// distributions.
+
+#ifndef __RANDOM_H
+#define __RANDOM_H
+
+#include "mytype.h"
+#include <stdint.h>
+
+/// Return a random number from a Gaussian distribution.
+real_t gasdev(uint64_t* seed);
+
+/// Return a random number from a uniform distribution.
+double lcg61(uint64_t* seed);
+
+/// Return a seed suitable for calling lcg61 or gasdev.
+uint64_t mkSeed(uint32_t id, uint32_t callSite);
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.c
@@ -0,0 +1,75 @@
+/// \file
+///  Simple random number generators for uniform and Gaussian
+///  distributions.  The generator in lcg61 and the hash in mkSeed aren't
+///  really industrial strength, but they're more than good enough for
+///  present purposes.
+
+#include "random.h"
+
+#include <math.h>
+
+/// \details
+///  Use the Box-Muller method to sample a Gaussian distribution with
+///  zero mean and unit variance.  To ensure the same input seed always
+///  generates the same returned value we do not use the standard
+///  technique of saving one of the two generated randoms for the next
+///  call.
+///
+///  \param [in,out] seed  Seed for generator.
+///
+///  \return A pseudo-random number in the interval (-infinity, infinity).
+real_t gasdev(uint64_t* seed)
+{
+   real_t rsq,v1,v2;
+   do 
+   {
+      v1 = 2.0*lcg61(seed)-1.0;
+      v2 = 2.0*lcg61(seed)-1.0;
+      rsq = v1*v1+v2*v2;
+   } while (rsq >= 1.0 || rsq == 0.0);
+
+   return v2 * sqrt(-2.0*log(rsq)/rsq);
+} 
+
+/// \details
+///  A 61-bit prime modulus linear congruential generator with
+/// modulus = 2^61 -1.
+///
+///  \param [in,out] seed  Seed for generator.
+///
+///  \return A pseudo-random number in the interval [0, 1].
+double lcg61(uint64_t* seed)
+{
+   static const double convertToDouble = 1.0/UINT64_C(2305843009213693951);
+
+   *seed *= UINT64_C(437799614237992725);
+   *seed %= UINT64_C(2305843009213693951);
+
+   return *seed*convertToDouble;
+}
+
+/// \details
+///  Forms a 64-bit seed for lcg61 from the combination of 2 32-bit Knuth
+///  multiplicative hashes, then runs off 10 values to pass up the worst
+///  of the early low-bit correlations.
+///
+///  \param [in] id An id number such as an atom gid that is unique to
+///  each entity that requires random numbers.
+///
+///  \param [in] callSite A unique number for each call site in the code
+///  that needs to generate random seeds.  Using a different value for
+///  callSite allows different parts of the code to obtain different 
+///  random streams for the same id.
+/// 
+///  \return A 64-bit seed that is unique to the id and call site.
+uint64_t mkSeed(uint32_t id, uint32_t callSite)
+{
+   uint32_t s1 = id * UINT32_C(2654435761);
+   uint32_t s2 = (id+callSite) * UINT32_C(2654435761);
+
+   uint64_t iSeed = (UINT64_C(0x100000000) * s1) + s2;
+   for (unsigned jj=0; jj<10; ++jj)
+      lcg61(&iSeed);
+      
+   return iSeed;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.h
@@ -0,0 +1,16 @@
+/// \file
+/// Leapfrog time integrator
+
+#ifndef __LEAPFROG_H
+#define __LEAPFROG_H
+
+#include "CoMDTypes.h"
+
+double timestep(SimFlat* s, int n, real_t dt);
+void computeForce(SimFlat* s);
+void kineticEnergy(SimFlat* s);
+
+/// Update local and remote link cells after atoms have moved.
+void redistributeAtoms(struct SimFlatSt* sim);
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.c
@@ -0,0 +1,146 @@
+/// \file
+/// Leapfrog time integrator
+
+#include "timestep.h"
+
+#include "CoMDTypes.h"
+#include "linkCells.h"
+#include "parallel.h"
+#include "performanceTimers.h"
+
+static void advanceVelocity(SimFlat* s, int nBoxes, real_t dt);
+static void advancePosition(SimFlat* s, int nBoxes, real_t dt);
+
+
+/// Advance the simulation time to t+dt using a leap frog method
+/// (equivalent to velocity verlet).
+///
+/// Forces must be computed before calling the integrator the first time.
+///
+///  - Advance velocities half time step using forces
+///  - Advance positions full time step using velocities
+///  - Update link cells and exchange remote particles
+///  - Compute forces
+///  - Update velocities half time step using forces
+///
+/// This leaves positions, velocities, and forces at t+dt, with the
+/// forces ready to perform the half step velocity update at the top of
+/// the next call.
+///
+/// After nSteps the kinetic energy is computed for diagnostic output.
+double timestep(SimFlat* s, int nSteps, real_t dt)
+{
+   for (int ii=0; ii<nSteps; ++ii)
+   {
+      startTimer(velocityTimer);
+      advanceVelocity(s, s->boxes->nLocalBoxes, 0.5*dt); 
+      stopTimer(velocityTimer);
+
+      startTimer(positionTimer);
+      advancePosition(s, s->boxes->nLocalBoxes, dt);
+      stopTimer(positionTimer);
+
+      startTimer(redistributeTimer);
+      redistributeAtoms(s);
+      stopTimer(redistributeTimer);
+
+      startTimer(computeForceTimer);
+      computeForce(s);
+      stopTimer(computeForceTimer);
+
+      startTimer(velocityTimer);
+      advanceVelocity(s, s->boxes->nLocalBoxes, 0.5*dt); 
+      stopTimer(velocityTimer);
+   }
+
+   kineticEnergy(s);
+
+   return s->ePotential;
+}
+
+void computeForce(SimFlat* s)
+{
+   s->pot->force(s);
+}
+
+
+void advanceVelocity(SimFlat* s, int nBoxes, real_t dt)
+{
+   for (int iBox=0; iBox<nBoxes; iBox++)
+   {
+      for (int iOff=MAXATOMS*iBox,ii=0; ii<s->boxes->nAtoms[iBox]; ii++,iOff++)
+      {
+         s->atoms->p[iOff][0] += dt*s->atoms->f[iOff][0];
+         s->atoms->p[iOff][1] += dt*s->atoms->f[iOff][1];
+         s->atoms->p[iOff][2] += dt*s->atoms->f[iOff][2];
+      }
+   }
+}
+
+void advancePosition(SimFlat* s, int nBoxes, real_t dt)
+{
+   for (int iBox=0; iBox<nBoxes; iBox++)
+   {
+      for (int iOff=MAXATOMS*iBox,ii=0; ii<s->boxes->nAtoms[iBox]; ii++,iOff++)
+      {
+         int iSpecies = s->atoms->iSpecies[iOff];
+         real_t invMass = 1.0/s->species[iSpecies].mass;
+         s->atoms->r[iOff][0] += dt*s->atoms->p[iOff][0]*invMass;
+         s->atoms->r[iOff][1] += dt*s->atoms->p[iOff][1]*invMass;
+         s->atoms->r[iOff][2] += dt*s->atoms->p[iOff][2]*invMass;
+      }
+   }
+}
+
+/// Calculates total kinetic and potential energy across all tasks.  The
+/// local potential energy is a by-product of the force routine.
+void kineticEnergy(SimFlat* s)
+{
+   real_t eLocal[2];
+   eLocal[0] = s->ePotential;
+   eLocal[1] = 0;
+   for (int iBox=0; iBox<s->boxes->nLocalBoxes; iBox++)
+   {
+      for (int iOff=MAXATOMS*iBox,ii=0; ii<s->boxes->nAtoms[iBox]; ii++,iOff++)
+      {
+         int iSpecies = s->atoms->iSpecies[iOff];
+         real_t invMass = 0.5/s->species[iSpecies].mass;
+         eLocal[1] += ( s->atoms->p[iOff][0] * s->atoms->p[iOff][0] +
+         s->atoms->p[iOff][1] * s->atoms->p[iOff][1] +
+         s->atoms->p[iOff][2] * s->atoms->p[iOff][2] )*invMass;
+      }
+   }
+
+   real_t eSum[2];
+   startTimer(commReduceTimer);
+   addRealParallel(eLocal, eSum, 2);
+   stopTimer(commReduceTimer);
+
+   s->ePotential = eSum[0];
+   s->eKinetic = eSum[1];
+}
+
+/// \details
+/// This function provides one-stop shopping for the sequence of events
+/// that must occur for a proper exchange of halo atoms after the atom
+/// positions have been updated by the integrator.
+///
+/// - updateLinkCells: Since atoms have moved, some may be in the wrong
+///   link cells.
+/// - haloExchange (atom version): Sends atom data to remote tasks. 
+/// - sort: Sort the atoms.
+///
+/// \see updateLinkCells
+/// \see initAtomHaloExchange
+/// \see sortAtomsInCell
+void redistributeAtoms(SimFlat* sim)
+{
+   updateLinkCells(sim->boxes, sim->atoms);
+
+   startTimer(atomHaloTimer);
+   haloExchange(sim->atomExchange, sim);
+   stopTimer(atomHaloTimer);
+
+   for (int ii=0; ii<sim->boxes->nTotalBoxes; ++ii)
+      sortAtomsInCell(sim->atoms, sim->boxes, ii);
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.h
@@ -0,0 +1,19 @@
+/// \file
+/// Write simulation information in YAML format.
+
+#ifndef __YAML_OUTPUT_H
+#define __YAML_OUTPUT_H
+
+#include <stdio.h>
+
+/// Provide access to the YAML file in other compilation units.
+extern FILE* yamlFile;
+
+void yamlBegin(void);
+void yamlEnd(void);
+
+void yamlAppInfo(FILE* file);
+
+void printSeparator(FILE* file);
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.c
@@ -0,0 +1,108 @@
+/// \file
+/// Write simulation information in YAML format.
+///
+/// Information regarding platform, run parameters, performance, etc.,
+/// are written to a file whose name is generated from the CoMDVariant
+/// and the time of the run.  This provides a simple mechanism to track
+/// and compare performance etc.
+///
+/// There are much more sophisticated libraries and routines available
+/// to handle YAML, but this simple implemenation handles everything we
+/// really need.
+
+#include "yamlOutput.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "CoMD_info.h"
+#include "mytype.h"
+#include "parallel.h"
+
+FILE* yamlFile = NULL;
+
+static const char* CoMDVersion = "1.1";
+static const char* CoMDVariant = CoMD_VARIANT;
+
+static void getTimeString(char* timestring)
+{
+#ifndef PRINT_TIMING
+   *timestring = '\0';
+   return;
+#endif
+
+   time_t rawtime;
+   struct tm* timeinfo;
+   time(&rawtime);
+   timeinfo = localtime(&rawtime);
+
+   sprintf(timestring,
+           "%4d-%02i-%02d, %02d:%02d:%02d",
+           timeinfo->tm_year+1900,
+           timeinfo->tm_mon+1,
+           timeinfo->tm_mday,
+           timeinfo->tm_hour,
+           timeinfo->tm_min,
+           timeinfo->tm_sec);
+}
+
+void yamlBegin(void)
+{
+   if (! printRank())
+      return;
+
+   char filename[64];
+   time_t rawtime;
+   time (&rawtime);
+   struct tm* ptm = localtime(&rawtime);
+   char sdate[25];
+  //use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
+   sprintf (sdate,"%04d:%02d:%02d-%02d:%02d:%02d",
+            ptm->tm_year + 1900, ptm->tm_mon+1,
+            ptm->tm_mday, ptm->tm_hour, ptm->tm_min,ptm->tm_sec);
+   sprintf(filename, "%s.%s.yaml", CoMDVariant, sdate);
+   yamlFile = fopen(filename, "w");
+}
+
+void yamlAppInfo(FILE* file)
+{
+   if (! printRank())
+      return;
+   printSeparator(file);
+   fprintf(file,"Mini-Application Name    : %s\n", CoMDVariant);
+   fprintf(file,"Mini-Application Version : %s\n", CoMDVersion);
+   fprintf(file,"Platform:\n");
+   fprintf(file,"  hostname: %s\n",         CoMD_HOSTNAME);
+   fprintf(file,"  kernel name: %s\n",      CoMD_KERNEL_NAME);
+   fprintf(file,"  kernel release: %s\n",   CoMD_KERNEL_RELEASE);
+   fprintf(file,"  processor: %s\n",        CoMD_PROCESSOR);
+   fprintf(file,"Build:\n");
+   fprintf(file,"  CC: %s\n",               CoMD_COMPILER);
+   fprintf(file,"  compiler version: %s\n", CoMD_COMPILER_VERSION);
+   fprintf(file,"  CFLAGS: %s\n",           CoMD_CFLAGS);
+   fprintf(file,"  LDFLAGS: %s\n",          CoMD_LDFLAGS);
+   fprintf(file,"  using MPI: %s\n",        builtWithMpi() ? "true":"false");
+   fprintf(file,"  Threading: none\n");
+   fprintf(file,"  Double Precision: %s\n", (sizeof(real_t)==sizeof(double)?"true":"false"));
+   char timestring[32];
+   getTimeString(timestring);
+   fprintf(file,"Run Date/Time: %s\n", timestring);
+   fprintf(file, "\n");
+   fflush(file);
+}
+
+void yamlEnd(void)
+{
+   if (! printRank())
+      return;
+
+   fclose(yamlFile);
+}
+
+void printSeparator(FILE* file)
+{
+      //fprintf(file,"=========================================================================\n");
+      fprintf(file,"\n");
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile
@@ -6,6 +6,7 @@
                 Pathfinder \
                 miniGMG    \
                 RSBench    \
-                SimpleMOC
+                SimpleMOC  \
+                CoMD
 
 include $(LEVEL)/Makefile.programs