Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt @@ -4,3 +4,4 @@ add_subdirectory(miniGMG) add_subdirectory(RSBench) add_subdirectory(SimpleMOC) +add_subdirectory(CoMD) Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeLists.txt @@ -0,0 +1,6 @@ +list(APPEND CFLAGS -std=c99 -DDOUBLE) +list(APPEND LDFLAGS -lm) + +set(RUN_OPTIONS -e -x 10 -y 10 -z 10 --potDir .) +llvm_multisource(CoMD) +llvm_test_data(CoMD Cu_u6.eam) Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.c @@ -0,0 +1,1107 @@ +/// \file +/// Main program +/// +/// \mainpage CoMD: A Classical Molecular Dynamics Mini-app +/// +/// CoMD is a reference implementation of typical classical molecular +/// dynamics algorithms and workloads. It is created and maintained by +/// The Exascale Co-Design Center for Materials in Extreme Environments +/// (ExMatEx). http://codesign.lanl.gov/projects/exmatex. The +/// code is intended to serve as a vehicle for co-design by allowing +/// others to extend and/or reimplement it as needed to test performance of +/// new architectures, programming models, etc. +/// +/// The current version of CoMD is available from: +/// http://exmatex.github.io/CoMD +/// +/// To contact the developers of CoMD send email to: exmatex-comd@llnl.gov. +/// +/// Table of Contents +/// ================= +/// +/// Click on the links below to browse the CoMD documentation. +/// +/// \subpage pg_md_basics +/// +/// \subpage pg_building_comd +/// +/// \subpage pg_running_comd +/// +/// \subpage pg_measuring_performance +/// +/// \subpage pg_problem_selection_and_scaling +/// +/// \subpage pg_verifying_correctness +/// +/// \subpage pg_comd_architecture +/// +/// \subpage pg_optimization_targets +/// +/// \subpage pg_whats_new + +#include +#include +#include +#include +#include +#include + +#include "CoMDTypes.h" +#include "decomposition.h" +#include "linkCells.h" +#include "eam.h" +#include "ljForce.h" +#include "initAtoms.h" +#include "memUtils.h" +#include "yamlOutput.h" +#include "parallel.h" +#include "performanceTimers.h" +#include "mycommand.h" +#include "timestep.h" +#include "constants.h" + +#define REDIRECT_OUTPUT 0 +#define MIN(A,B) ((A) < (B) ? (A) : (B)) + +static SimFlat* initSimulation(Command cmd); +static void destroySimulation(SimFlat** ps); + +static void initSubsystems(void); +static void finalizeSubsystems(void); + +static BasePotential* initPotential( + int doeam, const char* potDir, const char* potName, const char* potType); +static SpeciesData* initSpecies(BasePotential* pot); +static Validate* initValidate(SimFlat* s); +static void validateResult(const Validate* val, SimFlat *sim); + +static void sumAtoms(SimFlat* s); +static void printThings(SimFlat* s, int iStep, double elapsedTime); +static void printSimulationDataYaml(FILE* file, SimFlat* s); +static void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]); + + +int main(int argc, char** argv) +{ + // Prolog + initParallel(&argc, &argv); + profileStart(totalTimer); + initSubsystems(); + timestampBarrier("Starting Initialization\n"); + +#ifdef PRINT_YAML + yamlAppInfo(yamlFile); +#endif + yamlAppInfo(screenOut); + + Command cmd = parseCommandLine(argc, argv); +#ifdef PRINT_YAML + printCmdYaml(yamlFile, &cmd); +#endif + printCmdYaml(screenOut, &cmd); + + SimFlat* sim = initSimulation(cmd); +#ifdef PRINT_YAML + printSimulationDataYaml(yamlFile, sim); +#endif + printSimulationDataYaml(screenOut, sim); + + Validate* validate = initValidate(sim); // atom counts, energy + timestampBarrier("Initialization Finished\n"); + + timestampBarrier("Starting simulation\n"); + + // This is the CoMD main loop + const int nSteps = sim->nSteps; + const int printRate = sim->printRate; + int iStep = 0; + profileStart(loopTimer); + for (; iStepdt); + stopTimer(timestepTimer); + + iStep += printRate; + } + profileStop(loopTimer); + + sumAtoms(sim); + printThings(sim, iStep, getElapsedTime(timestepTimer)); + timestampBarrier("Ending simulation\n"); + + // Epilog + validateResult(validate, sim); + profileStop(totalTimer); + +#ifdef PRINT_TIMING + printPerformanceResults(sim->atoms->nGlobal, sim->printRate); + printPerformanceResultsYaml(yamlFile); +#endif + + destroySimulation(&sim); + comdFree(validate); + finalizeSubsystems(); + + timestampBarrier("CoMD Ending\n"); + destroyParallel(); + + return 0; +} + +/// Initialized the main CoMD data stucture, SimFlat, based on command +/// line input from the user. Also performs certain sanity checks on +/// the input to screen out certain non-sensical inputs. +/// +/// Simple data members such as the time step dt are initialized +/// directly, substructures such as the potential, the link cells, the +/// atoms, etc., are initialized by calling additional initialization +/// functions (initPotential(), initLinkCells(), initAtoms(), etc.). +/// Initialization order is set by the natural dependencies of the +/// substructure such as the atoms need the link cells so the link cells +/// must be initialized before the atoms. +SimFlat* initSimulation(Command cmd) +{ + SimFlat* sim = comdMalloc(sizeof(SimFlat)); + sim->nSteps = cmd.nSteps; + sim->printRate = cmd.printRate; + sim->dt = cmd.dt; + sim->domain = NULL; + sim->boxes = NULL; + sim->atoms = NULL; + sim->ePotential = 0.0; + sim->eKinetic = 0.0; + sim->atomExchange = NULL; + + sim->pot = initPotential(cmd.doeam, cmd.potDir, cmd.potName, cmd.potType); + real_t latticeConstant = cmd.lat; + if (cmd.lat < 0.0) + latticeConstant = sim->pot->lat; + + // ensure input parameters make sense. + sanityChecks(cmd, sim->pot->cutoff, latticeConstant, sim->pot->latticeType); + + sim->species = initSpecies(sim->pot); + + real3 globalExtent; + globalExtent[0] = cmd.nx * latticeConstant; + globalExtent[1] = cmd.ny * latticeConstant; + globalExtent[2] = cmd.nz * latticeConstant; + + sim->domain = initDecomposition( + cmd.xproc, cmd.yproc, cmd.zproc, globalExtent); + + sim->boxes = initLinkCells(sim->domain, sim->pot->cutoff); + sim->atoms = initAtoms(sim->boxes); + + // create lattice with desired temperature and displacement. + createFccLattice(cmd.nx, cmd.ny, cmd.nz, latticeConstant, sim); + setTemperature(sim, cmd.temperature); + randomDisplacements(sim, cmd.initialDelta); + + sim->atomExchange = initAtomHaloExchange(sim->domain, sim->boxes); + + // Forces must be computed before we call the time stepper. + startTimer(redistributeTimer); + redistributeAtoms(sim); + stopTimer(redistributeTimer); + + startTimer(computeForceTimer); + computeForce(sim); + stopTimer(computeForceTimer); + + kineticEnergy(sim); + + return sim; +} + +/// frees all data associated with *ps and frees *ps +void destroySimulation(SimFlat** ps) +{ + if ( ! ps ) return; + + SimFlat* s = *ps; + if ( ! s ) return; + + BasePotential* pot = s->pot; + if ( pot) pot->destroy(&pot); + destroyLinkCells(&(s->boxes)); + destroyAtoms(s->atoms); + destroyHaloExchange(&(s->atomExchange)); + comdFree(s->species); + comdFree(s->domain); + comdFree(s); + *ps = NULL; + + return; +} + +void initSubsystems(void) +{ +#if REDIRECT_OUTPUT + freopen("testOut.txt","w",screenOut); +#endif + +#ifdef PRINT_YAML + yamlBegin(); +#endif +} + +void finalizeSubsystems(void) +{ +#if REDIRECT_OUTPUT + fclose(screenOut); +#endif + +#ifdef PRINT_YAML + yamlEnd(); +#endif +} + +/// decide whether to get LJ or EAM potentials +BasePotential* initPotential( + int doeam, const char* potDir, const char* potName, const char* potType) +{ + BasePotential* pot = NULL; + + if (doeam) + pot = initEamPot(potDir, potName, potType); + else + pot = initLjPot(); + assert(pot); + return pot; +} + +SpeciesData* initSpecies(BasePotential* pot) +{ + SpeciesData* species = comdMalloc(sizeof(SpeciesData)); + + strcpy(species->name, pot->name); + species->atomicNo = pot->atomicNo; + species->mass = pot->mass; + + return species; +} + +Validate* initValidate(SimFlat* sim) +{ + sumAtoms(sim); + Validate* val = comdMalloc(sizeof(Validate)); + val->eTot0 = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal; + val->nAtoms0 = sim->atoms->nGlobal; + + if (printRank()) + { + fprintf(screenOut, "\n"); + printSeparator(screenOut); + fprintf(screenOut, "Initial energy : %14.12f, atom count : %d \n", + val->eTot0, val->nAtoms0); + fprintf(screenOut, "\n"); + } + return val; +} + +void validateResult(const Validate* val, SimFlat* sim) +{ + if (printRank()) + { + real_t eFinal = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal; + + int nAtomsDelta = (sim->atoms->nGlobal - val->nAtoms0); + + fprintf(screenOut, "\n"); + fprintf(screenOut, "\n"); + fprintf(screenOut, "Simulation Validation:\n"); + + fprintf(screenOut, " Initial energy : %14.12f\n", val->eTot0); + fprintf(screenOut, " Final energy : %14.12f\n", eFinal); + fprintf(screenOut, " eFinal/eInitial : %f\n", eFinal/val->eTot0); + if ( nAtomsDelta == 0) + { + fprintf(screenOut, " Final atom count : %d, no atoms lost\n", + sim->atoms->nGlobal); + } + else + { + fprintf(screenOut, "#############################\n"); + fprintf(screenOut, "# WARNING: %6d atoms lost #\n", nAtomsDelta); + fprintf(screenOut, "#############################\n"); + } + } +} + +void sumAtoms(SimFlat* s) +{ + // sum atoms across all processers + s->atoms->nLocal = 0; + for (int i = 0; i < s->boxes->nLocalBoxes; i++) + { + s->atoms->nLocal += s->boxes->nAtoms[i]; + } + + startTimer(commReduceTimer); + addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1); + stopTimer(commReduceTimer); +} + +/// Prints current time, energy, performance etc to monitor the state of +/// the running simulation. Performance per atom is scaled by the +/// number of local atoms per process this should give consistent timing +/// assuming reasonable load balance +void printThings(SimFlat* s, int iStep, double elapsedTime) +{ + // keep track previous value of iStep so we can calculate number of steps. + static int iStepPrev = -1; + static int firstCall = 1; + + int nEval = iStep - iStepPrev; // gives nEval = 1 for zeroth step. + iStepPrev = iStep; + + if (! printRank() ) + return; + + if (firstCall) + { + firstCall = 0; + fprintf(screenOut, + "# Performance\n" + "# Loop Time(fs) Total Energy Potential Energy Kinetic Energy Temperature (us/atom) # Atoms\n"); + fflush(screenOut); + } + + real_t time = iStep*s->dt; + real_t eTotal = (s->ePotential+s->eKinetic) / s->atoms->nGlobal; + real_t eK = s->eKinetic / s->atoms->nGlobal; + real_t eU = s->ePotential / s->atoms->nGlobal; + real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5); + + double timePerAtom = 1.0e6*elapsedTime/(double)(nEval*s->atoms->nLocal); +#ifndef PRINT_TIMING + timePerAtom = 0.0; +#endif + + fprintf(screenOut, " %6d %10.2f %18.12f %18.12f %18.12f %12.4f %10.4f %12d\n", + iStep, time, eTotal, eU, eK, Temp, timePerAtom, s->atoms->nGlobal); +} + +/// Print information about the simulation in a format that is (mostly) +/// YAML compliant. +void printSimulationDataYaml(FILE* file, SimFlat* s) +{ + // All ranks get maxOccupancy + int maxOcc = maxOccupancy(s->boxes); + + // Only rank 0 prints + if (! printRank()) + return; + + fprintf(file,"Simulation data: \n"); + fprintf(file," Total atoms : %d\n", + s->atoms->nGlobal); + fprintf(file," Min global bounds : [ %14.10f, %14.10f, %14.10f ]\n", + s->domain->globalMin[0], s->domain->globalMin[1], s->domain->globalMin[2]); + fprintf(file," Max global bounds : [ %14.10f, %14.10f, %14.10f ]\n", + s->domain->globalMax[0], s->domain->globalMax[1], s->domain->globalMax[2]); + printSeparator(file); + fprintf(file,"Decomposition data: \n"); + fprintf(file," Processors : %6d,%6d,%6d\n", + s->domain->procGrid[0], s->domain->procGrid[1], s->domain->procGrid[2]); + fprintf(file," Local boxes : %6d,%6d,%6d = %8d\n", + s->boxes->gridSize[0], s->boxes->gridSize[1], s->boxes->gridSize[2], + s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]); + fprintf(file," Box size : [ %14.10f, %14.10f, %14.10f ]\n", + s->boxes->boxSize[0], s->boxes->boxSize[1], s->boxes->boxSize[2]); + fprintf(file," Box factor : [ %14.10f, %14.10f, %14.10f ] \n", + s->boxes->boxSize[0]/s->pot->cutoff, + s->boxes->boxSize[1]/s->pot->cutoff, + s->boxes->boxSize[2]/s->pot->cutoff); + fprintf(file, " Max Link Cell Occupancy: %d of %d\n", + maxOcc, MAXATOMS); + printSeparator(file); + fprintf(file,"Potential data: \n"); + s->pot->print(file, s->pot); + + // Memory footprint diagnostics + int perAtomSize = 10*sizeof(real_t)+2*sizeof(int); + float mbPerAtom = perAtomSize/1024/1024; + float totalMemLocal = (float)(perAtomSize*s->atoms->nLocal)/1024/1024; + float totalMemGlobal = (float)(perAtomSize*s->atoms->nGlobal)/1024/1024; + + int nLocalBoxes = s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]; + int nTotalBoxes = (s->boxes->gridSize[0]+2)*(s->boxes->gridSize[1]+2)*(s->boxes->gridSize[2]+2); + float paddedMemLocal = (float) nLocalBoxes*(perAtomSize*MAXATOMS)/1024/1024; + float paddedMemTotal = (float) nTotalBoxes*(perAtomSize*MAXATOMS)/1024/1024; + + printSeparator(file); + fprintf(file,"Memory data: \n"); + fprintf(file, " Intrinsic atom footprint = %4d B/atom \n", perAtomSize); + fprintf(file, " Total atom footprint = %7.3f MB (%6.2f MB/node)\n", totalMemGlobal, totalMemLocal); + fprintf(file, " Link cell atom footprint = %7.3f MB/node\n", paddedMemLocal); + fprintf(file, " Link cell atom footprint = %7.3f MB/node (including halo cell data\n", paddedMemTotal); + + fflush(file); +} + +/// Check that the user input meets certain criteria. +void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]) +{ + int failCode = 0; + + // Check that domain grid matches number of ranks. (fail code 1) + int nProcs = cmd.xproc * cmd.yproc * cmd.zproc; + if (nProcs != getNRanks()) + { + failCode |= 1; + if (printRank() ) + fprintf(screenOut, + "\nNumber of MPI ranks must match xproc * yproc * zproc\n"); + } + + // Check whether simuation is too small (fail code 2) + double minx = 2*cutoff*cmd.xproc; + double miny = 2*cutoff*cmd.yproc; + double minz = 2*cutoff*cmd.zproc; + double sizex = cmd.nx*latticeConst; + double sizey = cmd.ny*latticeConst; + double sizez = cmd.nz*latticeConst; + + if ( sizex < minx || sizey < miny || sizez < minz) + { + failCode |= 2; + if (printRank()) + fprintf(screenOut,"\nSimulation too small.\n" + " Increase the number of unit cells to make the simulation\n" + " at least (%3.2f, %3.2f. %3.2f) Ansgstroms in size\n", + minx, miny, minz); + } + + // Check for supported lattice structure (fail code 4) + if (strcasecmp(latticeType, "FCC") != 0) + { + failCode |= 4; + if ( printRank() ) + fprintf(screenOut, + "\nOnly FCC Lattice type supported, not %s. Fatal Error.\n", + latticeType); + } + int checkCode = failCode; + bcastParallel(&checkCode, sizeof(int), 0); + // This assertion can only fail if different tasks failed different + // sanity checks. That should not be possible. + assert(checkCode == failCode); + + if (failCode != 0) + exit(failCode); +} + +// -------------------------------------------------------------- + + +/// \page pg_building_comd Building CoMD +/// +/// CoMD is written with portability in mind and should compile using +/// practically any compiler that implements the C99 standard. You will +/// need to create a Makefile by copying the sample provided with the +/// distribution (Makefile.vanilla). +/// +/// $ cp Makefile.vanilla Makefile +/// +/// and use the make command to build the code +/// +/// $ make +/// +/// The sample Makefile will compile the code on many platforms. See +/// comments in Makefile.vanilla for information about specifying the +/// name of the C compiler, and/or additional compiler switches that +/// might be necessary for your platform. +/// +/// The main options available in the Makefile are toggling single/double +/// precision and enabling/disabling MPI. In the event MPI is not +/// available, setting the DO_MPI flag to OFF will create a purely +/// serial build (you will likely also need to change the setting of +/// CC). +/// +/// The makefile should handle all the dependency checking needed, via +/// makedepend. +/// +/// 'make clean' removes the object and dependency files. +/// +/// 'make distclean' additionally removes the executable file and the +/// documentation files. +/// +/// Other build options +/// ------------------- +/// +/// Various other options are made available by \#define arguments within +/// some of the source files. +/// +/// #REDIRECT_OUTPUT in CoMD.c +/// +/// Setting this to 1 will redirect all screen output to a file, +/// currently set to 'testOut.txt'. +/// +/// #POT_SHIFT in ljForce.c +/// +/// This is set to 1.0 by default, and shifts the values of the cohesive +/// energy given by the Lennard-Jones potential so it is zero at the +/// cutoff radius. This setting improves energy conservation +/// step-to-step as it reduces the noise generated by atoms crossing the +/// cutoff threshold. However, it does not affect the long-term energy +/// conservation of the code. +/// +/// #MAXATOMS in linkCells.h +/// +/// The default value is 64, which allows ample padding of the linkCell +/// structure to allow for density fluctuations. Reducing it may improve +/// the efficiency of the code via improved thread utilization and +/// reduced memory footprint. + +// -------------------------------------------------------------- + + +// -------------------------------------------------------------- + + +/// \page pg_measuring_performance Measuring Performance +/// +/// CoMD implements a simple and extensible system of internal timers to +/// measure the performance profile of the code. As explained in +/// performanceTimers.c, it is easy to create additional timers and +/// associate them with code regions of specific interest. In addition, +/// the getTime() and getTick() functions can be easily reimplemented to +/// take advantage of platform specific timing resources. +/// +/// A timing report is printed at the end of each simulation. +/// +/// ~~~~ +/// Timings for Rank 0 +/// Timer # Calls Avg/Call (s) Total (s) % Loop +/// ___________________________________________________________________ +/// total 1 50.6701 50.6701 100.04 +/// loop 1 50.6505 50.6505 100.00 +/// timestep 1 50.6505 50.6505 100.00 +/// position 10000 0.0000 0.0441 0.09 +/// velocity 20000 0.0000 0.0388 0.08 +/// redistribute 10001 0.0003 3.4842 6.88 +/// atomHalo 10001 0.0002 2.4577 4.85 +/// force 10001 0.0047 47.0856 92.96 +/// eamHalo 10001 0.0001 1.0592 2.09 +/// commHalo 60006 0.0000 1.7550 3.46 +/// commReduce 12 0.0000 0.0003 0.00 +/// +/// Timing Statistics Across 8 Ranks: +/// Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s) +/// _____________________________________________________________________________ +/// total 3: 50.6697 0: 50.6701 50.6699 0.0001 +/// loop 0: 50.6505 4: 50.6505 50.6505 0.0000 +/// timestep 0: 50.6505 4: 50.6505 50.6505 0.0000 +/// position 2: 0.0437 0: 0.0441 0.0439 0.0001 +/// velocity 2: 0.0380 4: 0.0392 0.0385 0.0004 +/// redistribute 0: 3.4842 1: 3.7085 3.6015 0.0622 +/// atomHalo 0: 2.4577 7: 2.6441 2.5780 0.0549 +/// force 1: 46.8624 0: 47.0856 46.9689 0.0619 +/// eamHalo 3: 0.2269 6: 1.2936 1.0951 0.3344 +/// commHalo 3: 1.0803 6: 2.1856 1.9363 0.3462 +/// commReduce 6: 0.0002 2: 0.0003 0.0003 0.0000 +/// +/// --------------------------------------------------- +/// Average atom update rate: 9.39 us/atom/task +/// --------------------------------------------------- +/// +/// ~~~~ +/// This report consists of two blocks. The upper block lists the absolute +/// wall clock time spent in each timer on rank 0 of the job. The lower +/// block reports minimum, maximum, average, and standard deviation of +/// times across all tasks. +/// The ranks where the minimum and maximum values occured are also reported +/// to aid in identifying hotspots or load imbalances. +/// +/// The last line of the report gives the atom update rate in +/// microseconds/atom/task. Since this quantity is normalized by both +/// the number of atoms and the number of tasks it provides a simple +/// figure of merit to compare performance between runs with different +/// numbers of atoms and different numbers of tasks. Any increase in +/// this number relative to a large number of atoms on a single task +/// represents a loss of parallel efficiency. +/// +/// Choosing the problem size correctly has important implications for the +/// reported performance. Small problem sizes may run entirely in the cache +/// of some architectures, leading to very good performance results. +/// For general characterization of performance, it is probably best to +/// choose problem sizes which force the code to access main memory, even +/// though there may be strong scaling scenarios where the code is indeed +/// running mainly in cache. +/// +/// *** Architecture/Configuration for above timing numbers: +/// SGI XE1300 cluster with dual-socket Intel quad-core Nehalem processors. +/// Each node has 2 Quad-Core Xeon X5550 processors runnning at 2.66 GHz +/// with 3 GB of memory per core. + +// -------------------------------------------------------------- + + +/// \page pg_problem_selection_and_scaling Problem Selection and Scaling +/// +/// CoMD is a reference molecular dynamics simulation code as used in +/// materials science. +/// +/// Problem Specification {#sec_problem_spec} +/// ====================== +/// +/// The reference problem is solid Copper starting from a face-centered +/// cubic (FCC) lattice. The initial thermodynamic conditions +/// (Temperature and Volume (via the lattice spacing, lat))can be specified +/// from the command line input. The default is 600 K and standard +/// volume (lat = 3.615 Angstroms). +/// Different temperatures (e.g. T =3000K) and volumes can be +/// specified to melt the system and enhance the interchange of atoms +/// between domains. +/// +/// The dynamics is micro-canonical (NVE = constant Number of atoms, +/// constant total system Volume, and constant total system Energy). As +/// a result, the temperature is not fixed. Rather, the temperature will +/// adjust from the initial temperature (as specified on the command line) +/// to a final temperature as the total system kinetic energy comes into +/// equilibrium with the total system potential energy. +/// +/// The total size of the problem (number of atoms) is specified by the +/// number (nx, ny, nz) of FCC unit cells in the x, y, z directions: nAtoms +/// = 4 * nx * ny * nz. The default size is nx = ny = nz = 20 or 32,000 atoms. +/// +/// The simulation models bulk copper by replicating itself in every +/// direction using periodic boundary conditions. +/// +/// Two interatomic force models are available: the Lennard-Jones (LJ) +/// two-body potential (ljForce.c) and the many-body Embedded-Atom Model (EAM) +/// potential (eam.c). The LJ potential is included for comparison and +/// is a valid approximation for constant volume and uniform +/// density. The EAM potential is a more accurate model of cohesion in +/// simple metals like Copper and includes the energetics necessary to +/// model non-uniform density and free surfaces. +/// +/// Scaling Studies in CoMD {#sec_scaling_studies} +/// ======================= +/// +/// CoMD implements a simple geometric domain decomposition to divide +/// the total problem space into domains, which are owned by MPI +/// ranks. Each domain is a single-program multiple data (SPMD) +/// partition of the larger problem. +/// +/// Caution: When doing scaling studies, it is important to distinguish +/// between the problem setup phase and the problem execution phase. Both +/// are important to the workflow of doing molecular dynamics, but it +/// is the execution phase we want to quantify in the scaling studies +/// described below, for that dominates the execution time for long runs +/// (millions of time steps). The problem setup can be an appreciable fraction +/// of the execution time for short runs (the default is 100 time steps) +/// and erroneous conclusions drawn. +/// +/// This code is configured with timers. The times are reported per particle +/// and the timers for the force calculation, timestep, etc start after the +/// initialization phase is done. +/// +/// Weak Scaling {#ssec_weak_scaling} +/// ----------- +/// +/// A weak scaling test fixes the amount of work per processor and +/// compares the execution time over number of processors. Weak scaling +/// keeps the ratio of inter-processor communication (surface) to +/// intra-processor work (volume) fixed. The amount of inter-processor +/// work scales with the number of processors in the domain and O(1000) +/// atoms per domain are needed for reasonable performance. +/// +/// Examples, +/// +/// - Increase in processor count by 8:
+/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=40) +/// +/// - Increase in processor count by 2:
+/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=20, nz=40) +/// +/// In general, it is wise to keep the ratio of processor count to +/// system size in each direction fixed (i.e. cubic domains): xproc_0 / nx_0 = xproc_1 / +/// nx_1, since this minimizes surface area to volume. +/// Feel free to experiment, you might learn something about +/// algorithms to optimize communication relative to work. +/// +/// Strong Scaling {#ssec_strong_scaling} +/// --------------- +/// +/// A strong scaling test fixes the total problem size and compares the +/// execution time for different numbers of processors. Strong scaling +/// increases the ratio of inter-processor communication (surface) to +/// intra-processor work (volume). +/// +/// Examples, +/// +/// - Increase in processor count by 8:
+/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=20) +/// +/// - Increase in processor count by 2:
+/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=nz=20) +/// +/// The domain decomposition requires O(1000) atoms per domain and +/// begins to scale poorly for small numbers of atoms per domain. +/// Again, feel free to experiment, you might learn something here as +/// well. For example, when molecular dynamics codes were written for +/// vector supercomputers, large lists of force pairs were created for +/// the vector processor. These force lists provide a natural force +/// decomposition for early parallel computers (Fast Parallel Algorithms +/// for Short-Range Molecular Dynamics, S. J. Plimpton, J Comp Phys, +/// 117, 1-19 (1995).) Using replicated data, force decomposition can +/// scale to fewer than one atom per processor and is a natural +/// mechanism to exploit intra-processor parallelism. +/// +/// For further details see for example: +/// https://support.scinet.utoronto.ca/wiki/index.php/Introduction_To_Performance + + +// -------------------------------------------------------------- + + +/// \page pg_verifying_correctness Verifying Correctness +/// +/// Verifying the correctness of an MD simulation is challenging. +/// Because MD is Lyapunov unstable, any small errors, even harmless +/// round-off errors, will lead to a long-term divergence in the atom +/// trajectories. Hence, comparing atom positions at the end of a run +/// is not always a useful verification technique. (Such divergences +/// are not a problem for science applications of MD since they do not +/// alter the statistical physics.) Small, single-particle errors can +/// also be difficult to detect in system-wide quantities such as the +/// kinetic or potential energy that are averaged over a large number of +/// particles. +/// +/// In spite of these challenges, there are several methods which are +/// likely to catch significant errors. +/// +/// Cohesive Energy {#sec_ver_cohesive_energy} +/// =============== +/// +/// With a perfect lattice as the initial structure (this is the +/// default), the potential energy per atom is the cohesive energy. +/// This value should be computed correctly to many decimal places. Any +/// variation beyond the last 1 or 2 decimal places is cause for +/// investigation. The correct values for the cohesive energy are +/// +/// | Potential | Cohesive Energy | +/// | :------------- | :-------------- | +/// | Lennard-Jones | -1.243619295058 | +/// | EAM (Adams) | -3.538079224691 | +/// | EAM (Mishin) | -3.539999969176 | +/// +/// The \link sec_command_line_options command +/// line options \endlink documentation explains the switches used to +/// select the potential used in the simulation. +/// +/// Note that the cohesive energy calculation is not sensitive to errors +/// in forces. It is also performed on a highly symmetric structure so +/// there are many errors this will not catch. Still, it is a good +/// first check. +/// +/// Energy Conservation {#sec_ver_energy_conservation} +/// =================== +/// +/// A correctly implemented force kernel, with an appropriate time step +/// (the default value of 1 fs is conservative for temperatures under +/// 10,000K) will conserve total energy over long times to 5 or more +/// digits. Any long term systematic drift in the total energy is a +/// cause for concern. +/// +/// To facilitate checking energy conservation CoMD prints the final and +/// initial values of the total energy. When comparing these values, pay +/// careful attention to these details: +/// +/// - It is common to observe an initial transient change in the total +/// energy. Differences in the total energy of 2-3% can be expected in +/// the first 10-100 time steps. +/// - The best way to check energy conservation is to run at least +/// several thousand steps and look at the slope of the total energy +/// ignoring at least the first one or two thousand steps. More steps +/// are even better. +/// - Set the temperature to at least several hundred K. This ensures +/// that atoms will sample a large range of configurations and expose +/// possible errors. +/// - Fluctuations in the energy can make it difficult to tell if +/// conservation is observed. Increasing the number of atoms will reduce +/// the fluctuations. +/// +/// +/// Particle Conservation {#sec_ver_particle_conservation} +/// ===================== +/// +/// The simulation should always end with the same number of particles +/// it started with. Any change is a bug. CoMD checks the initial and +/// final number of particles and prints a warning at the end of the +/// simulation if they are not equal. +/// +/// Reproducibility {#sec_ver_reproducibility} +/// =============== +/// +/// The same simulation run repeatedly on the same hardware should +/// produce the same result. Because parallel computing can add +/// elements of non-determinism we do not expect perfect long term +/// reproducibility, however over a few hundred to a few thousand time +/// steps the energies should not exhibit run-to-run differences outside +/// the last 1 or 2 decimal places. Larger differences are a sign of +/// trouble and should be investigated. This kind of test is +/// practically the only way to detect race conditions in shared memory +/// parallelism. +/// +/// Portability {#sec_ver_portability} +/// =========== +/// +/// In our experience, simulations that start from the same initial +/// condition tend to produce very similar trajectories over short terms +/// (100 to 1000 time step), even on different hardware platforms. +/// Short term differences beyond the last 1 or 2 decimal places should +/// likely be investigated. +/// +/// General Principles {#sec_ver_general} +/// ======================= +/// +/// - Simulations run at 0K are too trivial for verification, set +/// the initial temperature to at least several hundred K. +/// - Longer runs are better to check conservation. Compare +/// energies after initial transients are damped out. +/// - Larger runs are better to check conservation. Fluctuations in the +/// energy are averaged out. +/// - Short term (order 100 time steps) discrepancies from run-to-run +/// or platform-to platform beyond the last one or two decimal places +/// are reason for concern. Differences in 4th or 5th decimal place +/// is almost certainly a bug. +/// - Contact the CoMD developers (exmatex-comd@llnl.gov) if you have +/// questions about validation. +/// + +// -------------------------------------------------------------- + + +/// \page pg_comd_architecture CoMD Architecture +/// +/// Program Flow {#sec_program_flow} +/// ============ +/// +/// We have attempted to make the program flow in CoMD 1.1 as simple and +/// transparent as possible. The main program consists of three blocks: +/// prolog, main loop, and epilog. +/// +/// Prolog {#ssec_flow_prolog} +/// ------- +/// +/// The job of the prolog is to initialize the simulation and prepare +/// for the main loop. Notable tasks in the prolog include calling +/// - initParallel() to start MPI +/// - parseCommandLine() to read the command line options +/// - initSimulation() to initialize the main data structure, SimFlatSt. +/// This includes tasks such as +/// - initEamPot() to read tabular data for the potential function +/// - initDecomposition() to set up the domain decomposition +/// - createFccLattice() to generate an initial structure for the atoms +/// - initValidate() to store initial data for a simple validation check +/// +/// In CoMD 1.1 all atomic structures are internally generated so +/// there is no need to read large files with atom coordinate data. +/// +/// Main Loop {#ssec_flow_main_loop} +/// --------- +/// +/// The main loop calls +/// - timestep(), the integrator to update particle positions, +/// - printThings() to periodically prints simulation information +/// +/// The timestep() function is the heart of the code as it choreographs +/// updating the particle positions, along with computing forces +/// (computeForce()) and communicating atoms between ranks +/// (redistributeAtoms()). +/// +/// Epilog {#ssec_flow_epilog} +/// ------- +/// +/// The epilog code handles end of run bookkeeping such as +/// - validateResult() to check validation +/// - printPerformanceResults() to print a performance summary +/// - destroySimulation() to free memory +/// +/// Key Data Structures {#sec_key_data_structures} +/// ================== +/// +/// Practically all data in CoMD belongs to the SimFlatSt structure. +/// This includes: +/// - BasePotentialSt A polymorphic structure for the potential model +/// - HaloExchangeSt A polymorphic strcuture for communication halo data +/// - DomainSt The parallel domain decomposition +/// - LinkCellSt The link cells +/// - AtomsSt The atom coordinates and velocities +/// - SpeciesDataSt Properties of the atomic species being simulated. +/// +/// Consult the individual pages for each of these structures to learn +/// more. The descriptions in haloExchange.c and initLinkCells() are +/// especially useful to understand how the atoms are commuicated +/// between tasks and stored in link cells for fast pair finding. + +// -------------------------------------------------------------- + + +/// \page pg_optimization_targets Optimization Targets +/// +/// Computation {#sec_computation} +/// ============ +/// +/// The computational effort of classical MD is usually highly focused +/// in the force kernel. The two force kernels supplied by CoMD are +/// eamForce() and ljForce(). Both kernels are fundamentally loops over +/// pairs of atoms with significant opportunity to exploit high levels +/// of concurrency. One potential challenge when reordering or +/// parallelizing the pair loop structure is preventing race conditions +/// that result if two concurrent pair evaluations try to simultaneously +/// increment the forces and energies on the same atom. +/// +/// The supplied EAM kernel uses interpolation from tabular data to +/// evaluate functions. Hence the interpolate() function is another +/// potential optimization target. Note that the two potential files +/// distributed with CoMD have very different sizes. The Adams +/// potential (Cu_u6.eam) has 500 points per function in the table while +/// the Mishin potential (Cu01.eam.alloy) has 10,000 points per +/// function. This difference could potentially impact important +/// details such as cache miss rates. +/// +/// Communication {#sec_communication} +/// ============= +/// +/// As the number of atoms per MPI rank decreases, the communication +/// routines will start to require a significant fraction of the +/// run time. The main communication routine in CoMD is haloExchange(). +/// The halo exchange is simple nearest neighbor, point-to-point +/// communication so it should scale well to practically any number of +/// nodes. +/// +/// The halo exchange in CoMD 1.1 is a very simple 6-direction +/// structured halo exchange (see haloExchange.c). Other exchange +/// algorithms can be implemented without much difficulty. +/// +/// The halo exchange function is called in two very different contexts. +/// The main usage is to exchange halo particle information (see +/// initAtomHaloExchange()). This process is coordinated by the +/// redistributeAtoms() function. +/// +/// In addition to the atom exchange, when using the EAM potential, a +/// halo exchange is performed in the force routine (see +/// initForceHaloExchange()). + + +// -------------------------------------------------------------- + + +/// \page pg_whats_new New Features and Changes in CoMD 1.1 +/// +/// The main goals of the 1.1 release were to add support for MPI and to +/// improve the structure and clarity of the code. Achieving these +/// goals required considerable changes compared to the 1.0 release. +/// However, the core structure of the most computationally intensive +/// kernels (the force routines) is mostly unchanged. We believe that +/// lessons learned from optimizing 1.0 force kernels to specific +/// hardware or programming models can be quickly transferred to kernels +/// in the 1.1 release. +/// +/// Significant changes in CoMD 1.1 include: +/// +/// - MPI support. Both MPI and single node serial executables can be +/// built from the same source files. +/// +/// - Improved modularity and code clarity. Major data structures are +/// now organized with their own structs and initialization routines. +/// +/// - The build system has been simplified to use only standard +/// Makefiles instead of CMake. +/// +/// - The halo exchange operation needed to communicate remote particle +/// data between MPI ranks also creates "image" particles in the +/// serial build. +/// +/// - Unified force kernels for both serial and MPI builds +/// +/// - The addition of remote/image atoms allows periodic boundary +/// conditions to be handled outside the force loop. +/// +/// - An additional communication/data copy step to handle electron +/// density on remote/image atoms has been added to the EAM force +/// loop. +/// +/// - The coordinate system has been simplified to a single global +/// coordinate system for all particles. +/// +/// - Evaluation of energies and forces using a Chebyshev polynomial +/// fits has been removed. Polynomial approximation of energies and +/// forces will return in a future CoMD version. +/// +/// - Atomic structures are now generated internally, eliminating the +/// requirement to read, write, and distribute large atom +/// configuration files. Arbitrarily large initial structures can +/// be generated with specified initial temperature and random +/// displacements from lattice positions. Code to read/write atomic +/// positions has been removed. +/// +/// - EAM potentials are now read from standard funcfl and setfl format +/// files. Voter style files are no longer supported. +/// +/// - Collection of performance metrics is significantly improved. +/// Developers can easily add new timers to regions of interest. The +/// system is also designed to allow easy integration with platform +/// specific API's to high resolution timers, cycle counters, +/// hardware counters, etc. +/// +/// +/// - Hooks to in-situ analysis and visualization have been removed. +/// In-situ analysis capabilities will return in a future CoMD release. +/// +/// Please contact the CoMD developers (exematex-comd@llnl.gov) if +/// any of the deleted features negative impacts your work. We +/// may be able to help produce a custom version that includes the code +/// you need. + + +// -------------------------------------------------------------- + + +/// \page pg_md_basics MD Basics +/// +/// The molecular dynamics (MD) computer simulation method is a well +/// established and important tool for the study of the dynamical +/// properties of liquids, solids, and other systems of interest in +/// Materials Science and Engineering, Chemistry and Biology. A material +/// is represented in terms of atoms and molecules. The method of MD +/// simulation involves the evaluation of the force acting on each atom +/// due to all other atoms in the system and the numerical integration +/// of the Newtonian equations of motion. Though MD was initially +/// developed to compute the equilibrium thermodynamic behavior of +/// materials (equation of state), most recent applications have used MD +/// to study non-equilibrium processes. +/// +/// Wikipeda offers a basic introduction to molecular dynamics with +/// many references: +/// +/// http://en.wikipedia.org/wiki/Molecular_dynamics +/// +/// For a thorough treatment of MD methods, see: +/// - "Computer simulation of liquids" by M.P. Allen and D.J. Tildesley +/// (Oxford, 1989) +/// ISBN-10: 0198556454 | ISBN-13: 978-0198556459. +/// +/// For an understanding of MD simulations and application to statistical mechanics: +/// - "Understanding Molecular Simulation, Second Edition: From Algorithms +/// to Applications," by D. Frenkel and B. Smit (Academic Press, 2001) +/// ISBN-10: 0122673514 | ISBN-13: 978-0122673511 +/// - "Statistical and Thermal Physics: With Computer Applications," by +/// H. Gould and J. Tobochnik (Princeton, 2010) +/// ISBN-10: 0691137447 | ISBN-13: 978-0691137445 +/// +/// CoMD implements both the Lennard-Jones Potential (ljForce.c) and the +/// Embedded Atom Method Potential (eam.c). +/// Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.reference_output =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.reference_output +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.reference_output @@ -0,0 +1,97 @@ +: Starting Initialization + + +Mini-Application Name : CoMD-serial +Mini-Application Version : 1.1 +Platform: + hostname: test-suite + kernel name: 'test-suite' + kernel release: 'test-suite' + processor: 'test-suite' +Build: + CC: 'test-suite' + compiler version: 'test-suite' + CFLAGS: '-std=c99 -DDOUBLE' + LDFLAGS: '-lm' + using MPI: false + Threading: none + Double Precision: true +Run Date/Time: + +Command Line Parameters: + doeam: 1 + potDir: . + potName: Cu_u6.eam + potType: funcfl + nx: 10 + ny: 10 + nz: 10 + xproc: 1 + yproc: 1 + zproc: 1 + Lattice constant: -1 Angstroms + nSteps: 100 + printRate: 10 + Time step: 1 fs + Initial Temperature: 600 K + Initial Delta: 0 Angstroms + +Simulation data: + Total atoms : 4000 + Min global bounds : [ 0.0000000000, 0.0000000000, 0.0000000000 ] + Max global bounds : [ 36.1500000000, 36.1500000000, 36.1500000000 ] + +Decomposition data: + Processors : 1, 1, 1 + Local boxes : 7, 7, 7 = 343 + Box size : [ 5.1642857143, 5.1642857143, 5.1642857143 ] + Box factor : [ 1.0432900433, 1.0432900433, 1.0432900433 ] + Max Link Cell Occupancy: 14 of 64 + +Potential data: + Potential type : EAM + Species name : Cu + Atomic number : 29 + Mass : 63.55 amu + Lattice type : FCC + Lattice spacing : 3.615 Angstroms + Cutoff : 4.95 Angstroms + +Memory data: + Intrinsic atom footprint = 88 B/atom + Total atom footprint = 0.336 MB ( 0.34 MB/node) + Link cell atom footprint = 1.842 MB/node + Link cell atom footprint = 3.916 MB/node (including halo cell data + + +Initial energy : -3.460523233088, atom count : 4000 + +: Initialization Finished + +: Starting simulation + +# Performance +# Loop Time(fs) Total Energy Potential Energy Kinetic Energy Temperature (us/atom) # Atoms + 0 0.00 -3.460523233088 -3.538079224688 0.077555991600 600.0000 0.0000 4000 + 10 10.00 -3.460522591854 -3.529960109661 0.069437517806 537.1927 0.0000 4000 + 20 20.00 -3.460524196370 -3.509840489408 0.049316293038 381.5279 0.0000 4000 + 30 30.00 -3.460527804962 -3.488665662974 0.028137858012 217.6842 0.0000 4000 + 40 40.00 -3.460532043042 -3.477611644844 0.017079601802 132.1337 0.0000 4000 + 50 50.00 -3.460536332237 -3.479795957699 0.019259625462 148.9991 0.0000 4000 + 60 60.00 -3.460538324455 -3.488980357513 0.028442033059 220.0374 0.0000 4000 + 70 70.00 -3.460537014134 -3.496718290600 0.036181276467 279.9109 0.0000 4000 + 80 80.00 -3.460534287349 -3.499004390616 0.038470103268 297.6180 0.0000 4000 + 90 90.00 -3.460531717488 -3.497316540132 0.036784822643 284.5801 0.0000 4000 + 100 100.00 -3.460530357326 -3.495731031242 0.035200673916 272.3246 0.0000 4000 +: Ending simulation + + + +Simulation Validation: + Initial energy : -3.460523233088 + Final energy : -3.460530357326 + eFinal/eInitial : 1.000002 + Final atom count : 4000, no atoms lost +: CoMD Ending + +exit 0 Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMDTypes.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMDTypes.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMDTypes.h @@ -0,0 +1,85 @@ +/// \file +/// CoMD data structures. + +#ifndef __COMDTYPES_H_ +#define __COMDTYPES_H_ + +#include +#include "mytype.h" +#include "haloExchange.h" +#include "linkCells.h" +#include "decomposition.h" +#include "initAtoms.h" + +struct SimFlatSt; + +/// The base struct from which all potentials derive. Think of this as an +/// abstract base class. +/// +/// CoMD uses the following units: +/// - distance is in Angstroms +/// - energy is in eV +/// - time is in fs +/// - force in in eV/Angstrom +/// +/// The choice of distance, energy, and time units means that the unit +/// of mass is eV*fs^2/Angstrom^2. Hence, we must convert masses that +/// are input in AMU (atomic mass units) into internal mass units. +typedef struct BasePotentialSt +{ + real_t cutoff; //!< potential cutoff distance in Angstroms + real_t mass; //!< mass of atoms in intenal units + real_t lat; //!< lattice spacing (angs) of unit cell + char latticeType[8]; //!< lattice type, e.g. FCC, BCC, etc. + char name[3]; //!< element name + int atomicNo; //!< atomic number + int (*force)(struct SimFlatSt* s); //!< function pointer to force routine + void (*print)(FILE* file, struct BasePotentialSt* pot); + void (*destroy)(struct BasePotentialSt** pot); //!< destruction of the potential +} BasePotential; + + +/// species data: chosen to match the data found in the setfl/funcfl files +typedef struct SpeciesDataSt +{ + char name[3]; //!< element name + int atomicNo; //!< atomic number + real_t mass; //!< mass in internal units +} SpeciesData; + +/// Simple struct to store the initial energy and number of atoms. +/// Used to check energy conservation and atom conservation. +typedef struct ValidateSt +{ + double eTot0; // +#include +#include + +#include "mytype.h" +#include "memUtils.h" + +#define nextOption(o) ((MyOption*) o->next) + +typedef struct MyOptionSt +{ + char* help; + char* longArg; + unsigned char shortArg[2]; + int argFlag; + char type; + int sz; + void* ptr; + void* next; +} MyOption; + +static int longest = 1; +static MyOption* myargs=NULL; + +static char* dupString(const char* s) +{ + char* d; + if ( ! s ) s = ""; + d = (char*)comdCalloc((strlen(s)+1),sizeof(char)); + strcpy(d, s); + return d; +} + +static MyOption* myOptionAlloc( + const char* longOption, const char shortOption, + int has_arg, const char type, void* dataPtr, int dataSize, const char* help) +{ + static int iBase=129; + MyOption* o = (MyOption*)comdCalloc(1, sizeof(MyOption)); + o->help = dupString(help); + o->longArg = dupString(longOption); + if(shortOption) o->shortArg[0] = (unsigned char)shortOption; + else + { + o->shortArg[0] = iBase; + iBase++; + } + o->argFlag = has_arg; + o->type = type; + o->ptr = dataPtr; + o->sz = dataSize; + if(longOption) longest = (longest>strlen(longOption)?longest:strlen(longOption)); + return o; +} + +static MyOption* myOptionFree(MyOption* o) +{ + MyOption* r; + if(!o) return NULL; + r = nextOption(o); + if(o->longArg)free(o->longArg); + if(o->help)free(o->help); + free(o); + return r; +} + +static MyOption* lastOption(MyOption* o) +{ + if ( ! o) return o; + while(nextOption(o)) o = nextOption(o); + return o; +} + +static MyOption* findOption(MyOption* o, unsigned char shortArg) +{ + while(o) + { + if (o->shortArg[0] == shortArg) return o; + o = nextOption(o); + } + return o; +} + + +int addArg(const char* longOption, const char shortOption, + int has_arg, const char type, void* dataPtr, int dataSize, + const char* help) +{ + MyOption* o; + MyOption* p; + o = myOptionAlloc(longOption,shortOption,has_arg,type,dataPtr,dataSize, help); + if ( ! o ) return 1; + if ( ! myargs) myargs = o; + else + { + p = lastOption(myargs); + p->next = (void *)o; + } + return 0; +} + + +void freeArgs() +{ + while(myargs) + { + myargs = myOptionFree(myargs); + } + return; +} + +void printArgs() +{ + MyOption* o = myargs; + char s[4096]; + unsigned char *shortArg; + fprintf(screenOut,"\n" + " Arguments are: \n"); + sprintf(s," --%%-%ds",longest); + while(o) + { + if(o->shortArg[0]<0xFF) shortArg = o->shortArg; + else shortArg = (unsigned char *) "---"; + fprintf(screenOut,s,o->longArg); + fprintf(screenOut," -%c arg=%1d type=%c %s\n",shortArg[0],o->argFlag,o->type,o->help); + o = nextOption(o); + + } + fprintf(screenOut,"\n\n"); + return; +} + +void processArgs(int argc, char** argv) +{ + MyOption* o; + int n=0; + int i; + struct option* opts; + char* sArgs; + int c; + + if ( ! myargs) return; + o = myargs; + while(o) + {n++,o=nextOption(o);} + + o = myargs; + sArgs= (char*)comdCalloc(2*(n+2),sizeof(char)); + opts = (struct option*)comdCalloc(n,sizeof(struct option)); + for (i=0; ilongArg; + opts[i].has_arg = o->argFlag; + opts[i].flag = 0; + opts[i].val = o->shortArg[0]; + + strcat(sArgs,(char*) o->shortArg); + if(o->argFlag) strcat(sArgs,":"); + o = nextOption(o); + } + + while(1) + { + + int option_index = 0; + + c = getopt_long (argc, argv, sArgs, opts, &option_index); + if ( c == -1) break; + o = findOption(myargs,c); + if ( ! o ) + { + fprintf(screenOut,"\n\n" + " invalid switch : -%c in getopt()\n" + "\n\n", + c); + break; + } + if(! o->argFlag) + { + int* i = (int*)o->ptr; + *i = 1; + } + else + { + switch(o->type) + { + case 'i': + sscanf(optarg,"%d",(int*)o->ptr); + break; + case 'f': + sscanf(optarg,"%f",(float*)o->ptr); + break; + case 'd': + sscanf(optarg,"%lf",(double*)o->ptr); + break; + case 's': + strncpy((char*)o->ptr,(char*)optarg,o->sz); + ((char*)o->ptr)[o->sz-1] = '\0'; + break; + case 'c': + sscanf(optarg,"%c",(char*)o->ptr); + break; + default: + fprintf(screenOut,"\n\n" + " invalid type : %c in getopt()\n" + " valid values are 'e', 'z'. 'i','d','f','s', and 'c'\n" + "\n\n", + c); + } + } + } + + free(opts); + free(sArgs); + + return; +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/constants.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/constants.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/constants.h @@ -0,0 +1,41 @@ +/// \file +/// Contains constants for unit conversions. +/// +/// The units for this code are: +/// - Time in femtoseconds (fs) +/// - Length in Angstroms (Angs) +/// - Energy in electron Volts (eV) +/// - Mass read in as Atomic Mass Units (amu) and then converted for +/// consistency (energy*time^2/length^2) +/// Values are taken from NIST, http://physics.nist.gov/cuu/Constants/ + +#ifndef _CONSTANTS_H_ +#define _CONSTANTS_H_ + +/// 1 amu in kilograms +#define amuInKilograms 1.660538921e-27 + +/// 1 fs in seconds +#define fsInSeconds 1.0e-15 + +/// 1 Ang in meters +#define AngsInMeters 1.0e-10 + +/// 1 eV in Joules +#define eVInJoules 1.602176565e-19 + +/// Internal mass units are eV * fs^2 / Ang^2 +static const double amuToInternalMass = + amuInKilograms * AngsInMeters * AngsInMeters + / (fsInSeconds * fsInSeconds * eVInJoules); + +/// Boltmann constant in eV's +static const double kB_eV = 8.6173324e-5; // eV/K + +/// Hartrees to eVs +static const double hartreeToEv = 27.21138505; + +/// Bohrs to Angstroms +static const double bohrToAngs = 0.52917721092; + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.h @@ -0,0 +1,33 @@ +/// \file +/// Parallel domain decomposition. + +#ifndef _GEOMETRY_H_ +#define _GEOMETRY_H_ + +#include "mytype.h" + +/// Domain decomposition information. +typedef struct DomainSt +{ + // process-layout data + int procGrid[3]; //!< number of processors in each dimension + int procCoord[3]; //!< i,j,k for this processor + + // global bounds data + real3 globalMin; //!< minimum global coordinate (angstroms) + real3 globalMax; //!< maximum global coordinate (angstroms) + real3 globalExtent; //!< global size: globalMax - globalMin + + // local bounds data + real3 localMin; //!< minimum coordinate on local processor + real3 localMax; //!< maximum coordinate on local processor + real3 localExtent; //!< localMax - localMin +} Domain; + +struct DomainSt* initDecomposition(int xproc, int yproc, int zproc, + real3 globalExtent); + +/// Find the MPI rank of a neighbor domain from a relative coordinate. +int processorNum(Domain* domain, int dix, int diy, int dik); + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/decomposition.c @@ -0,0 +1,66 @@ +/// \file +/// Parallel domain decomposition. This version of CoMD uses a simple +/// spatial Cartesian domain decomposition. The simulation box is +/// divided into equal size bricks by a grid that is xproc by yproc by +/// zproc in size. + +#include "decomposition.h" + +#include + +#include "memUtils.h" +#include "parallel.h" + +/// \param [in] xproc x-size of domain decomposition grid. +/// \param [in] yproc y-size of domain decomposition grid. +/// \param [in] zproc z-size of domain decomposition grid. +/// \param [in] globalExtent Size of the simulation domain (in Angstroms). +Domain* initDecomposition(int xproc, int yproc, int zproc, real3 globalExtent) +{ + assert( xproc * yproc * zproc == getNRanks()); + + Domain* dd = comdMalloc(sizeof(Domain)); + dd->procGrid[0] = xproc; + dd->procGrid[1] = yproc; + dd->procGrid[2] = zproc; + // calculate grid coordinates i,j,k for this processor + int myRank = getMyRank(); + dd->procCoord[0] = myRank % dd->procGrid[0]; + myRank /= dd->procGrid[0]; + dd->procCoord[1] = myRank % dd->procGrid[1]; + dd->procCoord[2] = myRank / dd->procGrid[1]; + + // initialialize global bounds + for (int i = 0; i < 3; i++) + { + dd->globalMin[i] = 0; + dd->globalMax[i] = globalExtent[i]; + dd->globalExtent[i] = dd->globalMax[i] - dd->globalMin[i]; + } + + // initialize local bounds on this processor + for (int i = 0; i < 3; i++) + { + dd->localExtent[i] = dd->globalExtent[i] / dd->procGrid[i]; + dd->localMin[i] = dd->globalMin[i] + dd->procCoord[i] * dd->localExtent[i]; + dd->localMax[i] = dd->globalMin[i] + (dd->procCoord[i]+1) * dd->localExtent[i]; + } + + return dd; +} + +/// \details +/// Calculates the rank of the processor with grid coordinates +/// (ix+dix, iy+diy, iz+diz) where (ix, iy, iz) are the grid coordinates +/// of the local rank. Assumes periodic boundary conditions. The +/// deltas cannot be smaller than -procGrid[ii]. +int processorNum(Domain* domain, int dix, int diy, int diz) +{ + const int* procCoord = domain->procCoord; // alias + const int* procGrid = domain->procGrid; // alias + int ix = (procCoord[0] + dix + procGrid[0]) % procGrid[0]; + int iy = (procCoord[1] + diy + procGrid[1]) % procGrid[1]; + int iz = (procCoord[2] + diz + procGrid[2]) % procGrid[2]; + + return ix + procGrid[0] *(iy + procGrid[1]*iz); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/eam.h @@ -0,0 +1,23 @@ +/// \file +/// Compute forces for the Embedded Atom Model (EAM). + +#ifndef __EAM_H +#define __EAM_H + +#include "mytype.h" + +struct BasePotentialSt; +struct LinkCellSt; + +/// Pointers to the data that is needed in the load and unload functions +/// for the force halo exchange. +/// \see loadForceBuffer +/// \see unloadForceBuffer +typedef struct ForceExchangeDataSt +{ + real_t* dfEmbed; // +#include +#include +#include + +#include "constants.h" +#include "memUtils.h" +#include "parallel.h" +#include "linkCells.h" +#include "CoMDTypes.h" +#include "performanceTimers.h" +#include "haloExchange.h" + +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +/// Handles interpolation of tabular data. +/// +/// \see initInterpolationObject +/// \see interpolate +typedef struct InterpolationObjectSt +{ + int n; //!< the number of values in the table + real_t x0; //!< the starting ordinate range + real_t invDx; //!< the inverse of the table spacing + real_t* values; //!< the abscissa values +} InterpolationObject; + +/// Derived struct for an EAM potential. +/// Uses table lookups for function evaluation. +/// Polymorphic with BasePotential. +/// \see BasePotential +typedef struct EamPotentialSt +{ + real_t cutoff; //!< potential cutoff distance in Angstroms + real_t mass; //!< mass of atoms in intenal units + real_t lat; //!< lattice spacing (angs) of unit cell + char latticeType[8]; //!< lattice type, e.g. FCC, BCC, etc. + char name[3]; //!< element name + int atomicNo; //!< atomic number + int (*force)(SimFlat* s); //!< function pointer to force routine + void (*print)(FILE* file, BasePotential* pot); + void (*destroy)(BasePotential** pot); //!< destruction of the potential + InterpolationObject* phi; //!< Pair energy + InterpolationObject* rho; //!< Electron Density + InterpolationObject* f; //!< Embedding Energy + + real_t* rhobar; //!< per atom storage for rhobar + real_t* dfEmbed; //!< per atom storage for derivative of Embedding + HaloExchange* forceExchange; + ForceExchangeData* forceExchangeData; +} EamPotential; + +// EAM functionality +static int eamForce(SimFlat* s); +static void eamPrint(FILE* file, BasePotential* pot); +static void eamDestroy(BasePotential** pot); +static void eamBcastPotential(EamPotential* pot); + + +// Table interpolation functionality +static InterpolationObject* initInterpolationObject( + int n, real_t x0, real_t dx, real_t* data); +static void destroyInterpolationObject(InterpolationObject** table); +static void interpolate(InterpolationObject* table, real_t r, real_t* f, real_t* df); +static void bcastInterpolationObject(InterpolationObject** table); +static void printTableData(InterpolationObject* table, const char* fileName); + + +// Read potential tables from files. +static void eamReadSetfl(EamPotential* pot, const char* dir, const char* potName); +static void eamReadFuncfl(EamPotential* pot, const char* dir, const char* potName); +static void fileNotFound(const char* callSite, const char* filename); +static void notAlloyReady(const char* callSite); +static void typeNotSupported(const char* callSite, const char* type); + + +/// Allocate and initialize the EAM potential data structure. +/// +/// \param [in] dir The directory in which potential table files are found. +/// \param [in] file The name of the potential table file. +/// \param [in] type The file format of the potential file (setfl or funcfl). +BasePotential* initEamPot(const char* dir, const char* file, const char* type) +{ + EamPotential* pot = comdMalloc(sizeof(EamPotential)); + assert(pot); + pot->force = eamForce; + pot->print = eamPrint; + pot->destroy = eamDestroy; + pot->phi = NULL; + pot->rho = NULL; + pot->f = NULL; + + // Initialization of the next three items requires information about + // the parallel decomposition and link cells that isn't available + // with the potential is initialized. Hence, we defer their + // initialization until the first time we call the force routine. + pot->dfEmbed = NULL; + pot->rhobar = NULL; + pot->forceExchange = NULL; + + if (getMyRank() == 0) + { + if (strcmp(type, "setfl" ) == 0) + eamReadSetfl(pot, dir, file); + else if (strcmp(type,"funcfl") == 0) + eamReadFuncfl(pot, dir, file); + else + typeNotSupported("initEamPot", type); + } + eamBcastPotential(pot); + + return (BasePotential*) pot; +} + +/// Calculate potential energy and forces for the EAM potential. +/// +/// Three steps are required: +/// +/// -# Loop over all atoms and their neighbors, compute the two-body +/// interaction and the electron density at each atom +/// -# Loop over all atoms, compute the embedding energy and its +/// derivative for each atom +/// -# Loop over all atoms and their neighbors, compute the embedding +/// energy contribution to the force and add to the two-body force +/// +int eamForce(SimFlat* s) +{ + EamPotential* pot = (EamPotential*) s->pot; + assert(pot); + + // set up halo exchange and internal storage on first call to forces. + if (pot->forceExchange == NULL) + { + int maxTotalAtoms = MAXATOMS*s->boxes->nTotalBoxes; + pot->dfEmbed = comdMalloc(maxTotalAtoms*sizeof(real_t)); + pot->rhobar = comdMalloc(maxTotalAtoms*sizeof(real_t)); + pot->forceExchange = initForceHaloExchange(s->domain, s->boxes); + pot->forceExchangeData = comdMalloc(sizeof(ForceExchangeData)); + pot->forceExchangeData->dfEmbed = pot->dfEmbed; + pot->forceExchangeData->boxes = s->boxes; + } + + real_t rCut2 = pot->cutoff*pot->cutoff; + + // zero forces / energy / rho /rhoprime + real_t etot = 0.0; + memset(s->atoms->f, 0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real3)); + memset(s->atoms->U, 0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real_t)); + memset(pot->dfEmbed, 0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real_t)); + memset(pot->rhobar, 0, s->boxes->nTotalBoxes*MAXATOMS*sizeof(real_t)); + + int nbrBoxes[27]; + // loop over local boxes + for (int iBox=0; iBoxboxes->nLocalBoxes; iBox++) + { + int nIBox = s->boxes->nAtoms[iBox]; + int nNbrBoxes = getNeighborBoxes(s->boxes, iBox, nbrBoxes); + // loop over neighbor boxes of iBox (some may be halo boxes) + for (int jTmp=0; jTmpboxes->nAtoms[jBox]; + // loop over atoms in iBox + for (int iOff=MAXATOMS*iBox,ii=0; iiatoms->r[iOff][k]-s->atoms->r[jOff][k]; + r2+=dr[k]*dr[k]; + } + if(r2>rCut2) continue; + + double r = sqrt(r2); + + real_t phiTmp, dPhi, rhoTmp, dRho; + interpolate(pot->phi, r, &phiTmp, &dPhi); + interpolate(pot->rho, r, &rhoTmp, &dRho); + + for (int k=0; k<3; k++) + { + s->atoms->f[iOff][k] -= dPhi*dr[k]/r; + s->atoms->f[jOff][k] += dPhi*dr[k]/r; + } + + // update energy terms + // calculate energy contribution based on whether + // the neighbor box is local or remote + if (jBox < s->boxes->nLocalBoxes) + etot += phiTmp; + else + etot += 0.5*phiTmp; + + s->atoms->U[iOff] += 0.5*phiTmp; + s->atoms->U[jOff] += 0.5*phiTmp; + + // accumulate rhobar for each atom + pot->rhobar[iOff] += rhoTmp; + pot->rhobar[jOff] += rhoTmp; + + } // loop over atoms in jBox + } // loop over atoms in iBox + } // loop over neighbor boxes + } // loop over local boxes + + // Compute Embedding Energy + // loop over all local boxes + for (int iBox=0; iBoxboxes->nLocalBoxes; iBox++) + { + int iOff; + int nIBox = s->boxes->nAtoms[iBox]; + + // loop over atoms in iBox + for (int iOff=MAXATOMS*iBox,ii=0; iif, pot->rhobar[iOff], &fEmbed, &dfEmbed); + pot->dfEmbed[iOff] = dfEmbed; // save derivative for halo exchange + etot += fEmbed; + s->atoms->U[iOff] += fEmbed; + } + } + + // exchange derivative of the embedding energy with repsect to rhobar + startTimer(eamHaloTimer); + haloExchange(pot->forceExchange, pot->forceExchangeData); + stopTimer(eamHaloTimer); + + // third pass + // loop over local boxes + for (int iBox=0; iBoxboxes->nLocalBoxes; iBox++) + { + int nIBox = s->boxes->nAtoms[iBox]; + int nNbrBoxes = getNeighborBoxes(s->boxes, iBox, nbrBoxes); + // loop over neighbor boxes of iBox (some may be halo boxes) + for (int jTmp=0; jTmpboxes->nAtoms[jBox]; + // loop over atoms in iBox + for (int iOff=MAXATOMS*iBox,ii=0; iiatoms->r[iOff][k]-s->atoms->r[jOff][k]; + r2+=dr[k]*dr[k]; + } + if(r2>=rCut2) continue; + + real_t r = sqrt(r2); + + real_t rhoTmp, dRho; + interpolate(pot->rho, r, &rhoTmp, &dRho); + + for (int k=0; k<3; k++) + { + s->atoms->f[iOff][k] -= (pot->dfEmbed[iOff]+pot->dfEmbed[jOff])*dRho*dr[k]/r; + s->atoms->f[jOff][k] += (pot->dfEmbed[iOff]+pot->dfEmbed[jOff])*dRho*dr[k]/r; + } + + } // loop over atoms in jBox + } // loop over atoms in iBox + } // loop over neighbor boxes + } // loop over local boxes + + s->ePotential = (real_t) etot; + + return 0; +} + +void eamPrint(FILE* file, BasePotential* pot) +{ + EamPotential *eamPot = (EamPotential*) pot; + fprintf(file, " Potential type : EAM\n"); + fprintf(file, " Species name : %s\n", eamPot->name); + fprintf(file, " Atomic number : %d\n", eamPot->atomicNo); + fprintf(file, " Mass : "FMT1" amu\n", eamPot->mass/amuToInternalMass); // print in amu + fprintf(file, " Lattice type : %s\n", eamPot->latticeType); + fprintf(file, " Lattice spacing : "FMT1" Angstroms\n", eamPot->lat); + fprintf(file, " Cutoff : "FMT1" Angstroms\n", eamPot->cutoff); +} + +void eamDestroy(BasePotential** pPot) +{ + if ( ! pPot ) return; + EamPotential* pot = *(EamPotential**)pPot; + if ( ! pot ) return; + destroyInterpolationObject(&(pot->phi)); + destroyInterpolationObject(&(pot->rho)); + destroyInterpolationObject(&(pot->f)); + destroyHaloExchange(&(pot->forceExchange)); + comdFree(pot); + *pPot = NULL; + + return; +} + +/// Broadcasts an EamPotential from rank 0 to all other ranks. +/// If the table coefficients are read from a file only rank 0 does the +/// read. Hence we need to broadcast the potential to all other ranks. +void eamBcastPotential(EamPotential* pot) +{ + assert(pot); + + struct + { + real_t cutoff, mass, lat; + char latticeType[8]; + char name[3]; + int atomicNo; + } buf; + if (getMyRank() == 0) + { + buf.cutoff = pot->cutoff; + buf.mass = pot->mass; + buf.lat = pot->lat; + buf.atomicNo = pot->atomicNo; + strcpy(buf.latticeType, pot->latticeType); + strcpy(buf.name, pot->name); + } + bcastParallel(&buf, sizeof(buf), 0); + pot->cutoff = buf.cutoff; + pot->mass = buf.mass; + pot->lat = buf.lat; + pot->atomicNo = buf.atomicNo; + strcpy(pot->latticeType, buf.latticeType); + strcpy(pot->name, buf.name); + + bcastInterpolationObject(&pot->phi); + bcastInterpolationObject(&pot->rho); + bcastInterpolationObject(&pot->f); +} + +/// Builds a structure to store interpolation data for a tabular +/// function. Interpolation must be supported on the range +/// \f$[x_0, x_n]\f$, where \f$x_n = n*dx\f$. +/// +/// \see interpolate +/// \see bcastInterpolationObject +/// \see destroyInterpolationObject +/// +/// \param [in] n number of values in the table. +/// \param [in] x0 minimum ordinate value of the table. +/// \param [in] dx spacing of the ordinate values. +/// \param [in] data abscissa values. An array of size n. +InterpolationObject* initInterpolationObject( + int n, real_t x0, real_t dx, real_t* data) +{ + InterpolationObject* table = + (InterpolationObject *)comdMalloc(sizeof(InterpolationObject)) ; + assert(table); + + table->values = (real_t*)comdCalloc(1, (n+3)*sizeof(real_t)); + assert(table->values); + + table->values++; + table->n = n; + table->invDx = 1.0/dx; + table->x0 = x0; + + for (int ii=0; iivalues[ii] = data[ii]; + + table->values[-1] = table->values[0]; + table->values[n+1] = table->values[n] = table->values[n-1]; + + return table; +} + +void destroyInterpolationObject(InterpolationObject** a) +{ + if ( ! a ) return; + if ( ! *a ) return; + if ( (*a)->values) + { + (*a)->values--; + comdFree((*a)->values); + } + comdFree(*a); + *a = NULL; + + return; +} + +/// Interpolate a table to determine f(r) and its derivative f'(r). +/// +/// The forces on the particle are much more sensitive to the derivative +/// of the potential than on the potential itself. It is therefore +/// absolutely essential that the interpolated derivatives are smooth +/// and continuous. This function uses simple quadratic interpolation +/// to find f(r). Since quadric interpolants don't have smooth +/// derivatives, f'(r) is computed using a 4 point finite difference +/// stencil. +/// +/// Interpolation is used heavily by the EAM force routine so this +/// function is a potential performance hot spot. Feel free to +/// reimplement this function (and initInterpolationObject if necessay) +/// with any higher performing implementation of interpolation, as long +/// as the alternate implmentation that has the required smoothness +/// properties. Cubic splines are one common alternate choice. +/// +/// \param [in] table Interpolation table. +/// \param [in] r Point where function value is needed. +/// \param [out] f The interpolated value of f(r). +/// \param [out] df The interpolated value of df(r)/dr. +void interpolate(InterpolationObject* table, real_t r, real_t* f, real_t* df) +{ + const real_t* tt = table->values; // alias + + if ( r < table->x0 ) r = table->x0; + + r = (r-table->x0)*(table->invDx) ; + int ii = (int)floor(r); + if (ii > table->n) + { + ii = table->n; + r = table->n / table->invDx; + } + // reset r to fractional distance + r = r - floor(r); + + real_t g1 = tt[ii+1] - tt[ii-1]; + real_t g2 = tt[ii+2] - tt[ii]; + + *f = tt[ii] + 0.5*r*(g1 + r*(tt[ii+1] + tt[ii-1] - 2.0*tt[ii]) ); + + *df = 0.5*(g1 + r*(g2-g1))*table->invDx; +} + +/// Broadcasts an InterpolationObject from rank 0 to all other ranks. +/// +/// It is commonly the case that the data needed to create the +/// interpolation table is available on only one task (for example, only +/// one task has read the data from a file). Broadcasting the table +/// eliminates the need to put broadcast code in multiple table readers. +/// +/// \see eamBcastPotential +void bcastInterpolationObject(InterpolationObject** table) +{ + struct + { + int n; + real_t x0, invDx; + } buf; + + if (getMyRank() == 0) + { + buf.n = (*table)->n; + buf.x0 = (*table)->x0; + buf.invDx = (*table)->invDx; + } + bcastParallel(&buf, sizeof(buf), 0); + + if (getMyRank() != 0) + { + assert(*table == NULL); + *table = comdMalloc(sizeof(InterpolationObject)); + (*table)->n = buf.n; + (*table)->x0 = buf.x0; + (*table)->invDx = buf.invDx; + (*table)->values = comdMalloc(sizeof(real_t) * (buf.n+3) ); + (*table)->values++; + } + + int valuesSize = sizeof(real_t) * ((*table)->n+3); + bcastParallel((*table)->values-1, valuesSize, 0); +} + +void printTableData(InterpolationObject* table, const char* fileName) +{ + if (!printRank()) return; + + FILE* potData; + potData = fopen(fileName,"w"); + real_t dR = 1.0/table->invDx; + for (int i = 0; in; i++) + { + real_t r = table->x0+i*dR; + fprintf(potData, "%d %e %e\n", i, r, table->values[i]); + } + fclose(potData); +} + +/// Reads potential data from a setfl file and populates +/// corresponding members and InterpolationObjects in an EamPotential. +/// +/// setfl is a file format for tabulated potential functions used by +/// the original EAM code DYNAMO. A setfl file contains EAM +/// potentials for multiple elements. +/// +/// The contents of a setfl file are: +/// +/// | Line Num | Description +/// | :------: | :---------- +/// | 1 - 3 | comments +/// | 4 | ntypes type1 type2 ... typen +/// | 5 | nrho drho nr dr rcutoff +/// | F, rho | Following line 5 there is a block for each atom type with F, and rho. +/// | b1 | ielem(i) amass(i) latConst(i) latType(i) +/// | b2 | embedding function values F(rhobar) starting at rhobar=0 +/// | ... | (nrho values. Multiple values per line allowed.) +/// | bn | electron density, starting at r=0 +/// | ... | (nr values. Multiple values per line allowed.) +/// | repeat | Return to b1 for each atom type. +/// | phi | phi_ij for (1,1), (2,1), (2,2), (3,1), (3,2), (3,3), (4,1), ..., +/// | p1 | pair potential between type i and type j, starting at r=0 +/// | ... | (nr values. Multiple values per line allowed.) +/// | repeat | Return to p1 for each phi_ij +/// +/// Where: +/// - ntypes : number of element types in the potential +/// - nrho : number of points the embedding energy F(rhobar) +/// - drho : table spacing for rhobar +/// - nr : number of points for rho(r) and phi(r) +/// - dr : table spacing for r in Angstroms +/// - rcutoff : cut-off distance in Angstroms +/// - ielem(i) : atomic number for element(i) +/// - amass(i) : atomic mass for element(i) in AMU +/// - latConst(i) : lattice constant for element(i) in Angstroms +/// - latType(i) : lattice type for element(i) +/// +/// setfl format stores r*phi(r), so we need to converted to the pair +/// potential phi(r). In the file, phi(r)*r is in eV*Angstroms. +/// NB: phi is not defined for r = 0 +/// +/// F(rhobar) is in eV. +/// +void eamReadSetfl(EamPotential* pot, const char* dir, const char* potName) +{ + char tmp[4096]; + sprintf(tmp, "%s/%s", dir, potName); + + FILE* potFile = fopen(tmp, "r"); + if (potFile == NULL) + fileNotFound("eamReadSetfl", tmp); + + // read the first 3 lines (comments) + fgets(tmp, sizeof(tmp), potFile); + fgets(tmp, sizeof(tmp), potFile); + fgets(tmp, sizeof(tmp), potFile); + + // line 4 + fgets(tmp, sizeof(tmp), potFile); + int nElems; + sscanf(tmp, "%d", &nElems); + if( nElems != 1 ) + notAlloyReady("eamReadSetfl"); + + //line 5 + int nRho, nR; + double dRho, dR, cutoff; + // The same cutoff is used by all alloys, NB: cutoff = nR * dR is redundant + fgets(tmp, sizeof(tmp), potFile); + sscanf(tmp, "%d %le %d %le %le", &nRho, &dRho, &nR, &dR, &cutoff); + pot->cutoff = cutoff; + + // **** THIS CODE IS RESTRICTED TO ONE ELEMENT + // Per-atom header + fgets(tmp, sizeof(tmp), potFile); + int nAtomic; + double mass, lat; + char latticeType[8]; + sscanf(tmp, "%d %le %le %s", &nAtomic, &mass, &lat, latticeType); + pot->atomicNo = nAtomic; + pot->lat = lat; + pot->mass = mass * amuToInternalMass; // file has mass in AMU. + strcpy(pot->latticeType, latticeType); + + // allocate read buffer + int bufSize = MAX(nRho, nR); + real_t* buf = comdMalloc(bufSize * sizeof(real_t)); + real_t x0 = 0.0; + + // Read embedding energy F(rhobar) + for (int ii=0; iif = initInterpolationObject(nRho, x0, dRho, buf); + + // Read electron density rho(r) + for (int ii=0; iirho = initInterpolationObject(nR, x0, dR, buf); + + // Read phi(r)*r and convert to phi(r) + for (int ii=0; iiphi = initInterpolationObject(nR, x0, dR, buf); + + comdFree(buf); + + // write to text file for comparison, currently commented out +/* printPot(pot->f, "SetflDataF.txt"); */ +/* printPot(pot->rho, "SetflDataRho.txt"); */ +/* printPot(pot->phi, "SetflDataPhi.txt"); */ +} + +/// Reads potential data from a funcfl file and populates +/// corresponding members and InterpolationObjects in an EamPotential. +/// +/// funcfl is a file format for tabulated potential functions used by +/// the original EAM code DYNAMO. A funcfl file contains an EAM +/// potential for a single element. +/// +/// The contents of a funcfl file are: +/// +/// | Line Num | Description +/// | :------: | :---------- +/// | 1 | comments +/// | 2 | elem amass latConstant latType +/// | 3 | nrho drho nr dr rcutoff +/// | 4 | embedding function values F(rhobar) starting at rhobar=0 +/// | ... | (nrho values. Multiple values per line allowed.) +/// | x' | electrostatic interation Z(r) starting at r=0 +/// | ... | (nr values. Multiple values per line allowed.) +/// | y' | electron density values rho(r) starting at r=0 +/// | ... | (nr values. Multiple values per line allowed.) +/// +/// Where: +/// - elem : atomic number for this element +/// - amass : atomic mass for this element in AMU +/// - latConstant : lattice constant for this elemnent in Angstroms +/// - lattticeType : lattice type for this element (e.g. FCC) +/// - nrho : number of values for the embedding function, F(rhobar) +/// - drho : table spacing for rhobar +/// - nr : number of values for Z(r) and rho(r) +/// - dr : table spacing for r in Angstroms +/// - rcutoff : potential cut-off distance in Angstroms +/// +/// funcfl format stores the "electrostatic interation" Z(r). This needs to +/// be converted to the pair potential phi(r). +/// using the formula +/// \f[phi = Z(r) * Z(r) / r\f] +/// NB: phi is not defined for r = 0 +/// +/// Z(r) is in atomic units (i.e., sqrt[Hartree * bohr]) so it is +/// necesary to convert to eV. +/// +/// F(rhobar) is in eV. +/// +void eamReadFuncfl(EamPotential* pot, const char* dir, const char* potName) +{ + char tmp[4096]; + + sprintf(tmp, "%s/%s", dir, potName); + FILE* potFile = fopen(tmp, "r"); + if (potFile == NULL) + fileNotFound("eamReadFuncfl", tmp); + + // line 1 + fgets(tmp, sizeof(tmp), potFile); + char name[3]; + sscanf(tmp, "%s", name); + strcpy(pot->name, name); + + // line 2 + int nAtomic; + double mass, lat; + char latticeType[8]; + fgets(tmp,sizeof(tmp),potFile); + sscanf(tmp, "%d %le %le %s", &nAtomic, &mass, &lat, latticeType); + pot->atomicNo = nAtomic; + pot->lat = lat; + pot->mass = mass*amuToInternalMass; // file has mass in AMU. + strcpy(pot->latticeType, latticeType); + + // line 3 + int nRho, nR; + double dRho, dR, cutoff; + fgets(tmp,sizeof(tmp),potFile); + sscanf(tmp, "%d %le %d %le %le", &nRho, &dRho, &nR, &dR, &cutoff); + pot->cutoff = cutoff; + real_t x0 = 0.0; // tables start at zero. + + // allocate read buffer + int bufSize = MAX(nRho, nR); + real_t* buf = comdMalloc(bufSize * sizeof(real_t)); + + // read embedding energy + for (int ii=0; iif = initInterpolationObject(nRho, x0, dRho, buf); + + // read Z(r) and convert to phi(r) + for (int ii=0; iiphi = initInterpolationObject(nR, x0, dR, buf); + + // read electron density rho + for (int ii=0; iirho = initInterpolationObject(nR, x0, dR, buf); + + comdFree(buf); + +/* printPot(pot->f, "funcflDataF.txt"); */ +/* printPot(pot->rho, "funcflDataRho.txt"); */ +/* printPot(pot->phi, "funcflDataPhi.txt"); */ +} + +void fileNotFound(const char* callSite, const char* filename) +{ + fprintf(screenOut, + "%s: Can't open file %s. Fatal Error.\n", callSite, filename); + exit(-1); +} + +void notAlloyReady(const char* callSite) +{ + fprintf(screenOut, + "%s: CoMD 1.1 does not support alloys and cannot\n" + " read setfl files with multiple species. Fatal Error.\n", callSite); + exit(-1); +} + +void typeNotSupported(const char* callSite, const char* type) +{ + fprintf(screenOut, + "%s: Potential type %s not supported. Fatal Error.\n", callSite, type); + exit(-1); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.h @@ -0,0 +1,86 @@ +/// \file +/// Communicate halo data such as "ghost" atoms with neighboring tasks. + +#ifndef __HALO_EXCHANGE_ +#define __HALO_EXCHANGE_ + +#include "mytype.h" +struct AtomsSt; +struct LinkCellSt; +struct DomainSt; + +/// A polymorphic structure to store information about a halo exchange. +/// This structure can be thought of as an abstract base class that +/// specifies the interface and implements the communication patterns of +/// a halo exchange. Concrete sub-classes supply actual implementations +/// of the loadBuffer, unloadBuffer, and destroy functions, that are +/// specific to the actual data being exchanged. If the subclass needs +/// additional data members, these can be stored in a structure that is +/// pointed to by parms. +/// +/// Designing the structure this way allows us to re-use the +/// communication code for both atom data and partial force data. +/// +/// \see eamForce +/// \see redistributeAtoms +typedef struct HaloExchangeSt +{ + /// The MPI ranks of the six face neighbors of the local domain. + /// Ranks are stored in the order specified in HaloFaceOrder. + int nbrRank[6]; + /// The maximum send/recv buffer size (in bytes) that will be needed + /// for this halo exchange. + int bufCapacity; + /// Pointer to a sub-class specific function to load the send buffer. + /// \param [in] parms The parms member of the structure. This is a + /// pointer to a sub-class specific structure that can + /// be used by the load and unload functions to store + /// sub-class specific data. + /// \param [in] data A pointer to a structure that the contains the data + /// that is needed by the loadBuffer function. The + /// loadBuffer function will cast the pointer to a + /// concrete type that is appropriate for the data + /// being exchanged. + /// \param [in] face Specifies the face across which data is being sent. + /// \param [in] buf The send buffer to be loaded + /// \return The number of bytes loaded into the send buffer. + int (*loadBuffer)(void* parms, void* data, int face, char* buf); + /// Pointer to a sub-class specific function to unload the recv buffer. + /// \param [in] parms The parms member of the structure. This is a + /// pointer to a sub-class specific structure that can + /// be used by the load and unload functions to store + /// sub-class specific data. + /// \param [out] data A pointer to a structure that the contains the data + /// that is needed by the unloadBuffer function. The + /// unloadBuffer function will cast the pointer to a + /// concrete type that is appropriate for the data + /// being exchanged. + /// \param [in] face Specifies the face across which data is being sent. + /// \param [in] bufSize The number of bytes in the recv buffer. + /// \param [in] buf The recv buffer to be unloaded. + void (*unloadBuffer)(void* parms, void* data, int face, int bufSize, char* buf); + /// Pointer to a function to deallocate any memory used by the + /// sub-class parms. Essentially this is a virtual destructor. + void (*destroy)(void* parms); + /// A pointer to a sub-class specific structure that contains + /// additional data members needed by the sub-class. + void* parms; +} +HaloExchange; + +/// Create a HaloExchange for atom data. +HaloExchange* initAtomHaloExchange(struct DomainSt* domain, struct LinkCellSt* boxes); + +/// Create a HaloExchange for force data. +HaloExchange* initForceHaloExchange(struct DomainSt* domain, struct LinkCellSt* boxes); + +/// HaloExchange destructor. +void destroyHaloExchange(HaloExchange** haloExchange); + +/// Execute a halo exchange. +void haloExchange(HaloExchange* haloExchange, void* data); + +/// Sort the atoms by gid in the specified link cell. +void sortAtomsInCell(struct AtomsSt* atoms, struct LinkCellSt* boxes, int iBox); + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/haloExchange.c @@ -0,0 +1,662 @@ +/// \file +/// Communicate halo data such as "ghost" atoms with neighboring tasks. +/// In addition to ghost atoms, the EAM potential also needs to exchange +/// some force information. Hence this file implements both an atom +/// exchange and a force exchange, each with slightly different +/// properties due to their different roles. +/// +/// The halo exchange in CoMD 1.1 takes advantage of the Cartesian domain +/// decomposition as well as the link cell structure to quickly +/// determine what data needs to be sent. +/// +/// This halo exchange implementation is able to send data to all 26 +/// neighboring tasks using only 6 messages. This is accomplished by +/// sending data across the x-faces, then the y-faces, and finally +/// across the z-faces. Some of the data that was received from the +/// x-faces is included in the y-face sends and so on. This +/// accumulation of data allows data to reach edge neighbors and corner +/// neighbors by a two or three step process. +/// +/// The advantage of this type of structured halo exchange is that it +/// minimizes the number of MPI messages to send, and maximizes the size +/// of those messages. +/// +/// The disadvantage of this halo exchange is that it serializes message +/// traffic. Only two messages can be in flight at once. The x-axis +/// messages must be received and processed before the y-axis messages +/// can begin. Architectures with low message latency and many off node +/// network links would likely benefit from alternate halo exchange +/// strategies that send independent messages to each neighbor task. + +#include "haloExchange.h" + +#include + +#include "CoMDTypes.h" +#include "decomposition.h" +#include "parallel.h" +#include "linkCells.h" +#include "eam.h" +#include "memUtils.h" +#include "performanceTimers.h" + +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +/// Don't change the order of the faces in this enum. +enum HaloFaceOrder {HALO_X_MINUS, HALO_X_PLUS, + HALO_Y_MINUS, HALO_Y_PLUS, + HALO_Z_MINUS, HALO_Z_PLUS}; + +/// Don't change the order of the axes in this enum. +enum HaloAxisOrder {HALO_X_AXIS, HALO_Y_AXIS, HALO_Z_AXIS}; + +/// Extra data members that are needed for the exchange of atom data. +/// For an atom exchange, the HaloExchangeSt::parms will point to a +/// structure of this type. +typedef struct AtomExchangeParmsSt +{ + int nCells[6]; //!< Number of cells in cellList for each face. + int* cellList[6]; //!< List of link cells from which to load data for each face. + real_t* pbcFactor[6]; //!< Whether this face is a periodic boundary. +} +AtomExchangeParms; + +/// Extra data members that are needed for the exchange of force data. +/// For an force exchange, the HaloExchangeSt::parms will point to a +/// structure of this type. +typedef struct ForceExchangeParmsSt +{ + int nCells[6]; //!< Number of cells to send/recv for each face. + int* sendCells[6]; //!< List of link cells to send for each face. + int* recvCells[6]; //!< List of link cells to recv for each face. +} +ForceExchangeParms; + +/// A structure to package data for a single atom to pack into a +/// send/recv buffer. Also used for sorting atoms within link cells. +typedef struct AtomMsgSt +{ + int gid; + int type; + real_t rx, ry, rz; + real_t px, py, pz; +} +AtomMsg; + +/// Package data for the force exchange. +typedef struct ForceMsgSt +{ + real_t dfEmbed; +} +ForceMsg; + +static HaloExchange* initHaloExchange(Domain* domain); +static void exchangeData(HaloExchange* haloExchange, void* data, int iAxis); + +static int* mkAtomCellList(LinkCell* boxes, enum HaloFaceOrder iFace, const int nCells); +static int loadAtomsBuffer(void* vparms, void* data, int face, char* charBuf); +static void unloadAtomsBuffer(void* vparms, void* data, int face, int bufSize, char* charBuf); +static void destroyAtomsExchange(void* vparms); + +static int* mkForceSendCellList(LinkCell* boxes, int face, int nCells); +static int* mkForceRecvCellList(LinkCell* boxes, int face, int nCells); +static int loadForceBuffer(void* vparms, void* data, int face, char* charBuf); +static void unloadForceBuffer(void* vparms, void* data, int face, int bufSize, char* charBuf); +static void destroyForceExchange(void* vparms); +static int sortAtomsById(const void* a, const void* b); + +/// \details +/// When called in proper sequence by redistributeAtoms, the atom halo +/// exchange helps serve three purposes: +/// - Send ghost atom data to neighbor tasks. +/// - Shift atom coordinates by the global simulation size when they cross +/// periodic boundaries. This shift is performed in loadAtomsBuffer. +/// - Transfer ownership of atoms between tasks as the atoms move across +/// spatial domain boundaries. This transfer of ownership occurs in +/// two places. The former owner gives up ownership when +/// updateLinkCells moves a formerly local atom into a halo link cell. +/// The new owner accepts ownership when unloadAtomsBuffer calls +/// putAtomInBox to place a received atom into a local link cell. +/// +/// This constructor does the following: +/// +/// - Sets the bufCapacity to hold the largest possible number of atoms +/// that can be sent across a face. +/// - Initialize function pointers to the atom-specific versions +/// - Sets the number of link cells to send across each face. +/// - Builds the list of link cells to send across each face. As +/// explained in the comments for mkAtomCellList, this list must +/// include any link cell, local or halo, that could possibly contain +/// an atom that needs to be sent across the face. Atoms that need to +/// be sent include "ghost atoms" that are located in local link +/// cells that correspond to halo link cells on receiving tasks as well as +/// formerly local atoms that have just moved into halo link cells and +/// need to be sent to the rank that owns the spatial domain the atom +/// has moved into. +/// - Sets a coordinate shift factor for each face to account for +/// periodic boundary conditions. For most faces the factor is zero. +/// For faces on the +x, +y, or +z face of the simulation domain +/// the factor is -1.0 (to shift the coordinates by -1 times the +/// simulation domain size). For -x, -y, and -z faces of the +/// simulation domain, the factor is +1.0. +/// +/// \see redistributeAtoms +HaloExchange* initAtomHaloExchange(Domain* domain, LinkCell* boxes) +{ + HaloExchange* hh = initHaloExchange(domain); + + int size0 = (boxes->gridSize[1]+2)*(boxes->gridSize[2]+2); + int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]+2); + int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2); + int maxSize = MAX(size0, size1); + maxSize = MAX(size1, size2); + hh->bufCapacity = maxSize*2*MAXATOMS*sizeof(AtomMsg); + + hh->loadBuffer = loadAtomsBuffer; + hh->unloadBuffer = unloadAtomsBuffer; + hh->destroy = destroyAtomsExchange; + + AtomExchangeParms* parms = comdMalloc(sizeof(AtomExchangeParms)); + + parms->nCells[HALO_X_MINUS] = 2*(boxes->gridSize[1]+2)*(boxes->gridSize[2]+2); + parms->nCells[HALO_Y_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[2]+2); + parms->nCells[HALO_Z_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[1]+2); + parms->nCells[HALO_X_PLUS] = parms->nCells[HALO_X_MINUS]; + parms->nCells[HALO_Y_PLUS] = parms->nCells[HALO_Y_MINUS]; + parms->nCells[HALO_Z_PLUS] = parms->nCells[HALO_Z_MINUS]; + + for (int ii=0; ii<6; ++ii) + parms->cellList[ii] = mkAtomCellList(boxes, ii, parms->nCells[ii]); + + for (int ii=0; ii<6; ++ii) + { + parms->pbcFactor[ii] = comdMalloc(3*sizeof(real_t)); + for (int jj=0; jj<3; ++jj) + parms->pbcFactor[ii][jj] = 0.0; + } + int* procCoord = domain->procCoord; //alias + int* procGrid = domain->procGrid; //alias + if (procCoord[HALO_X_AXIS] == 0) parms->pbcFactor[HALO_X_MINUS][HALO_X_AXIS] = +1.0; + if (procCoord[HALO_X_AXIS] == procGrid[HALO_X_AXIS]-1) parms->pbcFactor[HALO_X_PLUS][HALO_X_AXIS] = -1.0; + if (procCoord[HALO_Y_AXIS] == 0) parms->pbcFactor[HALO_Y_MINUS][HALO_Y_AXIS] = +1.0; + if (procCoord[HALO_Y_AXIS] == procGrid[HALO_Y_AXIS]-1) parms->pbcFactor[HALO_Y_PLUS][HALO_Y_AXIS] = -1.0; + if (procCoord[HALO_Z_AXIS] == 0) parms->pbcFactor[HALO_Z_MINUS][HALO_Z_AXIS] = +1.0; + if (procCoord[HALO_Z_AXIS] == procGrid[HALO_Z_AXIS]-1) parms->pbcFactor[HALO_Z_PLUS][HALO_Z_AXIS] = -1.0; + + hh->parms = parms; + return hh; +} + +/// The force exchange is considerably simpler than the atom exchange. +/// In the force case we only need to exchange data that is needed to +/// complete the force calculation. Since the atoms have not moved we +/// only need to send data from local link cells and we are guaranteed +/// that the same atoms exist in the same order in corresponding halo +/// cells on remote tasks. The only tricky part is the size of the +/// plane of local cells that needs to be sent grows in each direction. +/// This is because the y-axis send must send some of the data that was +/// received from the x-axis send, and the z-axis must send some data +/// from the y-axis send. This accumulation of data to send is +/// responsible for data reaching neighbor cells that share only edges +/// or corners. +/// +/// \see eam.c for an explanation of the requirement to exchange +/// force data. +HaloExchange* initForceHaloExchange(Domain* domain, LinkCell* boxes) +{ + HaloExchange* hh = initHaloExchange(domain); + + hh->loadBuffer = loadForceBuffer; + hh->unloadBuffer = unloadForceBuffer; + hh->destroy = destroyForceExchange; + + int size0 = (boxes->gridSize[1])*(boxes->gridSize[2]); + int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]); + int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2); + int maxSize = MAX(size0, size1); + maxSize = MAX(size1, size2); + hh->bufCapacity = (maxSize)*MAXATOMS*sizeof(ForceMsg); + + ForceExchangeParms* parms = comdMalloc(sizeof(ForceExchangeParms)); + + parms->nCells[HALO_X_MINUS] = (boxes->gridSize[1] )*(boxes->gridSize[2] ); + parms->nCells[HALO_Y_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[2] ); + parms->nCells[HALO_Z_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2); + parms->nCells[HALO_X_PLUS] = parms->nCells[HALO_X_MINUS]; + parms->nCells[HALO_Y_PLUS] = parms->nCells[HALO_Y_MINUS]; + parms->nCells[HALO_Z_PLUS] = parms->nCells[HALO_Z_MINUS]; + + for (int ii=0; ii<6; ++ii) + { + parms->sendCells[ii] = mkForceSendCellList(boxes, ii, parms->nCells[ii]); + parms->recvCells[ii] = mkForceRecvCellList(boxes, ii, parms->nCells[ii]); + } + + hh->parms = parms; + return hh; +} + +void destroyHaloExchange(HaloExchange** haloExchange) +{ + (*haloExchange)->destroy((*haloExchange)->parms); + comdFree((*haloExchange)->parms); + comdFree(*haloExchange); + *haloExchange = NULL; +} + +void haloExchange(HaloExchange* haloExchange, void* data) +{ + for (int iAxis=0; iAxis<3; ++iAxis) + exchangeData(haloExchange, data, iAxis); +} + +/// Base class constructor. +HaloExchange* initHaloExchange(Domain* domain) +{ + HaloExchange* hh = comdMalloc(sizeof(HaloExchange)); + + // Rank of neighbor task for each face. + hh->nbrRank[HALO_X_MINUS] = processorNum(domain, -1, 0, 0); + hh->nbrRank[HALO_X_PLUS] = processorNum(domain, +1, 0, 0); + hh->nbrRank[HALO_Y_MINUS] = processorNum(domain, 0, -1, 0); + hh->nbrRank[HALO_Y_PLUS] = processorNum(domain, 0, +1, 0); + hh->nbrRank[HALO_Z_MINUS] = processorNum(domain, 0, 0, -1); + hh->nbrRank[HALO_Z_PLUS] = processorNum(domain, 0, 0, +1); + hh->bufCapacity = 0; // will be set by sub-class. + + return hh; +} + +/// This is the function that does the heavy lifting for the +/// communication of halo data. It is called once for each axis and +/// sends and receives two message. Loading and unloading of the +/// buffers is in the hands of the sub-class virtual functions. +/// +/// \param [in] iAxis Axis index. +/// \param [in, out] data Pointer to data that will be passed to the load and +/// unload functions +void exchangeData(HaloExchange* haloExchange, void* data, int iAxis) +{ + enum HaloFaceOrder faceM = 2*iAxis; + enum HaloFaceOrder faceP = faceM+1; + + char* sendBufM = comdMalloc(haloExchange->bufCapacity); + char* sendBufP = comdMalloc(haloExchange->bufCapacity); + char* recvBufM = comdMalloc(haloExchange->bufCapacity); + char* recvBufP = comdMalloc(haloExchange->bufCapacity); + + int nSendM = haloExchange->loadBuffer(haloExchange->parms, data, faceM, sendBufM); + int nSendP = haloExchange->loadBuffer(haloExchange->parms, data, faceP, sendBufP); + + int nbrRankM = haloExchange->nbrRank[faceM]; + int nbrRankP = haloExchange->nbrRank[faceP]; + + int nRecvM, nRecvP; + + startTimer(commHaloTimer); + nRecvP = sendReceiveParallel(sendBufM, nSendM, nbrRankM, recvBufP, haloExchange->bufCapacity, nbrRankP); + nRecvM = sendReceiveParallel(sendBufP, nSendP, nbrRankP, recvBufM, haloExchange->bufCapacity, nbrRankM); + stopTimer(commHaloTimer); + + haloExchange->unloadBuffer(haloExchange->parms, data, faceM, nRecvM, recvBufM); + haloExchange->unloadBuffer(haloExchange->parms, data, faceP, nRecvP, recvBufP); + comdFree(recvBufP); + comdFree(recvBufM); + comdFree(sendBufP); + comdFree(sendBufM); +} + +/// Make a list of link cells that need to be sent across the specified +/// face. For each face, the list must include all cells, local and +/// halo, in the first two planes of link cells. Halo cells must be +/// included in the list of link cells to send since local atoms may +/// have moved from local cells into halo cells on this time step. +/// (Actual remote atoms should have been deleted, so the halo cells +/// should contain only these few atoms that have just crossed.) +/// Sending these atoms will allow them to be reassigned to the task +/// that covers the spatial domain they have moved into. +/// +/// Note that link cell grid coordinates range from -1 to gridSize[iAxis]. +/// \see initLinkCells for an explanation link cell grid coordinates. +/// +/// \param [in] boxes Link cell information. +/// \param [in] iFace Index of the face data will be sent across. +/// \param [in] nCells Number of cells to send. This is used for a +/// consistency check. +/// \return The list of cells to send. Caller is responsible to free +/// the list. +int* mkAtomCellList(LinkCell* boxes, enum HaloFaceOrder iFace, const int nCells) +{ + int* list = comdMalloc(nCells*sizeof(int)); + int xBegin = -1; + int xEnd = boxes->gridSize[0]+1; + int yBegin = -1; + int yEnd = boxes->gridSize[1]+1; + int zBegin = -1; + int zEnd = boxes->gridSize[2]+1; + + if (iFace == HALO_X_MINUS) xEnd = xBegin+2; + if (iFace == HALO_X_PLUS) xBegin = xEnd-2; + if (iFace == HALO_Y_MINUS) yEnd = yBegin+2; + if (iFace == HALO_Y_PLUS) yBegin = yEnd-2; + if (iFace == HALO_Z_MINUS) zEnd = zBegin+2; + if (iFace == HALO_Z_PLUS) zBegin = zEnd-2; + + int count = 0; + for (int ix=xBegin; ixpbcFactor[face]; + real3 shift; + shift[0] = pbcFactor[0] * s->domain->globalExtent[0]; + shift[1] = pbcFactor[1] * s->domain->globalExtent[1]; + shift[2] = pbcFactor[2] * s->domain->globalExtent[2]; + + int nCells = parms->nCells[face]; + int* cellList = parms->cellList[face]; + int nBuf = 0; + for (int iCell=0; iCellboxes->nAtoms[iBox]; ++ii) + { + buf[nBuf].gid = s->atoms->gid[ii]; + buf[nBuf].type = s->atoms->iSpecies[ii]; + buf[nBuf].rx = s->atoms->r[ii][0] + shift[0]; + buf[nBuf].ry = s->atoms->r[ii][1] + shift[1]; + buf[nBuf].rz = s->atoms->r[ii][2] + shift[2]; + buf[nBuf].px = s->atoms->p[ii][0]; + buf[nBuf].py = s->atoms->p[ii][1]; + buf[nBuf].pz = s->atoms->p[ii][2]; + ++nBuf; + } + } + return nBuf*sizeof(AtomMsg); +} + +/// The unloadBuffer function for a halo exchange of atom data. +/// Iterates the receive buffer and places each atom that was received +/// into the link cell that corresponds to the atom coordinate. Note +/// that this naturally accomplishes transfer of ownership of atoms that +/// have moved from one spatial domain to another. Atoms with +/// coordinates in local link cells automatically become local +/// particles. Atoms that are owned by other ranks are automatically +/// placed in halo kink cells. +/// \see HaloExchangeSt::unloadBuffer for an explanation of the +/// unloadBuffer parameters. +void unloadAtomsBuffer(void* vparms, void* data, int face, int bufSize, char* charBuf) +{ + AtomExchangeParms* parms = (AtomExchangeParms*) vparms; + SimFlat* s = (SimFlat*) data; + AtomMsg* buf = (AtomMsg*) charBuf; + int nBuf = bufSize / sizeof(AtomMsg); + assert(bufSize % sizeof(AtomMsg) == 0); + + for (int ii=0; iiboxes, s->atoms, gid, type, rx, ry, rz, px, py, pz); + } +} + +void destroyAtomsExchange(void* vparms) +{ + AtomExchangeParms* parms = (AtomExchangeParms*) vparms; + + for (int ii=0; ii<6; ++ii) + { + comdFree(parms->pbcFactor[ii]); + comdFree(parms->cellList[ii]); + } +} + +/// Make a list of link cells that need to send data across the +/// specified face. Note that this list must be compatible with the +/// corresponding recv list to ensure that the data goes to the correct +/// atoms. +/// +/// \see initLinkCells for information about the conventions for grid +/// coordinates of link cells. +int* mkForceSendCellList(LinkCell* boxes, int face, int nCells) +{ + int* list = comdMalloc(nCells*sizeof(int)); + int xBegin, xEnd, yBegin, yEnd, zBegin, zEnd; + + int nx = boxes->gridSize[0]; + int ny = boxes->gridSize[1]; + int nz = boxes->gridSize[2]; + switch(face) + { + case HALO_X_MINUS: + xBegin=0; xEnd=1; yBegin=0; yEnd=ny; zBegin=0; zEnd=nz; + break; + case HALO_X_PLUS: + xBegin=nx-1; xEnd=nx; yBegin=0; yEnd=ny; zBegin=0; zEnd=nz; + break; + case HALO_Y_MINUS: + xBegin=-1; xEnd=nx+1; yBegin=0; yEnd=1; zBegin=0; zEnd=nz; + break; + case HALO_Y_PLUS: + xBegin=-1; xEnd=nx+1; yBegin=ny-1; yEnd=ny; zBegin=0; zEnd=nz; + break; + case HALO_Z_MINUS: + xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=ny+1; zBegin=0; zEnd=1; + break; + case HALO_Z_PLUS: + xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=ny+1; zBegin=nz-1; zEnd=nz; + break; + default: + assert(1==0); + } + + int count = 0; + for (int ix=xBegin; ixgridSize[0]; + int ny = boxes->gridSize[1]; + int nz = boxes->gridSize[2]; + switch(face) + { + case HALO_X_MINUS: + xBegin=-1; xEnd=0; yBegin=0; yEnd=ny; zBegin=0; zEnd=nz; + break; + case HALO_X_PLUS: + xBegin=nx; xEnd=nx+1; yBegin=0; yEnd=ny; zBegin=0; zEnd=nz; + break; + case HALO_Y_MINUS: + xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=0; zBegin=0; zEnd=nz; + break; + case HALO_Y_PLUS: + xBegin=-1; xEnd=nx+1; yBegin=ny; yEnd=ny+1; zBegin=0; zEnd=nz; + break; + case HALO_Z_MINUS: + xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=ny+1; zBegin=-1; zEnd=0; + break; + case HALO_Z_PLUS: + xBegin=-1; xEnd=nx+1; yBegin=-1; yEnd=ny+1; zBegin=nz; zEnd=nz+1; + break; + default: + assert(1==0); + } + + int count = 0; + for (int ix=xBegin; ixnCells[face]; + int* cellList = parms->sendCells[face]; + int nBuf = 0; + for (int iCell=0; iCellboxes->nAtoms[iBox]; ++ii) + { + buf[nBuf].dfEmbed = data->dfEmbed[ii]; + ++nBuf; + } + } + return nBuf*sizeof(ForceMsg); +} + +/// The unloadBuffer function for a force exchange. +/// Data is received in an order that naturally aligns with the atom +/// storage so it is simple to put the data where it belongs. +/// +/// \see HaloExchangeSt::unloadBuffer for an explanation of the +/// unloadBuffer parameters. +void unloadForceBuffer(void* vparms, void* vdata, int face, int bufSize, char* charBuf) +{ + ForceExchangeParms* parms = (ForceExchangeParms*) vparms; + ForceExchangeData* data = (ForceExchangeData*) vdata; + ForceMsg* buf = (ForceMsg*) charBuf; + assert(bufSize % sizeof(ForceMsg) == 0); + + int nCells = parms->nCells[face]; + int* cellList = parms->recvCells[face]; + int iBuf = 0; + for (int iCell=0; iCellboxes->nAtoms[iBox]; ++ii) + { + data->dfEmbed[ii] = buf[iBuf].dfEmbed; + ++iBuf; + } + } + assert(iBuf == bufSize/ sizeof(ForceMsg)); +} + +void destroyForceExchange(void* vparms) +{ + ForceExchangeParms* parms = (ForceExchangeParms*) vparms; + + for (int ii=0; ii<6; ++ii) + { + comdFree(parms->sendCells[ii]); + comdFree(parms->recvCells[ii]); + } +} + +/// \details +/// The force exchange assumes that the atoms are in the same order in +/// both a given local link cell and the corresponding remote cell(s). +/// However, the atom exchange does not guarantee this property, +/// especially when atoms cross a domain decomposition boundary and move +/// from one task to another. Trying to maintain the atom order during +/// the atom exchange would immensely complicate that code. Instead, we +/// just sort the atoms after the atom exchange. +void sortAtomsInCell(Atoms* atoms, LinkCell* boxes, int iBox) +{ + int nAtoms = boxes->nAtoms[iBox]; + + AtomMsg tmp[nAtoms]; + + int begin = iBox*MAXATOMS; + int end = begin + nAtoms; + for (int ii=begin, iTmp=0; iigid[ii]; + tmp[iTmp].type = atoms->iSpecies[ii]; + tmp[iTmp].rx = atoms->r[ii][0]; + tmp[iTmp].ry = atoms->r[ii][1]; + tmp[iTmp].rz = atoms->r[ii][2]; + tmp[iTmp].px = atoms->p[ii][0]; + tmp[iTmp].py = atoms->p[ii][1]; + tmp[iTmp].pz = atoms->p[ii][2]; + } + qsort(&tmp, nAtoms, sizeof(AtomMsg), sortAtomsById); + for (int ii=begin, iTmp=0; iigid[ii] = tmp[iTmp].gid; + atoms->iSpecies[ii] = tmp[iTmp].type; + atoms->r[ii][0] = tmp[iTmp].rx; + atoms->r[ii][1] = tmp[iTmp].ry; + atoms->r[ii][2] = tmp[iTmp].rz; + atoms->p[ii][0] = tmp[iTmp].px; + atoms->p[ii][1] = tmp[iTmp].py; + atoms->p[ii][2] = tmp[iTmp].pz; + } + +} + +/// A function suitable for passing to qsort to sort atoms by gid. +/// Because every atom in the simulation is supposed to have a unique +/// id, this function checks that the atoms have different gids. If +/// that assertion ever fails it is a sign that something has gone +/// wrong elsewhere in the code. +int sortAtomsById(const void* a, const void* b) +{ + int aId = ((AtomMsg*) a)->gid; + int bId = ((AtomMsg*) b)->gid; + assert(aId != bId); + + if (aId < bId) + return -1; + return 1; +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.h @@ -0,0 +1,38 @@ +/// \file +/// Initialize the atom configuration. + +#ifndef __INIT_ATOMS_H +#define __INIT_ATOMS_H + +#include "mytype.h" + +struct SimFlatSt; +struct LinkCellSt; + +/// Atom data +typedef struct AtomsSt +{ + // atom-specific data + int nLocal; //!< total number of atoms on this processor + int nGlobal; //!< total number of atoms in simulation + + int* gid; //!< A globally unique id for each atom + int* iSpecies; //!< the species index of the atom + + real3* r; //!< positions + real3* p; //!< momenta of atoms + real3* f; //!< forces + real_t* U; //!< potential energy per atom +} Atoms; + + +/// Allocates memory to store atom data. +Atoms* initAtoms(struct LinkCellSt* boxes); +void destroyAtoms(struct AtomsSt* atoms); + +void createFccLattice(int nx, int ny, int nz, real_t lat, struct SimFlatSt* s); + +void setVcm(struct SimFlatSt* s, real_t vcm[3]); +void setTemperature(struct SimFlatSt* s, real_t temperature); +void randomDisplacements(struct SimFlatSt* s, real_t delta); +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/initAtoms.c @@ -0,0 +1,232 @@ +/// \file +/// Initialize the atom configuration. + +#include "initAtoms.h" + +#include +#include + +#include "constants.h" +#include "decomposition.h" +#include "parallel.h" +#include "random.h" +#include "linkCells.h" +#include "timestep.h" +#include "memUtils.h" +#include "performanceTimers.h" + +static void computeVcm(SimFlat* s, real_t vcm[3]); + +/// \details +/// Call functions such as createFccLattice and setTemperature to set up +/// initial atom positions and momenta. +Atoms* initAtoms(LinkCell* boxes) +{ + Atoms* atoms = comdMalloc(sizeof(Atoms)); + + int maxTotalAtoms = MAXATOMS*boxes->nTotalBoxes; + + atoms->gid = (int*) comdMalloc(maxTotalAtoms*sizeof(int)); + atoms->iSpecies = (int*) comdMalloc(maxTotalAtoms*sizeof(int)); + atoms->r = (real3*) comdMalloc(maxTotalAtoms*sizeof(real3)); + atoms->p = (real3*) comdMalloc(maxTotalAtoms*sizeof(real3)); + atoms->f = (real3*) comdMalloc(maxTotalAtoms*sizeof(real3)); + atoms->U = (real_t*)comdMalloc(maxTotalAtoms*sizeof(real_t)); + + atoms->nLocal = 0; + atoms->nGlobal = 0; + + for (int iOff = 0; iOff < maxTotalAtoms; iOff++) + { + atoms->gid[iOff] = 0; + atoms->iSpecies[iOff] = 0; + zeroReal3(atoms->r[iOff]); + zeroReal3(atoms->p[iOff]); + zeroReal3(atoms->f[iOff]); + atoms->U[iOff] = 0.; + } + + return atoms; +} + +void destroyAtoms(Atoms *atoms) +{ + freeMe(atoms,gid); + freeMe(atoms,iSpecies); + freeMe(atoms,r); + freeMe(atoms,p); + freeMe(atoms,f); + freeMe(atoms,U); + comdFree(atoms); +} + +/// Creates atom positions on a face centered cubic (FCC) lattice with +/// nx * ny * nz unit cells and lattice constant lat. +/// Set momenta to zero. +void createFccLattice(int nx, int ny, int nz, real_t lat, SimFlat* s) +{ + const real_t* localMin = s->domain->localMin; // alias + const real_t* localMax = s->domain->localMax; // alias + + int nb = 4; // number of atoms in the basis + real3 basis[4] = { {0.25, 0.25, 0.25}, + {0.25, 0.75, 0.75}, + {0.75, 0.25, 0.75}, + {0.75, 0.75, 0.25} }; + + // create and place atoms + int begin[3]; + int end[3]; + for (int ii=0; ii<3; ++ii) + { + begin[ii] = floor(localMin[ii]/lat); + end[ii] = ceil (localMax[ii]/lat); + } + + real_t px,py,pz; + px=py=pz=0.0; + for (int ix=begin[0]; ix= localMax[0]) continue; + if (ry < localMin[1] || ry >= localMax[1]) continue; + if (rz < localMin[2] || rz >= localMax[2]) continue; + int id = ib+nb*(iz+nz*(iy+ny*(ix))); + putAtomInBox(s->boxes, s->atoms, id, 0, rx, ry, rz, px, py, pz); + } + + // set total atoms in simulation + startTimer(commReduceTimer); + addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1); + stopTimer(commReduceTimer); + + assert(s->atoms->nGlobal == nb*nx*ny*nz); +} + +/// Sets the center of mass velocity of the system. +/// \param [in] newVcm The desired center of mass velocity. +void setVcm(SimFlat* s, real_t newVcm[3]) +{ + real_t oldVcm[3]; + computeVcm(s, oldVcm); + + real_t vShift[3]; + vShift[0] = (newVcm[0] - oldVcm[0]); + vShift[1] = (newVcm[1] - oldVcm[1]); + vShift[2] = (newVcm[2] - oldVcm[2]); + + for (int iBox=0; iBoxboxes->nLocalBoxes; ++iBox) + { + for (int iOff=MAXATOMS*iBox, ii=0; iiboxes->nAtoms[iBox]; ++ii, ++iOff) + { + int iSpecies = s->atoms->iSpecies[iOff]; + real_t mass = s->species[iSpecies].mass; + + s->atoms->p[iOff][0] += mass * vShift[0]; + s->atoms->p[iOff][1] += mass * vShift[1]; + s->atoms->p[iOff][2] += mass * vShift[2]; + } + } +} + +/// Sets the temperature of system. +/// +/// Selects atom velocities randomly from a boltzmann (equilibrium) +/// distribution that corresponds to the specified temperature. This +/// random process will typically result in a small, but non zero center +/// of mass velocity and a small difference from the specified +/// temperature. For typical MD runs these small differences are +/// unimportant, However, to avoid possible confusion, we set the center +/// of mass velocity to zero and scale the velocities to exactly match +/// the input temperature. +void setTemperature(SimFlat* s, real_t temperature) +{ + // set initial velocities for the distribution + for (int iBox=0; iBoxboxes->nLocalBoxes; ++iBox) + { + for (int iOff=MAXATOMS*iBox, ii=0; iiboxes->nAtoms[iBox]; ++ii, ++iOff) + { + int iType = s->atoms->iSpecies[iOff]; + real_t mass = s->species[iType].mass; + real_t sigma = sqrt(kB_eV * temperature/mass); + uint64_t seed = mkSeed(s->atoms->gid[iOff], 123); + s->atoms->p[iOff][0] = mass * sigma * gasdev(&seed); + s->atoms->p[iOff][1] = mass * sigma * gasdev(&seed); + s->atoms->p[iOff][2] = mass * sigma * gasdev(&seed); + } + } + // compute the resulting temperature + // kinetic energy = 3/2 kB * Temperature + if (temperature == 0.0) return; + real_t vZero[3] = {0., 0., 0.}; + setVcm(s, vZero); + kineticEnergy(s); + real_t temp = (s->eKinetic/s->atoms->nGlobal)/kB_eV/1.5; + // scale the velocities to achieve the target temperature + real_t scaleFactor = sqrt(temperature/temp); + for (int iBox=0; iBoxboxes->nLocalBoxes; ++iBox) + { + for (int iOff=MAXATOMS*iBox, ii=0; iiboxes->nAtoms[iBox]; ++ii, ++iOff) + { + s->atoms->p[iOff][0] *= scaleFactor; + s->atoms->p[iOff][1] *= scaleFactor; + s->atoms->p[iOff][2] *= scaleFactor; + } + } + kineticEnergy(s); + temp = s->eKinetic/s->atoms->nGlobal/kB_eV/1.5; +} + +/// Add a random displacement to the atom positions. +/// Atoms are displaced by a random distance in the range +/// [-delta, +delta] along each axis. +/// \param [in] delta The maximum displacement (along each axis). +void randomDisplacements(SimFlat* s, real_t delta) +{ + for (int iBox=0; iBoxboxes->nLocalBoxes; ++iBox) + { + for (int iOff=MAXATOMS*iBox, ii=0; iiboxes->nAtoms[iBox]; ++ii, ++iOff) + { + uint64_t seed = mkSeed(s->atoms->gid[iOff], 457); + s->atoms->r[iOff][0] += (2.0*lcg61(&seed)-1.0) * delta; + s->atoms->r[iOff][1] += (2.0*lcg61(&seed)-1.0) * delta; + s->atoms->r[iOff][2] += (2.0*lcg61(&seed)-1.0) * delta; + } + } +} + +/// Computes the center of mass velocity of the system. +void computeVcm(SimFlat* s, real_t vcm[3]) +{ + real_t vcmLocal[4] = {0., 0., 0., 0.}; + real_t vcmSum[4] = {0., 0., 0., 0.}; + + // sum the momenta and particle masses + for (int iBox=0; iBoxboxes->nLocalBoxes; ++iBox) + { + for (int iOff=MAXATOMS*iBox, ii=0; iiboxes->nAtoms[iBox]; ++ii, ++iOff) + { + vcmLocal[0] += s->atoms->p[iOff][0]; + vcmLocal[1] += s->atoms->p[iOff][1]; + vcmLocal[2] += s->atoms->p[iOff][2]; + + int iSpecies = s->atoms->iSpecies[iOff]; + vcmLocal[3] += s->species[iSpecies].mass; + } + } + + startTimer(commReduceTimer); + addRealParallel(vcmLocal, vcmSum, 4); + stopTimer(commReduceTimer); + + real_t totalMass = vcmSum[3]; + vcm[0] = vcmSum[0]/totalMass; + vcm[1] = vcmSum[1]/totalMass; + vcm[2] = vcmSum[2]/totalMass; +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.h @@ -0,0 +1,50 @@ +/// \file +/// Functions to maintain link cell structures for fast pair finding. + +#ifndef __LINK_CELLS_H_ +#define __LINK_CELLS_H_ + +#include "mytype.h" + +/// The maximum number of atoms that can be stored in a link cell. +#define MAXATOMS 64 + +struct DomainSt; +struct AtomsSt; + +/// Link cell data. For convenience, we keep a copy of the localMin and +/// localMax coordinates that are also found in the DomainsSt. +typedef struct LinkCellSt +{ + int gridSize[3]; //!< number of boxes in each dimension on processor + int nLocalBoxes; //!< total number of local boxes on processor + int nHaloBoxes; //!< total number of remote halo/ghost boxes on processor + int nTotalBoxes; //!< total number of boxes on processor + //!< nLocalBoxes + nHaloBoxes + real3 localMin; //!< minimum local bounds on processor + real3 localMax; //!< maximum local bounds on processor + real3 boxSize; //!< size of box in each dimension + real3 invBoxSize; //!< inverse size of box in each dimension + + int* nAtoms; //!< total number of atoms in each box +} LinkCell; + +LinkCell* initLinkCells(const struct DomainSt* domain, real_t cutoff); +void destroyLinkCells(LinkCell** boxes); + +int getNeighborBoxes(LinkCell* boxes, int iBox, int* nbrBoxes); +void putAtomInBox(LinkCell* boxes, struct AtomsSt* atoms, + const int gid, const int iType, + const real_t x, const real_t y, const real_t z, + const real_t px, const real_t py, const real_t pz); +int getBoxFromTuple(LinkCell* boxes, int x, int y, int z); + +void moveAtom(LinkCell* boxes, struct AtomsSt* atoms, int iId, int iBox, int jBox); + +/// Update link cell data structures when the atoms have moved. +void updateLinkCells(LinkCell* boxes, struct AtomsSt* atoms); + +int maxOccupancy(LinkCell* boxes); + + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/linkCells.c @@ -0,0 +1,457 @@ +/// \file +/// Functions to maintain link cell structures for fast pair finding. +/// +/// In CoMD 1.1, atoms are stored in link cells. Link cells are widely +/// used in classical MD to avoid an O(N^2) search for atoms that +/// interact. Link cells are formed by subdividing the local spatial +/// domain with a Cartesian grid where the grid spacing in each +/// direction is at least as big as he potential's cutoff distance. +/// Because atoms don't interact beyond the potential cutoff, for an +/// atom iAtom in any given link cell, we can be certain that all atoms +/// that interact with iAtom are contained in the same link cell, or one +/// of the 26 neighboring link cells. +/// +/// CoMD chooses the link cell size (boxSize) on each axis to be the +/// shortest possible distance, longer than cutoff, such that the local +/// domain size divided by boxSize is an integer. I.e., the link cells +/// are commensurate with with the local domain size. While this does +/// not result in the smallest possible link cells, it does allow us to +/// keep a strict separation between the link cells that are entirely +/// inside the local domain and those that represent halo regions. +/// +/// The number of local link cells in each direction is stored in +/// gridSize. Local link cells have 3D grid coordinates (ix, iy, iz) +/// where ix, iy, and iz can range from 0 to gridSize[iAxis]-1, +/// whiere iAxis is 0 for x, 1 for y and 2 for the z direction. The +/// number of local link cells is thus nLocalBoxes = +/// gridSize[0]*gridSize[1]*gridSize[2]. +/// +/// The local link cells are surrounded by one complete shell of halo +/// link cells. The halo cells provide temporary storage for halo or +/// "ghost" atoms that belong to other tasks, but whose coordinates are +/// needed locally to complete the force calculation. Halo link cells +/// have at least one coordinate with a value of either -1 or +/// gridSize[iAxis]. +/// +/// Because CoMD stores data in ordinary 1D C arrays, a mapping is +/// needed from the 3D grid coords to a 1D array index. For the local +/// cells we use the conventional mapping ix + iy*nx + iz*nx*ny. This +/// keeps all of the local cells in a contiguous region of memory +/// starting from the beginning of any relevant array and makes it easy +/// to iterate the local cells in a single loop. Halo cells are mapped +/// differently. After the local cells, the two planes of link cells +/// that are face neighbors with local cells across the -x or +x axis +/// are next. These are followed by face neighbors across the -y and +y +/// axis (including cells that are y-face neighbors with an x-plane of +/// halo cells), followed by all remaining cells in the -z and +z planes +/// of halo cells. The total number of link cells (on each rank) is +/// nTotalBoxes. +/// +/// Data storage arrays that are used in association with link cells +/// should be allocated to store nTotalBoxes*MAXATOMS items. Data for +/// the first atom in linkCell iBox is stored at index iBox*MAXATOMS. +/// Data for subsequent atoms in the same link cell are stored +/// sequentially, and the number of atoms in link cell iBox is +/// nAtoms[iBox]. +/// +/// \see getBoxFromTuple is the 3D->1D mapping for link cell indices. +/// \see getTuple is the 1D->3D mapping +/// +/// \param [in] cutoff The cutoff distance of the potential. + +#include "linkCells.h" + +#include +#include +#include +#include + +#include "parallel.h" +#include "memUtils.h" +#include "decomposition.h" +#include "performanceTimers.h" +#include "CoMDTypes.h" + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +static void copyAtom(LinkCell* boxes, Atoms* atoms, int iAtom, int iBox, int jAtom, int jBox); +static int getBoxFromCoord(LinkCell* boxes, real_t rr[3]); +static void emptyHaloCells(LinkCell* boxes); +static void getTuple(LinkCell* boxes, int iBox, int* ixp, int* iyp, int* izp); + +LinkCell* initLinkCells(const Domain* domain, real_t cutoff) +{ + assert(domain); + LinkCell* ll = comdMalloc(sizeof(LinkCell)); + + for (int i = 0; i < 3; i++) + { + ll->localMin[i] = domain->localMin[i]; + ll->localMax[i] = domain->localMax[i]; + ll->gridSize[i] = domain->localExtent[i] / cutoff; // local number of boxes + ll->boxSize[i] = domain->localExtent[i] / ((real_t) ll->gridSize[i]); + ll->invBoxSize[i] = 1.0/ll->boxSize[i]; + } + + ll->nLocalBoxes = ll->gridSize[0] * ll->gridSize[1] * ll->gridSize[2]; + + ll->nHaloBoxes = 2 * ((ll->gridSize[0] + 2) * + (ll->gridSize[1] + ll->gridSize[2] + 2) + + (ll->gridSize[1] * ll->gridSize[2])); + + ll->nTotalBoxes = ll->nLocalBoxes + ll->nHaloBoxes; + + ll->nAtoms = comdMalloc(ll->nTotalBoxes*sizeof(int)); + for (int iBox=0; iBoxnTotalBoxes; ++iBox) + ll->nAtoms[iBox] = 0; + + assert ( (ll->gridSize[0] >= 2) && (ll->gridSize[1] >= 2) && (ll->gridSize[2] >= 2) ); + return ll; +} + +void destroyLinkCells(LinkCell** boxes) +{ + if (! boxes) return; + if (! *boxes) return; + + comdFree((*boxes)->nAtoms); + comdFree(*boxes); + *boxes = NULL; + + return; +} + +/// \details +/// Populates the nbrBoxes array with the 27 boxes that are adjacent to +/// iBox. The count is 27 instead of 26 because iBox is included in the +/// list (as neighbor 13). Caller is responsible to alloc and free +/// nbrBoxes. +/// \return The number of nbr boxes (always 27 in this implementation). +int getNeighborBoxes(LinkCell* boxes, int iBox, int* nbrBoxes) +{ + int ix, iy, iz; + getTuple(boxes, iBox, &ix, &iy, &iz); + + int count = 0; + for (int i=ix-1; i<=ix+1; i++) + for (int j=iy-1; j<=iy+1; j++) + for (int k=iz-1; k<=iz+1; k++) + nbrBoxes[count++] = getBoxFromTuple(boxes,i,j,k); + + return count; +} + +/// \details +/// Finds the appropriate link cell for an atom based on the spatial +/// coordinates and stores data in that link cell. +/// \param [in] gid The global of the atom. +/// \param [in] iType The species index of the atom. +/// \param [in] x The x-coordinate of the atom. +/// \param [in] y The y-coordinate of the atom. +/// \param [in] z The z-coordinate of the atom. +/// \param [in] px The x-component of the atom's momentum. +/// \param [in] py The y-component of the atom's momentum. +/// \param [in] pz The z-component of the atom's momentum. +void putAtomInBox(LinkCell* boxes, Atoms* atoms, + const int gid, const int iType, + const real_t x, const real_t y, const real_t z, + const real_t px, const real_t py, const real_t pz) +{ + real_t xyz[3] = {x,y,z}; + + // Find correct box. + int iBox = getBoxFromCoord(boxes, xyz); + int iOff = iBox*MAXATOMS; + iOff += boxes->nAtoms[iBox]; + + // assign values to array elements + if (iBox < boxes->nLocalBoxes) + atoms->nLocal++; + boxes->nAtoms[iBox]++; + atoms->gid[iOff] = gid; + atoms->iSpecies[iOff] = iType; + + atoms->r[iOff][0] = x; + atoms->r[iOff][1] = y; + atoms->r[iOff][2] = z; + + atoms->p[iOff][0] = px; + atoms->p[iOff][1] = py; + atoms->p[iOff][2] = pz; +} + +/// Calculates the link cell index from the grid coords. The valid +/// coordinate range in direction ii is [-1, gridSize[ii]]. Any +/// coordinate that involves a -1 or gridSize[ii] is a halo link cell. +/// Because of the order in which the local and halo link cells are +/// stored the indices of the halo cells are special cases. +/// \see initLinkCells for an explanation of storage order. +int getBoxFromTuple(LinkCell* boxes, int ix, int iy, int iz) +{ + int iBox = 0; + const int* gridSize = boxes->gridSize; // alias + + // Halo in Z+ + if (iz == gridSize[2]) + { + iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + 2*gridSize[2]*(gridSize[0]+2) + + (gridSize[0]+2)*(gridSize[1]+2) + (gridSize[0]+2)*(iy+1) + (ix+1); + } + // Halo in Z- + else if (iz == -1) + { + iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + 2*gridSize[2]*(gridSize[0]+2) + + (gridSize[0]+2)*(iy+1) + (ix+1); + } + // Halo in Y+ + else if (iy == gridSize[1]) + { + iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + gridSize[2]*(gridSize[0]+2) + + (gridSize[0]+2)*iz + (ix+1); + } + // Halo in Y- + else if (iy == -1) + { + iBox = boxes->nLocalBoxes + 2*gridSize[2]*gridSize[1] + iz*(gridSize[0]+2) + (ix+1); + } + // Halo in X+ + else if (ix == gridSize[0]) + { + iBox = boxes->nLocalBoxes + gridSize[1]*gridSize[2] + iz*gridSize[1] + iy; + } + // Halo in X- + else if (ix == -1) + { + iBox = boxes->nLocalBoxes + iz*gridSize[1] + iy; + } + // local link celll. + else + { + iBox = ix + gridSize[0]*iy + gridSize[0]*gridSize[1]*iz; + } + assert(iBox >= 0); + assert(iBox < boxes->nTotalBoxes); + + return iBox; +} + +/// Move an atom from one link cell to another. +/// \param iId [in] The index with box iBox of the atom to be moved. +/// \param iBox [in] The index of the link cell the particle is moving from. +/// \param jBox [in] The index of the link cell the particle is moving to. +void moveAtom(LinkCell* boxes, Atoms* atoms, int iId, int iBox, int jBox) +{ + int nj = boxes->nAtoms[jBox]; + copyAtom(boxes, atoms, iId, iBox, nj, jBox); + boxes->nAtoms[jBox]++; + + assert(boxes->nAtoms[jBox] < MAXATOMS); + + boxes->nAtoms[iBox]--; + int ni = boxes->nAtoms[iBox]; + if (ni) copyAtom(boxes, atoms, ni, iBox, iId, iBox); + + if (jBox > boxes->nLocalBoxes) + --atoms->nLocal; + + return; +} + +/// \details +/// This is the first step in returning data structures to a consistent +/// state after the atoms move each time step. First we discard all +/// atoms in the halo link cells. These are all atoms that are +/// currently stored on other ranks and so any information we have about +/// them is stale. Next, we move any atoms that have crossed link cell +/// boundaries into their new link cells. It is likely that some atoms +/// will be moved into halo link cells. Since we have deleted halo +/// atoms from other tasks, it is clear that any atoms that are in halo +/// cells at the end of this routine have just transitioned from local +/// to halo atoms. Such atom must be sent to other tasks by a halo +/// exchange to avoid being lost. +/// \see redistributeAtoms +void updateLinkCells(LinkCell* boxes, Atoms* atoms) +{ + emptyHaloCells(boxes); + + for (int iBox=0; iBoxnLocalBoxes; ++iBox) + { + int iOff = iBox*MAXATOMS; + int ii=0; + while (ii < boxes->nAtoms[iBox]) + { + int jBox = getBoxFromCoord(boxes, atoms->r[iOff+ii]); + if (jBox != iBox) + moveAtom(boxes, atoms, ii, iBox, jBox); + else + ++ii; + } + } +} + +/// \return The largest number of atoms in any link cell. +int maxOccupancy(LinkCell* boxes) +{ + int localMax = 0; + for (int ii=0; iinLocalBoxes; ++ii) + localMax = MAX(localMax, boxes->nAtoms[ii]); + + int globalMax; + + startTimer(commReduceTimer); + maxIntParallel(&localMax, &globalMax, 1); + stopTimer(commReduceTimer); + + return globalMax; +} + +/// Copy atom iAtom in link cell iBox to atom jAtom in link cell jBox. +/// Any data at jAtom, jBox is overwritten. This routine can be used to +/// re-order atoms within a link cell. +void copyAtom(LinkCell* boxes, Atoms* atoms, int iAtom, int iBox, int jAtom, int jBox) +{ + const int iOff = MAXATOMS*iBox+iAtom; + const int jOff = MAXATOMS*jBox+jAtom; + atoms->gid[jOff] = atoms->gid[iOff]; + atoms->iSpecies[jOff] = atoms->iSpecies[iOff]; + memcpy(atoms->r[jOff], atoms->r[iOff], sizeof(real3)); + memcpy(atoms->p[jOff], atoms->p[iOff], sizeof(real3)); + memcpy(atoms->f[jOff], atoms->f[iOff], sizeof(real3)); + memcpy(atoms->U+jOff, atoms->U+iOff, sizeof(real_t)); +} + +/// Get the index of the link cell that contains the specified +/// coordinate. This can be either a halo or a local link cell. +/// +/// Because the rank ownership of an atom is strictly determined by the +/// atom's position, we need to take care that all ranks will agree which +/// rank owns an atom. The conditionals at the end of this function are +/// special care to ensure that all ranks make compatible link cell +/// assignments for atoms that are near a link cell boundaries. If no +/// ranks claim an atom in a local cell it will be lost. If multiple +/// ranks claim an atom it will be duplicated. +int getBoxFromCoord(LinkCell* boxes, real_t rr[3]) +{ + const real_t* localMin = boxes->localMin; // alias + const real_t* localMax = boxes->localMax; // alias + const int* gridSize = boxes->gridSize; // alias + int ix = (int)(floor((rr[0] - localMin[0])*boxes->invBoxSize[0])); + int iy = (int)(floor((rr[1] - localMin[1])*boxes->invBoxSize[1])); + int iz = (int)(floor((rr[2] - localMin[2])*boxes->invBoxSize[2])); + + + // For each axis, if we are inside the local domain, make sure we get + // a local link cell. Otherwise, make sure we get a halo link cell. + if (rr[0] < localMax[0]) + { + if (ix == gridSize[0]) ix = gridSize[0] - 1; + } + else + ix = gridSize[0]; // assign to halo cell + if (rr[1] < localMax[1]) + { + if (iy == gridSize[1]) iy = gridSize[1] - 1; + } + else + iy = gridSize[1]; + if (rr[2] < localMax[2]) + { + if (iz == gridSize[2]) iz = gridSize[2] - 1; + } + else + iz = gridSize[2]; + + return getBoxFromTuple(boxes, ix, iy, iz); +} + +/// Set the number of atoms to zero in all halo link cells. +void emptyHaloCells(LinkCell* boxes) +{ + for (int ii=boxes->nLocalBoxes; iinTotalBoxes; ++ii) + boxes->nAtoms[ii] = 0; +} + +/// Get the grid coordinates of the link cell with index iBox. Local +/// cells are easy as they use a standard 1D->3D mapping. Halo cell are +/// special cases. +/// \see initLinkCells for information on link cell order. +/// \param [in] iBox Index to link cell for which tuple is needed. +/// \param [out] ixp x grid coord of link cell. +/// \param [out] iyp y grid coord of link cell. +/// \param [out] izp z grid coord of link cell. +void getTuple(LinkCell* boxes, int iBox, int* ixp, int* iyp, int* izp) +{ + int ix, iy, iz; + const int* gridSize = boxes->gridSize; // alias + + // If a local box + if( iBox < boxes->nLocalBoxes) + { + ix = iBox % gridSize[0]; + iBox /= gridSize[0]; + iy = iBox % gridSize[1]; + iz = iBox / gridSize[1]; + } + // It's a halo box + else + { + int ink; + ink = iBox - boxes->nLocalBoxes; + if (ink < 2*gridSize[1]*gridSize[2]) + { + if (ink < gridSize[1]*gridSize[2]) + { + ix = 0; + } + else + { + ink -= gridSize[1]*gridSize[2]; + ix = gridSize[0] + 1; + } + iy = 1 + ink % gridSize[1]; + iz = 1 + ink / gridSize[1]; + } + else if (ink < (2 * gridSize[2] * (gridSize[1] + gridSize[0] + 2))) + { + ink -= 2 * gridSize[2] * gridSize[1]; + if (ink < ((gridSize[0] + 2) *gridSize[2])) + { + iy = 0; + } + else + { + ink -= (gridSize[0] + 2) * gridSize[2]; + iy = gridSize[1] + 1; + } + ix = ink % (gridSize[0] + 2); + iz = 1 + ink / (gridSize[0] + 2); + } + else + { + ink -= 2 * gridSize[2] * (gridSize[1] + gridSize[0] + 2); + if (ink < ((gridSize[0] + 2) * (gridSize[1] + 2))) + { + iz = 0; + } + else + { + ink -= (gridSize[0] + 2) * (gridSize[1] + 2); + iz = gridSize[2] + 1; + } + ix = ink % (gridSize[0] + 2); + iy = ink / (gridSize[0] + 2); + } + + // Calculated as off by 1 + ix--; + iy--; + iz--; + } + + *ixp = ix; + *iyp = iy; + *izp = iz; +} + + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.h @@ -0,0 +1,11 @@ +/// \file +/// Computes forces for the 12-6 Lennard Jones (LJ) potential. + +#ifndef _LJTYPES_H_ +#define _LJTYPES_H_ + +struct BasePotentialSt; +struct BasePotentialSt* initLjPot(void); + +#endif + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/ljForce.c @@ -0,0 +1,235 @@ +/// \file +/// Computes forces for the 12-6 Lennard Jones (LJ) potential. +/// +/// The Lennard-Jones model is not a good representation for the +/// bonding in copper, its use has been limited to constant volume +/// simulations where the embedding energy contribution to the cohesive +/// energy is not included in the two-body potential +/// +/// The parameters here are taken from Wolf and Phillpot and fit to the +/// room temperature lattice constant and the bulk melt temperature +/// Ref: D. Wolf and S.Yip eds. Materials Interfaces (Chapman & Hall +/// 1992) Page 230. +/// +/// Notes on LJ: +/// +/// http://en.wikipedia.org/wiki/Lennard_Jones_potential +/// +/// The total inter-atomic potential energy in the LJ model is: +/// +/// \f[ +/// E_{tot} = \sum_{ij} U_{LJ}(r_{ij}) +/// \f] +/// \f[ +/// U_{LJ}(r_{ij}) = 4 \epsilon +/// \left\{ \left(\frac{\sigma}{r_{ij}}\right)^{12} +/// - \left(\frac{\sigma}{r_{ij}}\right)^6 \right\} +/// \f] +/// +/// where \f$\epsilon\f$ and \f$\sigma\f$ are the material parameters in the potential. +/// - \f$\epsilon\f$ = well depth +/// - \f$\sigma\f$ = hard sphere diameter +/// +/// To limit the interation range, the LJ potential is typically +/// truncated to zero at some cutoff distance. A common choice for the +/// cutoff distance is 2.5 * \f$\sigma\f$. +/// This implementation can optionally shift the potential slightly +/// upward so the value of the potential is zero at the cuotff +/// distance. This shift has no effect on the particle dynamics. +/// +/// +/// The force on atom i is given by +/// +/// \f[ +/// F_i = -\nabla_i \sum_{jk} U_{LJ}(r_{jk}) +/// \f] +/// +/// where the subsrcipt i on the gradient operator indicates that the +/// derivatives are taken with respect to the coordinates of atom i. +/// Liberal use of the chain rule leads to the expression +/// +/// \f{eqnarray*}{ +/// F_i &=& - \sum_j U'_{LJ}(r_{ij})\hat{r}_{ij}\\ +/// &=& \sum_j 24 \frac{\epsilon}{r_{ij}} \left\{ 2 \left(\frac{\sigma}{r_{ij}}\right)^{12} +/// - \left(\frac{\sigma}{r_{ij}}\right)^6 \right\} \hat{r}_{ij} +/// \f} +/// +/// where \f$\hat{r}_{ij}\f$ is a unit vector in the direction from atom +/// i to atom j. +/// +/// + +#include "ljForce.h" + +#include +#include +#include + +#include "constants.h" +#include "mytype.h" +#include "parallel.h" +#include "linkCells.h" +#include "memUtils.h" +#include "CoMDTypes.h" + +#define POT_SHIFT 1.0 + +/// Derived struct for a Lennard Jones potential. +/// Polymorphic with BasePotential. +/// \see BasePotential +typedef struct LjPotentialSt +{ + real_t cutoff; //!< potential cutoff distance in Angstroms + real_t mass; //!< mass of atoms in intenal units + real_t lat; //!< lattice spacing (angs) of unit cell + char latticeType[8]; //!< lattice type, e.g. FCC, BCC, etc. + char name[3]; //!< element name + int atomicNo; //!< atomic number + int (*force)(SimFlat* s); //!< function pointer to force routine + void (*print)(FILE* file, BasePotential* pot); + void (*destroy)(BasePotential** pot); //!< destruction of the potential + real_t sigma; + real_t epsilon; +} LjPotential; + +static int ljForce(SimFlat* s); +static void ljPrint(FILE* file, BasePotential* pot); + +void ljDestroy(BasePotential** inppot) +{ + if ( ! inppot ) return; + LjPotential* pot = (LjPotential*)(*inppot); + if ( ! pot ) return; + comdFree(pot); + *inppot = NULL; + + return; +} + +/// Initialize an Lennard Jones potential for Copper. +BasePotential* initLjPot(void) +{ + LjPotential *pot = (LjPotential*)comdMalloc(sizeof(LjPotential)); + pot->force = ljForce; + pot->print = ljPrint; + pot->destroy = ljDestroy; + pot->sigma = 2.315; // Angstrom + pot->epsilon = 0.167; // eV + pot->mass = 63.55 * amuToInternalMass; // Atomic Mass Units (amu) + + pot->lat = 3.615; // Equilibrium lattice const in Angs + strcpy(pot->latticeType, "FCC"); // lattice type, i.e. FCC, BCC, etc. + pot->cutoff = 2.5*pot->sigma; // Potential cutoff in Angs + + strcpy(pot->name, "Cu"); + pot->atomicNo = 29; + + return (BasePotential*) pot; +} + +void ljPrint(FILE* file, BasePotential* pot) +{ + LjPotential* ljPot = (LjPotential*) pot; + fprintf(file, " Potential type : Lennard-Jones\n"); + fprintf(file, " Species name : %s\n", ljPot->name); + fprintf(file, " Atomic number : %d\n", ljPot->atomicNo); + fprintf(file, " Mass : "FMT1" amu\n", ljPot->mass / amuToInternalMass); // print in amu + fprintf(file, " Lattice Type : %s\n", ljPot->latticeType); + fprintf(file, " Lattice spacing : "FMT1" Angstroms\n", ljPot->lat); + fprintf(file, " Cutoff : "FMT1" Angstroms\n", ljPot->cutoff); + fprintf(file, " Epsilon : "FMT1" eV\n", ljPot->epsilon); + fprintf(file, " Sigma : "FMT1" Angstroms\n", ljPot->sigma); +} + +int ljForce(SimFlat* s) +{ + LjPotential* pot = (LjPotential *) s->pot; + real_t sigma = pot->sigma; + real_t epsilon = pot->epsilon; + real_t rCut = pot->cutoff; + real_t rCut2 = rCut*rCut; + + // zero forces and energy + real_t ePot = 0.0; + s->ePotential = 0.0; + int fSize = s->boxes->nTotalBoxes*MAXATOMS; + for (int ii=0; iiatoms->f[ii]); + s->atoms->U[ii] = 0.; + } + + real_t s6 = sigma*sigma*sigma*sigma*sigma*sigma; + + real_t rCut6 = s6 / (rCut2*rCut2*rCut2); + real_t eShift = POT_SHIFT * rCut6 * (rCut6 - 1.0); + + int nbrBoxes[27]; + // loop over local boxes + for (int iBox=0; iBoxboxes->nLocalBoxes; iBox++) + { + int nIBox = s->boxes->nAtoms[iBox]; + if ( nIBox == 0 ) continue; + int nNbrBoxes = getNeighborBoxes(s->boxes, iBox, nbrBoxes); + // loop over neighbors of iBox + for (int jTmp=0; jTmp=0); + + int nJBox = s->boxes->nAtoms[jBox]; + if ( nJBox == 0 ) continue; + + // loop over atoms in iBox + for (int iOff=iBox*MAXATOMS,ii=0; iiatoms->gid[iOff]; + // loop over atoms in jBox + for (int jOff=MAXATOMS*jBox,ij=0; ijatoms->gid[jOff]; + if (jBox < s->boxes->nLocalBoxes && jId <= iId ) + continue; // don't double count local-local pairs. + real_t r2 = 0.0; + for (int m=0; m<3; m++) + { + dr[m] = s->atoms->r[iOff][m]-s->atoms->r[jOff][m]; + r2+=dr[m]*dr[m]; + } + + if ( r2 > rCut2) continue; + + // Important note: + // from this point on r actually refers to 1.0/r + r2 = 1.0/r2; + real_t r6 = s6 * (r2*r2*r2); + real_t eLocal = r6 * (r6 - 1.0) - eShift; + s->atoms->U[iOff] += 0.5*eLocal; + s->atoms->U[jOff] += 0.5*eLocal; + + // calculate energy contribution based on whether + // the neighbor box is local or remote + if (jBox < s->boxes->nLocalBoxes) + ePot += eLocal; + else + ePot += 0.5 * eLocal; + + // different formulation to avoid sqrt computation + real_t fr = - 4.0*epsilon*r6*r2*(12.0*r6 - 6.0); + for (int m=0; m<3; m++) + { + s->atoms->f[iOff][m] -= dr[m]*fr; + s->atoms->f[jOff][m] += dr[m]*fr; + } + } // loop over atoms in jBox + } // loop over atoms in iBox + } // loop over neighbor boxes + } // loop over local boxes in system + + ePot = ePot*4.0*epsilon; + s->ePotential = ePot; + + return 0; +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/memUtils.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/memUtils.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/memUtils.h @@ -0,0 +1,30 @@ +/// \file +/// Wrappers for memory allocation. + +#ifndef _MEMUTILS_H_ +#define _MEMUTILS_H_ + +#include + +#define freeMe(s,element) {if(s->element) comdFree(s->element); s->element = NULL;} + +static void* comdMalloc(size_t iSize) +{ + return malloc(iSize); +} + +static void* comdCalloc(size_t num, size_t iSize) +{ + return calloc(num, iSize); +} + +static void* comdRealloc(void* ptr, size_t iSize) +{ + return realloc(ptr, iSize); +} + +static void comdFree(void *ptr) +{ + free(ptr); +} +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.h @@ -0,0 +1,37 @@ +/// \file +/// Handle command line arguments. + +#ifndef MYCOMMAND_H +#define MYCOMMAND_H + +#include + +/// A structure to hold the value of every run-time parameter that can +/// be read from the command line. +typedef struct CommandSt +{ + char potDir[1024]; //!< the directory where EAM potentials reside + char potName[1024]; //!< the name of the potential + char potType[1024]; //!< the type of the potential (funcfl or setfl) + int doeam; //!< a flag to determine whether we're running EAM potentials + int nx; //!< number of unit cells in x + int ny; //!< number of unit cells in y + int nz; //!< number of unit cells in z + int xproc; //!< number of processors in x direction + int yproc; //!< number of processors in y direction + int zproc; //!< number of processors in z direction + int nSteps; //!< number of time steps to run + int printRate; //!< number of steps between output + double dt; //!< time step (in femtoseconds) + double lat; //!< lattice constant (in Angstroms) + double temperature; //!< simulation initial temperature (in Kelvin) + double initialDelta; //!< magnitude of initial displacement from lattice (in Angstroms) +} Command; + +/// Process command line arguments into an easy to handle structure. +Command parseCommandLine(int argc, char** argv); + +/// Print run parameters in yaml format on the supplied output stream. +void printCmdYaml(FILE* file, Command* cmd); + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mycommand.c @@ -0,0 +1,290 @@ +/// \file +/// Handle command line arguments. + +#include "mycommand.h" + +#include +#include + +#include "cmdLineParser.h" +#include "parallel.h" +#include "mytype.h" + +/// \page pg_running_comd Running CoMD +/// +/// \section sec_command_line_options Command Line Options +/// +/// CoMD accepts a number of command line options to set the parameters +/// of the simulation. Every option has both a long form and a short +/// form. The long and short form of the arguments are entirely +/// interchangeable and may be mixed. All the arguments are independent +/// with the exception of the \--potDir, \--potName, and \--potType, +/// (short forms -d, -n, and -t) arguments which are only relevant when +/// used in conjunction with \--doeam, (-e). +/// +/// Supported options are: +/// +/// | Long Form | Short Form | Default Value | Description +/// | :------------ | :---------: | :-----------: | :---------- +/// | \--help | -h | N/A | print this message +/// | \--potDir | -d | pots | potential directory +/// | \--potName | -p | Cu_u6.eam | potential name +/// | \--potType | -t | funcfl | potential type (funcfl or setfl) +/// | \--doeam | -e | N/A | compute eam potentials (default is LJ) +/// | \--nx | -x | 20 | number of unit cells in x +/// | \--ny | -y | 20 | number of unit cells in y +/// | \--nz | -z | 20 | number of unit cells in z +/// | \--xproc | -i | 1 | number of ranks in x direction +/// | \--yproc | -j | 1 | number of ranks in y direction +/// | \--zproc | -k | 1 | number of ranks in z direction +/// | \--nSteps | -N | 100 | total number of time steps +/// | \--printRate | -n | 10 | number of steps between output +/// | \--dt | -D | 1 | time step (in fs) +/// | \--lat | -l | -1 | lattice parameter (Angstroms) +/// | \--temp | -T | 600 | initial temperature (K) +/// | \--delta | -r | 0 | initial delta (Angstroms) +/// +/// Notes: +/// +/// The negative value for the lattice parameter (such as the default +/// value, -1) is interpreted as a flag to indicate that the lattice +/// parameter should be set from the potential. All supplied potentials +/// are for copper and have a lattice constant of 3.615 +/// Angstroms. Setting the lattice parameter to any positive value will +/// override the values provided in the potential files. +/// +/// The default potential name for the funcfl potential type is +/// Cu_u6.eam (Adams potential). For the setfl type the default +/// potential name is Cu01.eam.alloy (Mishin potential). Although these +/// will yield similar dynamics, the table have a very different number +/// of entries (500 vs. 10,000 points, respectively) This may give very +/// different performance, depending on the hardware. +/// +/// The default temperature is 600K. However, when using a perfect +/// lattice the system will rapidly cool to 300K due to equipartition of +/// energy. +/// +/// +/// \subsection ssec_example_command_lines Examples +/// +/// All of the examples below assume: +/// - The current working directory contains a copy of the pots dir (or +/// a link to it). +/// - The CoMD bin directory is located in ../bin +/// +/// Running in the examples directory will satisfy these requirements. +/// +/// ------------------------------ +/// +/// The canonical base simulation, is +/// +/// $ mpirun -np 1 ../bin/CoMD-mpi +/// +/// Or, if the code was built without MPI: +/// +/// $ ../bin/CoMD-serial +/// +/// ------------------------------ +/// +/// \subsubsection cmd_examples_potential Changing Potentials +/// +/// To run with the default (Adams) EAM potential, specify -e: +/// +/// $ ../bin/CoMD-mpi -e +/// +/// ------------------------------ +/// +/// To run using the Mishin EAM potential contained in the setfl file +/// Cu01.eam.alloy. This potential uses much larger tables (10,000 +/// entries vs. 500 for the Adams potential). +/// +/// $ ../bin/CoMD-mpi -e -t setfl +/// +/// ------------------------------ +/// +/// Selecting the name of a setfl file without setting the appropriate +/// potential type +/// +/// $ ../bin/CoMD-mpi -e -p Cu01.eam.alloy +/// +/// will result in an error message: +/// +/// Only FCC Lattice type supported, not . Fatal Error. +/// +/// Instead use: +/// +/// $ ../bin/CoMD-mpi -e -t setfl -p Cu01.eam.alloy +/// +/// ------------------------------ +/// +/// \subsubsection cmd_example_struct Initial Structure Modifications +/// +/// To change the lattice constant and run with an expanded or +/// compressed lattice: +/// +/// $ ../bin/CoMD-mpi -l 3.5 +/// +/// This can be useful to test that the potential is being correctly +/// evaluated as a function of interatomic spacing (the cold +/// curve). However, due to the high degree of symmetry of a perfect +/// lattice, this type of test is unlikely to detect errors in the force +/// computation. +/// +/// ------------------------------ +/// +/// Initialize with zero temperature (zero instantaneous particle +/// velocity) but with a random displacements of the atoms (in this +/// case the maximum displacement is 0.1 Angstrom along each axis). +/// +/// $ ../bin/CoMD-mpi --delta 0.1 -T 0 +/// +/// Typical values of delta are in the range of 0.1 to 0.5 Angstroms. +/// Larger values of delta correspond to higher initial potential energy +/// which in turn produce higer temperatures as the structure +/// equilibrates. +/// +/// ------------------------------ +/// +/// +/// \subsubsection cmd_examples_scaling Scaling Examples +/// +/// Simple shell scripts that demonstrate weak and strong scaling +/// studies are provided in the examples directory. +/// +/// ------------------------------ +/// +/// Run the default global simulation size (32,000 atoms) distributed +/// over 8 cubic subdomains, an example of strong scaling. If the +/// number of processors does not equal (i*j*k) the run will abort. +/// Notice that spaces are optional between short form options and their +/// arguments. +/// +/// $ mpirun -np 8 ../bin/CoMD-mpi -i2 -j2 -k2 +/// +/// ------------------------------ +/// +/// Run a weak scaling example: the simulation is doubled in each +/// dimension from the default 20 x 20 x 20 and the number of subdomains +/// in each direction is also doubled. +/// +/// $ mpirun -np 8 ../bin/CoMD-mpi -i2 -j2 -k2 -x 40 -y 40 -z 40 +/// +/// ------------------------------ +/// +/// The same weak scaling run, but for 10,000 timesteps, with output +/// only every 100 steps. +/// +/// $ mpirun -np 8 ../bin/CoMD-mpi -i2 -j2 -k2 -x 40 -y 40 -z 40 -N 10000 -n 100 +/// + +/// \details Initialize a Command structure with default values, then +/// parse any command line arguments that were supplied to overwrite +/// defaults. +/// +/// \param [in] argc the number of command line arguments +/// \param [in] argv the command line arguments array +Command parseCommandLine(int argc, char** argv) +{ + Command cmd; + + memset(cmd.potDir, 0, 1024); + memset(cmd.potName, 0, 1024); + memset(cmd.potType, 0, 1024); + strcpy(cmd.potDir, "pots"); + strcpy(cmd.potName, "\0"); // default depends on potType + strcpy(cmd.potType, "funcfl"); + cmd.doeam = 0; + cmd.nx = 20; + cmd.ny = 20; + cmd.nz = 20; + cmd.xproc = 1; + cmd.yproc = 1; + cmd.zproc = 1; + cmd.nSteps = 100; + cmd.printRate = 10; + cmd.dt = 1.0; + cmd.lat = -1.0; + cmd.temperature = 600.0; + cmd.initialDelta = 0.0; + + int help=0; + // add arguments for processing. Please update the html documentation too! + addArg("help", 'h', 0, 'i', &(help), 0, "print this message"); + addArg("potDir", 'd', 1, 's', cmd.potDir, sizeof(cmd.potDir), "potential directory"); + addArg("potName", 'p', 1, 's', cmd.potName, sizeof(cmd.potName), "potential name"); + addArg("potType", 't', 1, 's', cmd.potType, sizeof(cmd.potType), "potential type (funcfl or setfl)"); + addArg("doeam", 'e', 0, 'i', &(cmd.doeam), 0, "compute eam potentials"); + addArg("nx", 'x', 1, 'i', &(cmd.nx), 0, "number of unit cells in x"); + addArg("ny", 'y', 1, 'i', &(cmd.ny), 0, "number of unit cells in y"); + addArg("nz", 'z', 1, 'i', &(cmd.nz), 0, "number of unit cells in z"); + addArg("xproc", 'i', 1, 'i', &(cmd.xproc), 0, "processors in x direction"); + addArg("yproc", 'j', 1, 'i', &(cmd.yproc), 0, "processors in y direction"); + addArg("zproc", 'k', 1, 'i', &(cmd.zproc), 0, "processors in z direction"); + addArg("nSteps", 'N', 1, 'i', &(cmd.nSteps), 0, "number of time steps"); + addArg("printRate", 'n', 1, 'i', &(cmd.printRate), 0, "number of steps between output"); + addArg("dt", 'D', 1, 'd', &(cmd.dt), 0, "time step (in fs)"); + addArg("lat", 'l', 1, 'd', &(cmd.lat), 0, "lattice parameter (Angstroms)"); + addArg("temp", 'T', 1, 'd', &(cmd.temperature), 0, "initial temperature (K)"); + addArg("delta", 'r', 1, 'd', &(cmd.initialDelta), 0, "initial delta (Angstroms)"); + + processArgs(argc,argv); + + // If user didn't set potName, set type dependent default. + if (strlen(cmd.potName) == 0) + { + if (strcmp(cmd.potType, "setfl" ) == 0) + strcpy(cmd.potName, "Cu01.eam.alloy"); + if (strcmp(cmd.potType, "funcfl") == 0) + strcpy(cmd.potName, "Cu_u6.eam"); + } + + if (help) + { + printArgs(); + freeArgs(); + exit(2); + } + freeArgs(); + + return cmd; +} + +void printCmdYaml(FILE* file, Command* cmd) +{ + if (! printRank()) + return; + fprintf(file, + "Command Line Parameters:\n" + " doeam: %d\n" + " potDir: %s\n" + " potName: %s\n" + " potType: %s\n" + " nx: %d\n" + " ny: %d\n" + " nz: %d\n" + " xproc: %d\n" + " yproc: %d\n" + " zproc: %d\n" + " Lattice constant: %g Angstroms\n" + " nSteps: %d\n" + " printRate: %d\n" + " Time step: %g fs\n" + " Initial Temperature: %g K\n" + " Initial Delta: %g Angstroms\n" + "\n", + cmd->doeam, + cmd->potDir, + cmd->potName, + cmd->potType, + cmd->nx, cmd->ny, cmd->nz, + cmd->xproc, cmd->yproc, cmd->zproc, + cmd->lat, + cmd->nSteps, + cmd->printRate, + cmd->dt, + cmd->temperature, + cmd->initialDelta + ); + fflush(file); +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mytype.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mytype.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/mytype.h @@ -0,0 +1,29 @@ +/// \file +/// Frequently needed typedefs. + +#ifndef __MYTYPE_H_ +#define __MYTYPE_H_ + +/// \def SINGLE determines whether single or double precision is built +#ifdef SINGLE +typedef float real_t; //!< define native type for CoMD as single precision + #define FMT1 "%g" //!< /def format argument for floats + #define EMT1 "%e" //!< /def format argument for eng floats +#else +typedef double real_t; //!< define native type for CoMD as double precision + #define FMT1 "%lg" //!< \def format argument for doubles + #define EMT1 "%le" //!< \def format argument for eng doubles +#endif + +typedef real_t real3[3]; //!< a convenience vector with three real_t + +static void zeroReal3(real3 a) +{ + a[0] = 0.0; + a[1] = 0.0; + a[2] = 0.0; +} + +#define screenOut stdout + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.h @@ -0,0 +1,66 @@ +/// \file +/// Wrappers for MPI functions. + +#ifndef _PARALLEL_H_ +#define _PARALLEL_H_ + +#include "mytype.h" + +/// Structure for use with MPI_MINLOC and MPI_MAXLOC operations. +typedef struct RankReduceDataSt +{ + double val; + int rank; +} RankReduceData; + +/// Return total number of processors. +int getNRanks(void); + +/// Return local rank. +int getMyRank(void); + +/// Return non-zero if printing occurs from this rank. +int printRank(void); + +/// Print a timestamp and message when all tasks arrive. +void timestampBarrier(const char* msg); + +/// Wrapper for MPI_Init. +void initParallel(int *argc, char ***argv); + +/// Wrapper for MPI_Finalize. +void destroyParallel(void); + +/// Wrapper for MPI_Barrier(MPI_COMM_WORLD). +void barrierParallel(void); + +/// Wrapper for MPI_Sendrecv. +int sendReceiveParallel(void* sendBuf, int sendLen, int dest, + void* recvBuf, int recvLen, int source); + +/// Wrapper for MPI_Allreduce integer sum. +void addIntParallel(int* sendBuf, int* recvBuf, int count); + +/// Wrapper for MPI_Allreduce real sum. +void addRealParallel(real_t* sendBuf, real_t* recvBuf, int count); + +/// Wrapper for MPI_Allreduce double sum. +void addDoubleParallel(double* sendBuf, double* recvBuf, int count); + +/// Wrapper for MPI_Allreduce integer max. +void maxIntParallel(int* sendBuf, int* recvBuf, int count); + +/// Wrapper for MPI_Allreduce double min with rank. +void minRankDoubleParallel(RankReduceData* sendBuf, RankReduceData* recvBuf, int count); + +/// Wrapper for MPI_Allreduce double max with rank. +void maxRankDoubleParallel(RankReduceData* sendBuf, RankReduceData* recvBuf, int count); + +/// Wrapper for MPI_Bcast +void bcastParallel(void* buf, int len, int root); + +/// Return non-zero if code was built with MPI active. +int builtWithMpi(void); + +#endif + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/parallel.c @@ -0,0 +1,201 @@ +/// \file +/// Wrappers for MPI functions. This should be the only compilation +/// unit in the code that directly calls MPI functions. To build a pure +/// serial version of the code with no MPI, do not define DO_MPI. If +/// DO_MPI is not defined then all MPI functionality is replaced with +/// equivalent single task behavior. + +#include "parallel.h" + +#ifdef DO_MPI +#include +#endif + +#include +#include +#include +#include + +static int myRank = 0; +static int nRanks = 1; + +#ifdef DO_MPI +#ifdef SINGLE +#define REAL_MPI_TYPE MPI_FLOAT +#else +#define REAL_MPI_TYPE MPI_DOUBLE +#endif + +#endif + +int getNRanks() +{ + return nRanks; +} + +int getMyRank() +{ + return myRank; +} + +/// \details +/// For now this is just a check for rank 0 but in principle it could be +/// more complex. It is also possible to suppress practically all +/// output by causing this function to return 0 for all ranks. +int printRank() +{ + if (myRank == 0) return 1; + return 0; +} + +void timestampBarrier(const char* msg) +{ + barrierParallel(); + if (! printRank()) + return; + time_t t= time(NULL); + char* timeString = ctime(&t); + timeString[24] = '\0'; // clobber newline +#ifndef PRINT_TIMING + timeString[0] = '\0'; +#endif + + fprintf(screenOut, "%s: %s\n", timeString, msg); + fflush(screenOut); +} + +void initParallel(int* argc, char*** argv) +{ +#ifdef DO_MPI + MPI_Init(argc, argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myRank); + MPI_Comm_size(MPI_COMM_WORLD, &nRanks); +#endif +} + +void destroyParallel() +{ +#ifdef DO_MPI + MPI_Finalize(); +#endif +} + +void barrierParallel() +{ +#ifdef DO_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif +} + +/// \param [in] sendBuf Data to send. +/// \param [in] sendLen Number of bytes to send. +/// \param [in] dest Rank in MPI_COMM_WORLD where data will be sent. +/// \param [out] recvBuf Received data. +/// \param [in] recvLen Maximum number of bytes to receive. +/// \param [in] source Rank in MPI_COMM_WORLD from which to receive. +/// \return Number of bytes received. +int sendReceiveParallel(void* sendBuf, int sendLen, int dest, + void* recvBuf, int recvLen, int source) +{ +#ifdef DO_MPI + int bytesReceived; + MPI_Status status; + MPI_Sendrecv(sendBuf, sendLen, MPI_BYTE, dest, 0, + recvBuf, recvLen, MPI_BYTE, source, 0, + MPI_COMM_WORLD, &status); + MPI_Get_count(&status, MPI_BYTE, &bytesReceived); + + return bytesReceived; +#else + assert(source == dest); + memcpy(recvBuf, sendBuf, sendLen); + + return sendLen; +#endif +} + +void addIntParallel(int* sendBuf, int* recvBuf, int count) +{ +#ifdef DO_MPI + MPI_Allreduce(sendBuf, recvBuf, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#else + for (int ii=0; ii + +/// Timer handles +enum TimerHandle{ + totalTimer, + loopTimer, + timestepTimer, + positionTimer, + velocityTimer, + redistributeTimer, + atomHaloTimer, + computeForceTimer, + eamHaloTimer, + commHaloTimer, + commReduceTimer, + numberOfTimers}; + +/// Use the startTimer and stopTimer macros for timers in code regions +/// that may be performance sensitive. These can be compiled away by +/// defining NTIMING. If you are placing a timer anywere outside of a +/// tight loop, consider calling profileStart and profileStop instead. +/// +/// Place calls as follows to collect time for code pieces. +/// Time is collected everytime this portion of code is executed. +/// +/// ... +/// startTimer(computeForceTimer); +/// computeForce(sim); +/// stopTimer(computeForceTimer); +/// ... +/// +#ifndef NTIMING +#define startTimer(handle) \ + do \ +{ \ + profileStart(handle); \ +} while(0) +#define stopTimer(handle) \ + do \ +{ \ + profileStop(handle); \ +} while(0) +#else +#define startTimer(handle) +#define stopTimer(handle) +#endif + +/// Use profileStart and profileStop only for timers that should *never* +/// be turned off. Typically this means they are outside the main +/// simulation loop. If the timer is inside the main loop use +/// startTimer and stopTimer instead. +void profileStart(const enum TimerHandle handle); +void profileStop(const enum TimerHandle handle); + +/// Use to get elapsed time (lap timer). +double getElapsedTime(const enum TimerHandle handle); + +/// Print timing results. +void printPerformanceResults(int nGlobalAtoms, int printRate); + +/// Print timing results to Yaml file +void printPerformanceResultsYaml(FILE* file); +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/performanceTimers.c @@ -0,0 +1,309 @@ +/// \file +/// Performance timer functions. +/// +/// Use the timer functionality to collect timing and number of calls +/// information for chosen computations (such as force calls) and +/// communication (such as sends, receives, reductions). Timing results +/// are reported at the end of the run showing overall timings and +/// statistics of timings across ranks. +/// +/// A new timer can be added as follows: +/// -# add new handle to the TimerHandle in performanceTimers.h +/// -# provide a corresponding name in timerName +/// +/// Note that the order of the handles and names must be the +/// same. This order also determines the order in which the timers are +/// printed. Names can contain leading spaces to show a hierarchical +/// ordering. Timers with zero calls are omitted from the report. +/// +/// Raw timer data is obtained from the getTime() and getTick() +/// functions. The supplied portable versions of these functions can be +/// replaced with platform specific versions for improved accuracy or +/// lower latency. +/// \see TimerHandle +/// \see getTime +/// \see getTick +/// + + +#include "performanceTimers.h" + +#include +#include +#include +#include +#include +#include + +#include "performanceTimers.h" +#include "mytype.h" +#include "parallel.h" +#include "yamlOutput.h" + +static uint64_t getTime(void); +static double getTick(void); +static void timerStats(void); + +/// You must add timer name in same order as enum in .h file. +/// Leading spaces can be specified to show a hierarchy of timers. +char* timerName[numberOfTimers] = { + "total", + "loop", + "timestep", + " position", + " velocity", + " redistribute", + " atomHalo", + " force", + " eamHalo", + "commHalo", + "commReduce" +}; + +/// Timer data collected. Also facilitates computing averages and +/// statistics. +typedef struct TimersSt +{ + uint64_t start; //!< call start time + uint64_t total; //!< current total time + uint64_t count; //!< current call count + uint64_t elapsed; //!< lap time + + int minRank; //!< rank with min value + int maxRank; //!< rank with max value + + double minValue; //!< min over ranks + double maxValue; //!< max over ranks + double average; //!< average over ranks + double stdev; //!< stdev across ranks +} Timers; + +/// Global timing data collected. +typedef struct TimerGlobalSt +{ + double atomRate; //!< average time (us) per atom per rank + double atomAllRate; //!< average time (us) per atom + double atomsPerUSec; //!< average atoms per time (us) +} TimerGlobal; + +static Timers perfTimer[numberOfTimers]; +static TimerGlobal perfGlobal; + +void profileStart(const enum TimerHandle handle) +{ + perfTimer[handle].start = getTime(); +} + +void profileStop(const enum TimerHandle handle) +{ + perfTimer[handle].count += 1; + uint64_t delta = getTime() - perfTimer[handle].start; + perfTimer[handle].total += delta; + perfTimer[handle].elapsed += delta; +} + +/// \details +/// Return elapsed time (in seconds) since last call with this handle +/// and clear for next lap. +double getElapsedTime(const enum TimerHandle handle) +{ + double etime = getTick() * (double)perfTimer[handle].elapsed; + perfTimer[handle].elapsed = 0; + + return etime; +} + +/// \details +/// The report contains two blocks. The upper block is performance +/// information for the printRank. The lower block is statistical +/// information over all ranks. +void printPerformanceResults(int nGlobalAtoms, int printRate) +{ + // Collect timer statistics overall and across ranks + timerStats(); + + if (!printRank()) + return; + + // only print timers with non-zero values. + double tick = getTick(); + double loopTime = perfTimer[loopTimer].total*tick; + + fprintf(screenOut, "\n\nTimings for Rank %d\n", getMyRank()); + fprintf(screenOut, " Timer # Calls Avg/Call (s) Total (s) %% Loop\n"); + fprintf(screenOut, "___________________________________________________________________\n"); + for (int ii=0; ii 0) + fprintf(screenOut, "%-16s%12"PRIu64" %8.4f %8.4f %8.2f\n", + timerName[ii], + perfTimer[ii].count, + totalTime/(double)perfTimer[ii].count, + totalTime, + totalTime/loopTime*100.0); + } + + fprintf(screenOut, "\nTiming Statistics Across %d Ranks:\n", getNRanks()); + fprintf(screenOut, " Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s)\n"); + fprintf(screenOut, "_____________________________________________________________________________\n"); + + for (int ii = 0; ii < numberOfTimers; ++ii) + { + if (perfTimer[ii].count > 0) + fprintf(screenOut, "%-16s%6d:%10.4f %6d:%10.4f %10.4f %10.4f\n", + timerName[ii], + perfTimer[ii].minRank, perfTimer[ii].minValue*tick, + perfTimer[ii].maxRank, perfTimer[ii].maxValue*tick, + perfTimer[ii].average*tick, perfTimer[ii].stdev*tick); + } + double atomsPerTask = nGlobalAtoms/(real_t)getNRanks(); + perfGlobal.atomRate = perfTimer[timestepTimer].average * tick * 1e6 / + (atomsPerTask * perfTimer[timestepTimer].count * printRate); + perfGlobal.atomAllRate = perfTimer[timestepTimer].average * tick * 1e6 / + (nGlobalAtoms * perfTimer[timestepTimer].count * printRate); + perfGlobal.atomsPerUSec = 1.0 / perfGlobal.atomAllRate; + + fprintf(screenOut, "\n---------------------------------------------------\n"); + fprintf(screenOut, " Average atom update rate: %6.2f us/atom/task\n", perfGlobal.atomRate); + fprintf(screenOut, "---------------------------------------------------\n\n"); + + fprintf(screenOut, "\n---------------------------------------------------\n"); + fprintf(screenOut, " Average all atom update rate: %6.2f us/atom\n", perfGlobal.atomAllRate); + fprintf(screenOut, "---------------------------------------------------\n\n"); + + fprintf(screenOut, "\n---------------------------------------------------\n"); + fprintf(screenOut, " Average atom rate: %6.2f atoms/us\n", perfGlobal.atomsPerUSec); + fprintf(screenOut, "---------------------------------------------------\n\n"); +} + +void printPerformanceResultsYaml(FILE* file) +{ + if (! printRank()) + return; + + double tick = getTick(); + double loopTime = perfTimer[loopTimer].total*tick; + + fprintf(file,"\nPerformance Results:\n"); + fprintf(file, " TotalRanks: %d\n", getNRanks()); + fprintf(file, " ReportingTimeUnits: seconds\n"); + fprintf(file, "Performance Results For Rank %d:\n", getMyRank()); + for (int ii = 0; ii < numberOfTimers; ii++) + { + if (perfTimer[ii].count > 0) + { + double totalTime = perfTimer[ii].total*tick; + fprintf(file, " Timer: %s\n", timerName[ii]); + fprintf(file, " CallCount: %"PRIu64"\n", perfTimer[ii].count); + fprintf(file, " AvgPerCall: %8.4f\n", totalTime/(double)perfTimer[ii].count); + fprintf(file, " Total: %8.4f\n", totalTime); + fprintf(file, " PercentLoop: %8.2f\n", totalTime/loopTime*100); + } + } + + fprintf(file, "Performance Results Across Ranks:\n"); + for (int ii = 0; ii < numberOfTimers; ii++) + { + if (perfTimer[ii].count > 0) + { + fprintf(file, " Timer: %s\n", timerName[ii]); + fprintf(file, " MinRank: %d\n", perfTimer[ii].minRank); + fprintf(file, " MinTime: %8.4f\n", perfTimer[ii].minValue*tick); + fprintf(file, " MaxRank: %d\n", perfTimer[ii].maxRank); + fprintf(file, " MaxTime: %8.4f\n", perfTimer[ii].maxValue*tick); + fprintf(file, " AvgTime: %8.4f\n", perfTimer[ii].average*tick); + fprintf(file, " StdevTime: %8.4f\n", perfTimer[ii].stdev*tick); + } + } + + fprintf(file,"Performance Global Update Rates:\n"); + fprintf(file, " AtomUpdateRate:\n"); + fprintf(file, " AverageRate: %6.2f\n", perfGlobal.atomRate); + fprintf(file, " Units: us/atom/task\n"); + fprintf(file, " AllAtomUpdateRate:\n"); + fprintf(file, " AverageRate: %6.2f\n", perfGlobal.atomAllRate); + fprintf(file, " Units: us/atom\n"); + fprintf(file, " AtomRate:\n"); + fprintf(file, " AverageRate: %6.2f\n", perfGlobal.atomsPerUSec); + fprintf(file, " Units: atoms/us\n"); + + fprintf(file, "\n"); +} + +/// Returns current time as a 64-bit integer. This portable version +/// returns the number of microseconds since mindight, Jamuary 1, 1970. +/// Hence, timing data will have a resolution of 1 microsecond. +/// Platforms with access to calls with lower latency or higher +/// resolution (such as a cycle counter) may wish to replace this +/// implementation and change the conversion factor in getTick as +/// appropriate. +/// \see getTick for the conversion factor between the integer time +/// units of this function and seconds. +static uint64_t getTime(void) +{ + struct timeval ptime; + uint64_t t = 0; + gettimeofday(&ptime, (struct timezone *)NULL); + t = ((uint64_t)1000000)*(uint64_t)ptime.tv_sec + (uint64_t)ptime.tv_usec; + + return t; +} + +/// Returns the factor for converting the integer time reported by +/// getTime into seconds. The portable getTime returns values in units +/// of microseconds so the conversion is simply 1e-6. +/// \see getTime +static double getTick(void) +{ + double seconds_per_cycle = 1.0e-6; + return seconds_per_cycle; +} + +/// Collect timer statistics across ranks. +void timerStats(void) +{ + double sendBuf[numberOfTimers], recvBuf[numberOfTimers]; + + // Determine average of each timer across ranks + for (int ii = 0; ii < numberOfTimers; ii++) + sendBuf[ii] = (double)perfTimer[ii].total; + addDoubleParallel(sendBuf, recvBuf, numberOfTimers); + + for (int ii = 0; ii < numberOfTimers; ii++) + perfTimer[ii].average = recvBuf[ii] / (double)getNRanks(); + + + // Determine min and max across ranks and which rank + RankReduceData reduceSendBuf[numberOfTimers], reduceRecvBuf[numberOfTimers]; + for (int ii = 0; ii < numberOfTimers; ii++) + { + reduceSendBuf[ii].val = (double)perfTimer[ii].total; + reduceSendBuf[ii].rank = getMyRank(); + } + minRankDoubleParallel(reduceSendBuf, reduceRecvBuf, numberOfTimers); + for (int ii = 0; ii < numberOfTimers; ii++) + { + perfTimer[ii].minValue = reduceRecvBuf[ii].val; + perfTimer[ii].minRank = reduceRecvBuf[ii].rank; + } + maxRankDoubleParallel(reduceSendBuf, reduceRecvBuf, numberOfTimers); + for (int ii = 0; ii < numberOfTimers; ii++) + { + perfTimer[ii].maxValue = reduceRecvBuf[ii].val; + perfTimer[ii].maxRank = reduceRecvBuf[ii].rank; + } + + // Determine standard deviation + for (int ii = 0; ii < numberOfTimers; ii++) + { + double temp = (double)perfTimer[ii].total - perfTimer[ii].average; + sendBuf[ii] = temp * temp; + } + addDoubleParallel(sendBuf, recvBuf, numberOfTimers); + for (int ii = 0; ii < numberOfTimers; ii++) + { + perfTimer[ii].stdev = sqrt(recvBuf[ii] / (double) getNRanks()); + } +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.h @@ -0,0 +1,20 @@ +/// \file +/// Simple random number generators for uniform and gaussian +/// distributions. + +#ifndef __RANDOM_H +#define __RANDOM_H + +#include "mytype.h" +#include + +/// Return a random number from a Gaussian distribution. +real_t gasdev(uint64_t* seed); + +/// Return a random number from a uniform distribution. +double lcg61(uint64_t* seed); + +/// Return a seed suitable for calling lcg61 or gasdev. +uint64_t mkSeed(uint32_t id, uint32_t callSite); + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/random.c @@ -0,0 +1,75 @@ +/// \file +/// Simple random number generators for uniform and Gaussian +/// distributions. The generator in lcg61 and the hash in mkSeed aren't +/// really industrial strength, but they're more than good enough for +/// present purposes. + +#include "random.h" + +#include + +/// \details +/// Use the Box-Muller method to sample a Gaussian distribution with +/// zero mean and unit variance. To ensure the same input seed always +/// generates the same returned value we do not use the standard +/// technique of saving one of the two generated randoms for the next +/// call. +/// +/// \param [in,out] seed Seed for generator. +/// +/// \return A pseudo-random number in the interval (-infinity, infinity). +real_t gasdev(uint64_t* seed) +{ + real_t rsq,v1,v2; + do + { + v1 = 2.0*lcg61(seed)-1.0; + v2 = 2.0*lcg61(seed)-1.0; + rsq = v1*v1+v2*v2; + } while (rsq >= 1.0 || rsq == 0.0); + + return v2 * sqrt(-2.0*log(rsq)/rsq); +} + +/// \details +/// A 61-bit prime modulus linear congruential generator with +/// modulus = 2^61 -1. +/// +/// \param [in,out] seed Seed for generator. +/// +/// \return A pseudo-random number in the interval [0, 1]. +double lcg61(uint64_t* seed) +{ + static const double convertToDouble = 1.0/UINT64_C(2305843009213693951); + + *seed *= UINT64_C(437799614237992725); + *seed %= UINT64_C(2305843009213693951); + + return *seed*convertToDouble; +} + +/// \details +/// Forms a 64-bit seed for lcg61 from the combination of 2 32-bit Knuth +/// multiplicative hashes, then runs off 10 values to pass up the worst +/// of the early low-bit correlations. +/// +/// \param [in] id An id number such as an atom gid that is unique to +/// each entity that requires random numbers. +/// +/// \param [in] callSite A unique number for each call site in the code +/// that needs to generate random seeds. Using a different value for +/// callSite allows different parts of the code to obtain different +/// random streams for the same id. +/// +/// \return A 64-bit seed that is unique to the id and call site. +uint64_t mkSeed(uint32_t id, uint32_t callSite) +{ + uint32_t s1 = id * UINT32_C(2654435761); + uint32_t s2 = (id+callSite) * UINT32_C(2654435761); + + uint64_t iSeed = (UINT64_C(0x100000000) * s1) + s2; + for (unsigned jj=0; jj<10; ++jj) + lcg61(&iSeed); + + return iSeed; +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.h @@ -0,0 +1,16 @@ +/// \file +/// Leapfrog time integrator + +#ifndef __LEAPFROG_H +#define __LEAPFROG_H + +#include "CoMDTypes.h" + +double timestep(SimFlat* s, int n, real_t dt); +void computeForce(SimFlat* s); +void kineticEnergy(SimFlat* s); + +/// Update local and remote link cells after atoms have moved. +void redistributeAtoms(struct SimFlatSt* sim); + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/timestep.c @@ -0,0 +1,146 @@ +/// \file +/// Leapfrog time integrator + +#include "timestep.h" + +#include "CoMDTypes.h" +#include "linkCells.h" +#include "parallel.h" +#include "performanceTimers.h" + +static void advanceVelocity(SimFlat* s, int nBoxes, real_t dt); +static void advancePosition(SimFlat* s, int nBoxes, real_t dt); + + +/// Advance the simulation time to t+dt using a leap frog method +/// (equivalent to velocity verlet). +/// +/// Forces must be computed before calling the integrator the first time. +/// +/// - Advance velocities half time step using forces +/// - Advance positions full time step using velocities +/// - Update link cells and exchange remote particles +/// - Compute forces +/// - Update velocities half time step using forces +/// +/// This leaves positions, velocities, and forces at t+dt, with the +/// forces ready to perform the half step velocity update at the top of +/// the next call. +/// +/// After nSteps the kinetic energy is computed for diagnostic output. +double timestep(SimFlat* s, int nSteps, real_t dt) +{ + for (int ii=0; iiboxes->nLocalBoxes, 0.5*dt); + stopTimer(velocityTimer); + + startTimer(positionTimer); + advancePosition(s, s->boxes->nLocalBoxes, dt); + stopTimer(positionTimer); + + startTimer(redistributeTimer); + redistributeAtoms(s); + stopTimer(redistributeTimer); + + startTimer(computeForceTimer); + computeForce(s); + stopTimer(computeForceTimer); + + startTimer(velocityTimer); + advanceVelocity(s, s->boxes->nLocalBoxes, 0.5*dt); + stopTimer(velocityTimer); + } + + kineticEnergy(s); + + return s->ePotential; +} + +void computeForce(SimFlat* s) +{ + s->pot->force(s); +} + + +void advanceVelocity(SimFlat* s, int nBoxes, real_t dt) +{ + for (int iBox=0; iBoxboxes->nAtoms[iBox]; ii++,iOff++) + { + s->atoms->p[iOff][0] += dt*s->atoms->f[iOff][0]; + s->atoms->p[iOff][1] += dt*s->atoms->f[iOff][1]; + s->atoms->p[iOff][2] += dt*s->atoms->f[iOff][2]; + } + } +} + +void advancePosition(SimFlat* s, int nBoxes, real_t dt) +{ + for (int iBox=0; iBoxboxes->nAtoms[iBox]; ii++,iOff++) + { + int iSpecies = s->atoms->iSpecies[iOff]; + real_t invMass = 1.0/s->species[iSpecies].mass; + s->atoms->r[iOff][0] += dt*s->atoms->p[iOff][0]*invMass; + s->atoms->r[iOff][1] += dt*s->atoms->p[iOff][1]*invMass; + s->atoms->r[iOff][2] += dt*s->atoms->p[iOff][2]*invMass; + } + } +} + +/// Calculates total kinetic and potential energy across all tasks. The +/// local potential energy is a by-product of the force routine. +void kineticEnergy(SimFlat* s) +{ + real_t eLocal[2]; + eLocal[0] = s->ePotential; + eLocal[1] = 0; + for (int iBox=0; iBoxboxes->nLocalBoxes; iBox++) + { + for (int iOff=MAXATOMS*iBox,ii=0; iiboxes->nAtoms[iBox]; ii++,iOff++) + { + int iSpecies = s->atoms->iSpecies[iOff]; + real_t invMass = 0.5/s->species[iSpecies].mass; + eLocal[1] += ( s->atoms->p[iOff][0] * s->atoms->p[iOff][0] + + s->atoms->p[iOff][1] * s->atoms->p[iOff][1] + + s->atoms->p[iOff][2] * s->atoms->p[iOff][2] )*invMass; + } + } + + real_t eSum[2]; + startTimer(commReduceTimer); + addRealParallel(eLocal, eSum, 2); + stopTimer(commReduceTimer); + + s->ePotential = eSum[0]; + s->eKinetic = eSum[1]; +} + +/// \details +/// This function provides one-stop shopping for the sequence of events +/// that must occur for a proper exchange of halo atoms after the atom +/// positions have been updated by the integrator. +/// +/// - updateLinkCells: Since atoms have moved, some may be in the wrong +/// link cells. +/// - haloExchange (atom version): Sends atom data to remote tasks. +/// - sort: Sort the atoms. +/// +/// \see updateLinkCells +/// \see initAtomHaloExchange +/// \see sortAtomsInCell +void redistributeAtoms(SimFlat* sim) +{ + updateLinkCells(sim->boxes, sim->atoms); + + startTimer(atomHaloTimer); + haloExchange(sim->atomExchange, sim); + stopTimer(atomHaloTimer); + + for (int ii=0; iiboxes->nTotalBoxes; ++ii) + sortAtomsInCell(sim->atoms, sim->boxes, ii); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.h @@ -0,0 +1,19 @@ +/// \file +/// Write simulation information in YAML format. + +#ifndef __YAML_OUTPUT_H +#define __YAML_OUTPUT_H + +#include + +/// Provide access to the YAML file in other compilation units. +extern FILE* yamlFile; + +void yamlBegin(void); +void yamlEnd(void); + +void yamlAppInfo(FILE* file); + +void printSeparator(FILE* file); + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/yamlOutput.c @@ -0,0 +1,108 @@ +/// \file +/// Write simulation information in YAML format. +/// +/// Information regarding platform, run parameters, performance, etc., +/// are written to a file whose name is generated from the CoMDVariant +/// and the time of the run. This provides a simple mechanism to track +/// and compare performance etc. +/// +/// There are much more sophisticated libraries and routines available +/// to handle YAML, but this simple implemenation handles everything we +/// really need. + +#include "yamlOutput.h" + +#include +#include +#include + +#include "CoMD_info.h" +#include "mytype.h" +#include "parallel.h" + +FILE* yamlFile = NULL; + +static const char* CoMDVersion = "1.1"; +static const char* CoMDVariant = CoMD_VARIANT; + +static void getTimeString(char* timestring) +{ +#ifndef PRINT_TIMING + *timestring = '\0'; + return; +#endif + + time_t rawtime; + struct tm* timeinfo; + time(&rawtime); + timeinfo = localtime(&rawtime); + + sprintf(timestring, + "%4d-%02i-%02d, %02d:%02d:%02d", + timeinfo->tm_year+1900, + timeinfo->tm_mon+1, + timeinfo->tm_mday, + timeinfo->tm_hour, + timeinfo->tm_min, + timeinfo->tm_sec); +} + +void yamlBegin(void) +{ + if (! printRank()) + return; + + char filename[64]; + time_t rawtime; + time (&rawtime); + struct tm* ptm = localtime(&rawtime); + char sdate[25]; + //use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12 + sprintf (sdate,"%04d:%02d:%02d-%02d:%02d:%02d", + ptm->tm_year + 1900, ptm->tm_mon+1, + ptm->tm_mday, ptm->tm_hour, ptm->tm_min,ptm->tm_sec); + sprintf(filename, "%s.%s.yaml", CoMDVariant, sdate); + yamlFile = fopen(filename, "w"); +} + +void yamlAppInfo(FILE* file) +{ + if (! printRank()) + return; + printSeparator(file); + fprintf(file,"Mini-Application Name : %s\n", CoMDVariant); + fprintf(file,"Mini-Application Version : %s\n", CoMDVersion); + fprintf(file,"Platform:\n"); + fprintf(file," hostname: %s\n", CoMD_HOSTNAME); + fprintf(file," kernel name: %s\n", CoMD_KERNEL_NAME); + fprintf(file," kernel release: %s\n", CoMD_KERNEL_RELEASE); + fprintf(file," processor: %s\n", CoMD_PROCESSOR); + fprintf(file,"Build:\n"); + fprintf(file," CC: %s\n", CoMD_COMPILER); + fprintf(file," compiler version: %s\n", CoMD_COMPILER_VERSION); + fprintf(file," CFLAGS: %s\n", CoMD_CFLAGS); + fprintf(file," LDFLAGS: %s\n", CoMD_LDFLAGS); + fprintf(file," using MPI: %s\n", builtWithMpi() ? "true":"false"); + fprintf(file," Threading: none\n"); + fprintf(file," Double Precision: %s\n", (sizeof(real_t)==sizeof(double)?"true":"false")); + char timestring[32]; + getTimeString(timestring); + fprintf(file,"Run Date/Time: %s\n", timestring); + fprintf(file, "\n"); + fflush(file); +} + +void yamlEnd(void) +{ + if (! printRank()) + return; + + fclose(yamlFile); +} + +void printSeparator(FILE* file) +{ + //fprintf(file,"=========================================================================\n"); + fprintf(file,"\n"); +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile @@ -6,6 +6,7 @@ Pathfinder \ miniGMG \ RSBench \ - SimpleMOC + SimpleMOC \ + CoMD include $(LEVEL)/Makefile.programs