/// \file
/// Main program
///
/// \mainpage CoMD: A Classical Molecular Dynamics Mini-app
///
/// CoMD is a reference implementation of typical classical molecular
/// dynamics algorithms and workloads. It is created and maintained by
/// The Exascale Co-Design Center for Materials in Extreme Environments
/// (ExMatEx). http://codesign.lanl.gov/projects/exmatex. The
/// code is intended to serve as a vehicle for co-design by allowing
/// others to extend and/or reimplement it as needed to test performance of
/// new architectures, programming models, etc.
///
/// The current version of CoMD is available from:
/// http://exmatex.github.io/CoMD
///
/// To contact the developers of CoMD send email to: exmatex-comd@llnl.gov.
///
/// Table of Contents
/// =================
///
/// Click on the links below to browse the CoMD documentation.
///
/// \subpage pg_md_basics
///
/// \subpage pg_building_comd
///
/// \subpage pg_running_comd
///
/// \subpage pg_measuring_performance
///
/// \subpage pg_problem_selection_and_scaling
///
/// \subpage pg_verifying_correctness
///
/// \subpage pg_comd_architecture
///
/// \subpage pg_optimization_targets
///
/// \subpage pg_whats_new
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include "CoMDTypes.h"
#include "decomposition.h"
#include "linkCells.h"
#include "eam.h"
#include "ljForce.h"
#include "initAtoms.h"
#include "memUtils.h"
#include "yamlOutput.h"
#include "parallel.h"
#include "performanceTimers.h"
#include "mycommand.h"
#include "timestep.h"
#include "constants.h"
#define REDIRECT_OUTPUT 0
#define MIN(A,B) ((A) < (B) ? (A) : (B))
static SimFlat* initSimulation(Command cmd);
static void destroySimulation(SimFlat** ps);
static void initSubsystems(void);
static void finalizeSubsystems(void);
static BasePotential* initPotential(
int doeam, const char* potDir, const char* potName, const char* potType);
static SpeciesData* initSpecies(BasePotential* pot);
static Validate* initValidate(SimFlat* s);
static void validateResult(const Validate* val, SimFlat *sim);
static void sumAtoms(SimFlat* s);
static void printThings(SimFlat* s, int iStep, double elapsedTime);
static void printSimulationDataYaml(FILE* file, SimFlat* s);
static void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]);
int main(int argc, char** argv)
{
// Prolog
initParallel(&argc, &argv);
profileStart(totalTimer);
initSubsystems();
timestampBarrier("Starting Initialization\n");
#ifdef PRINT_YAML
yamlAppInfo(yamlFile);
#endif
yamlAppInfo(screenOut);
Command cmd = parseCommandLine(argc, argv);
#ifdef PRINT_YAML
printCmdYaml(yamlFile, &cmd);
#endif
printCmdYaml(screenOut, &cmd);
SimFlat* sim = initSimulation(cmd);
#ifdef PRINT_YAML
printSimulationDataYaml(yamlFile, sim);
#endif
printSimulationDataYaml(screenOut, sim);
Validate* validate = initValidate(sim); // atom counts, energy
timestampBarrier("Initialization Finished\n");
timestampBarrier("Starting simulation\n");
// This is the CoMD main loop
const int nSteps = sim->nSteps;
const int printRate = sim->printRate;
int iStep = 0;
profileStart(loopTimer);
for (; iStep<nSteps;)
{
startTimer(commReduceTimer);
sumAtoms(sim);
stopTimer(commReduceTimer);
printThings(sim, iStep, getElapsedTime(timestepTimer));
startTimer(timestepTimer);
timestep(sim, printRate, sim->dt);
stopTimer(timestepTimer);
iStep += printRate;
}
profileStop(loopTimer);
sumAtoms(sim);
printThings(sim, iStep, getElapsedTime(timestepTimer));
timestampBarrier("Ending simulation\n");
// Epilog
validateResult(validate, sim);
profileStop(totalTimer);
#ifdef PRINT_TIMING
printPerformanceResults(sim->atoms->nGlobal, sim->printRate);
printPerformanceResultsYaml(yamlFile);
#endif
destroySimulation(&sim);
comdFree(validate);
finalizeSubsystems();
timestampBarrier("CoMD Ending\n");
destroyParallel();
return 0;
}
/// Initializes the main CoMD data structure, SimFlat, based on command
/// line input from the user. Also performs sanity checks on the input
/// to screen out nonsensical values.
///
/// Simple data members such as the time step dt are initialized
/// directly, substructures such as the potential, the link cells, the
/// atoms, etc., are initialized by calling additional initialization
/// functions (initPotential(), initLinkCells(), initAtoms(), etc.).
/// Initialization order follows the natural dependencies among the
/// substructures: for example, the atoms need the link cells, so the link
/// cells must be initialized before the atoms.
SimFlat* initSimulation(Command cmd)
{
SimFlat* sim = comdMalloc(sizeof(SimFlat));
sim->nSteps = cmd.nSteps;
sim->printRate = cmd.printRate;
sim->dt = cmd.dt;
sim->domain = NULL;
sim->boxes = NULL;
sim->atoms = NULL;
sim->ePotential = 0.0;
sim->eKinetic = 0.0;
sim->atomExchange = NULL;
sim->pot = initPotential(cmd.doeam, cmd.potDir, cmd.potName, cmd.potType);
real_t latticeConstant = cmd.lat;
if (cmd.lat < 0.0)
latticeConstant = sim->pot->lat;
// ensure input parameters make sense.
sanityChecks(cmd, sim->pot->cutoff, latticeConstant, sim->pot->latticeType);
sim->species = initSpecies(sim->pot);
real3 globalExtent;
globalExtent[0] = cmd.nx * latticeConstant;
globalExtent[1] = cmd.ny * latticeConstant;
globalExtent[2] = cmd.nz * latticeConstant;
sim->domain = initDecomposition(
cmd.xproc, cmd.yproc, cmd.zproc, globalExtent);
sim->boxes = initLinkCells(sim->domain, sim->pot->cutoff);
sim->atoms = initAtoms(sim->boxes);
// create lattice with desired temperature and displacement.
createFccLattice(cmd.nx, cmd.ny, cmd.nz, latticeConstant, sim);
setTemperature(sim, cmd.temperature);
randomDisplacements(sim, cmd.initialDelta);
sim->atomExchange = initAtomHaloExchange(sim->domain, sim->boxes);
// Forces must be computed before we call the time stepper.
startTimer(redistributeTimer);
redistributeAtoms(sim);
stopTimer(redistributeTimer);
startTimer(computeForceTimer);
computeForce(sim);
stopTimer(computeForceTimer);
kineticEnergy(sim);
return sim;
}
/// frees all data associated with *ps and frees *ps
void destroySimulation(SimFlat** ps)
{
if ( ! ps ) return;
SimFlat* s = *ps;
if ( ! s ) return;
BasePotential* pot = s->pot;
if ( pot) pot->destroy(&pot);
destroyLinkCells(&(s->boxes));
destroyAtoms(s->atoms);
destroyHaloExchange(&(s->atomExchange));
comdFree(s->species);
comdFree(s->domain);
comdFree(s);
*ps = NULL;
return;
}
void initSubsystems(void)
{
#if REDIRECT_OUTPUT
freopen("testOut.txt","w",screenOut);
#endif
#ifdef PRINT_YAML
yamlBegin();
#endif
}
void finalizeSubsystems(void)
{
#if REDIRECT_OUTPUT
fclose(screenOut);
#endif
#ifdef PRINT_YAML
yamlEnd();
#endif
}
/// Decides whether to use the LJ or EAM potential.
BasePotential* initPotential(
int doeam, const char* potDir, const char* potName, const char* potType)
{
BasePotential* pot = NULL;
if (doeam)
pot = initEamPot(potDir, potName, potType);
else
pot = initLjPot();
assert(pot);
return pot;
}
SpeciesData* initSpecies(BasePotential* pot)
{
SpeciesData* species = comdMalloc(sizeof(SpeciesData));
strcpy(species->name, pot->name);
species->atomicNo = pot->atomicNo;
species->mass = pot->mass;
return species;
}
Validate* initValidate(SimFlat* sim)
{
sumAtoms(sim);
Validate* val = comdMalloc(sizeof(Validate));
val->eTot0 = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
val->nAtoms0 = sim->atoms->nGlobal;
if (printRank())
{
fprintf(screenOut, "\n");
printSeparator(screenOut);
fprintf(screenOut, "Initial energy : %14.12f, atom count : %d \n",
val->eTot0, val->nAtoms0);
fprintf(screenOut, "\n");
}
return val;
}
void validateResult(const Validate* val, SimFlat* sim)
{
if (printRank())
{
real_t eFinal = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
int nAtomsDelta = (sim->atoms->nGlobal - val->nAtoms0);
fprintf(screenOut, "\n");
fprintf(screenOut, "\n");
fprintf(screenOut, "Simulation Validation:\n");
fprintf(screenOut, " Initial energy : %14.12f\n", val->eTot0);
fprintf(screenOut, " Final energy : %14.12f\n", eFinal);
fprintf(screenOut, " eFinal/eInitial : %f\n", eFinal/val->eTot0);
if ( nAtomsDelta == 0)
{
fprintf(screenOut, " Final atom count : %d, no atoms lost\n",
sim->atoms->nGlobal);
}
else
{
fprintf(screenOut, "#############################\n");
fprintf(screenOut, "# WARNING: %6d atoms lost #\n", nAtomsDelta);
fprintf(screenOut, "#############################\n");
}
}
}
void sumAtoms(SimFlat* s)
{
// sum atoms across all processors
s->atoms->nLocal = 0;
for (int i = 0; i < s->boxes->nLocalBoxes; i++)
{
s->atoms->nLocal += s->boxes->nAtoms[i];
}
startTimer(commReduceTimer);
addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1);
stopTimer(commReduceTimer);
}
/// Prints the current time, energy, performance, etc. to monitor the state
/// of the running simulation. Performance per atom is scaled by the number
/// of local atoms per process; this should give consistent timing assuming
/// reasonable load balance.
void printThings(SimFlat* s, int iStep, double elapsedTime)
{
// keep track of the previous value of iStep so we can calculate the number of steps.
static int iStepPrev = -1;
static int firstCall = 1;
int nEval = iStep - iStepPrev; // gives nEval = 1 for zeroth step.
iStepPrev = iStep;
if (! printRank() )
return;
if (firstCall)
{
firstCall = 0;
fprintf(screenOut,
"# Performance\n"
"# Loop Time(fs) Total Energy Potential Energy Kinetic Energy Temperature (us/atom) # Atoms\n");
fflush(screenOut);
}
real_t time = iStep*s->dt;
real_t eTotal = (s->ePotential+s->eKinetic) / s->atoms->nGlobal;
real_t eK = s->eKinetic / s->atoms->nGlobal;
real_t eU = s->ePotential / s->atoms->nGlobal;
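// temperature from equipartition: <kinetic energy per atom> = (3/2) kB T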
real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5);
double timePerAtom = 1.0e6*elapsedTime/(double)(nEval*s->atoms->nLocal);
#ifndef PRINT_TIMING
timePerAtom = 0.0;
#endif
fprintf(screenOut, " %6d %10.2f %18.12f %18.12f %18.12f %12.4f %10.4f %12d\n",
iStep, time, eTotal, eU, eK, Temp, timePerAtom, s->atoms->nGlobal);
}
/// Print information about the simulation in a format that is (mostly)
/// YAML compliant.
void printSimulationDataYaml(FILE* file, SimFlat* s)
{
// All ranks get maxOccupancy
int maxOcc = maxOccupancy(s->boxes);
// Only rank 0 prints
if (! printRank())
return;
fprintf(file,"Simulation data: \n");
fprintf(file," Total atoms : %d\n",
s->atoms->nGlobal);
fprintf(file," Min global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
s->domain->globalMin[0], s->domain->globalMin[1], s->domain->globalMin[2]);
fprintf(file," Max global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
s->domain->globalMax[0], s->domain->globalMax[1], s->domain->globalMax[2]);
printSeparator(file);
fprintf(file,"Decomposition data: \n");
fprintf(file," Processors : %6d,%6d,%6d\n",
s->domain->procGrid[0], s->domain->procGrid[1], s->domain->procGrid[2]);
fprintf(file," Local boxes : %6d,%6d,%6d = %8d\n",
s->boxes->gridSize[0], s->boxes->gridSize[1], s->boxes->gridSize[2],
s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]);
fprintf(file," Box size : [ %14.10f, %14.10f, %14.10f ]\n",
s->boxes->boxSize[0], s->boxes->boxSize[1], s->boxes->boxSize[2]);
fprintf(file," Box factor : [ %14.10f, %14.10f, %14.10f ] \n",
s->boxes->boxSize[0]/s->pot->cutoff,
s->boxes->boxSize[1]/s->pot->cutoff,
s->boxes->boxSize[2]/s->pot->cutoff);
fprintf(file, " Max Link Cell Occupancy: %d of %d\n",
maxOcc, MAXATOMS);
printSeparator(file);
fprintf(file,"Potential data: \n");
s->pot->print(file, s->pot);
// Memory footprint diagnostics
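// per-atom storage: 10 reals (3 components each for position, momentum, and
// force, plus the per-atom energy) and 2 ints (species index and global id)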
int perAtomSize = 10*sizeof(real_t)+2*sizeof(int);
float mbPerAtom = (float)perAtomSize/1024/1024; // note: currently unused
float totalMemLocal = (float)(perAtomSize*s->atoms->nLocal)/1024/1024;
float totalMemGlobal = (float)(perAtomSize*s->atoms->nGlobal)/1024/1024;
int nLocalBoxes = s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2];
int nTotalBoxes = (s->boxes->gridSize[0]+2)*(s->boxes->gridSize[1]+2)*(s->boxes->gridSize[2]+2);
float paddedMemLocal = (float) nLocalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
float paddedMemTotal = (float) nTotalBoxes*(perAtomSize*MAXATOMS)/1024/1024;
printSeparator(file);
fprintf(file,"Memory data: \n");
fprintf(file, " Intrinsic atom footprint = %4d B/atom \n", perAtomSize);
fprintf(file, " Total atom footprint = %7.3f MB (%6.2f MB/node)\n", totalMemGlobal, totalMemLocal);
fprintf(file, " Link cell atom footprint = %7.3f MB/node\n", paddedMemLocal);
fprintf(file, " Link cell atom footprint = %7.3f MB/node (including halo cell data\n", paddedMemTotal);
fflush(file);
}
/// Check that the user input meets certain criteria.
void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8])
{
int failCode = 0;
// Check that domain grid matches number of ranks. (fail code 1)
int nProcs = cmd.xproc * cmd.yproc * cmd.zproc;
if (nProcs != getNRanks())
{
failCode |= 1;
if (printRank() )
fprintf(screenOut,
"\nNumber of MPI ranks must match xproc * yproc * zproc\n");
}
// Check whether simulation is too small (fail code 2)
double minx = 2*cutoff*cmd.xproc;
double miny = 2*cutoff*cmd.yproc;
double minz = 2*cutoff*cmd.zproc;
double sizex = cmd.nx*latticeConst;
double sizey = cmd.ny*latticeConst;
double sizez = cmd.nz*latticeConst;
if ( sizex < minx || sizey < miny || sizez < minz)
{
failCode |= 2;
if (printRank())
fprintf(screenOut,"\nSimulation too small.\n"
" Increase the number of unit cells to make the simulation\n"
" at least (%3.2f, %3.2f. %3.2f) Ansgstroms in size\n",
minx, miny, minz);
}
// Check for supported lattice structure (fail code 4)
if (strcasecmp(latticeType, "FCC") != 0)
{
failCode |= 4;
if ( printRank() )
fprintf(screenOut,
"\nOnly FCC Lattice type supported, not %s. Fatal Error.\n",
latticeType);
}
int checkCode = failCode;
bcastParallel(&checkCode, sizeof(int), 0);
// This assertion can only fail if different tasks failed different
// sanity checks. That should not be possible.
assert(checkCode == failCode);
if (failCode != 0)
exit(failCode);
}
// --------------------------------------------------------------
/// \page pg_building_comd Building CoMD
///
/// CoMD is written with portability in mind and should compile using
/// practically any compiler that implements the C99 standard. You will
/// need to create a Makefile by copying the sample provided with the
/// distribution (Makefile.vanilla).
///
/// $ cp Makefile.vanilla Makefile
///
/// and use the make command to build the code
///
/// $ make
///
/// The sample Makefile will compile the code on many platforms. See
/// comments in Makefile.vanilla for information about specifying the
/// name of the C compiler, and/or additional compiler switches that
/// might be necessary for your platform.
///
/// The main options available in the Makefile are toggling single/double
/// precision and enabling/disabling MPI. In the event MPI is not
/// available, setting the DO_MPI flag to OFF will create a purely
/// serial build (you will likely also need to change the setting of
/// CC).
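///
/// For example, a serial build without MPI might set (variable names as
/// described in the comments in Makefile.vanilla; check your copy for the
/// exact settings needed on your platform):
///
///     CC = gcc
///     DO_MPI = OFF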
///
/// The makefile should handle all the dependency checking needed, via
/// makedepend.
///
/// 'make clean' removes the object and dependency files.
///
/// 'make distclean' additionally removes the executable file and the
/// documentation files.
///
/// Other build options
/// -------------------
///
/// Various other options are made available by \#define arguments within
/// some of the source files.
///
/// #REDIRECT_OUTPUT in CoMD.c
///
/// Setting this to 1 will redirect all screen output to a file,
/// currently set to 'testOut.txt'.
///
/// #POT_SHIFT in ljForce.c
///
/// This is set to 1.0 by default, and shifts the values of the cohesive
/// energy given by the Lennard-Jones potential so it is zero at the
/// cutoff radius. This setting improves energy conservation
/// step-to-step as it reduces the noise generated by atoms crossing the
/// cutoff threshold. However, it does not affect the long-term energy
/// conservation of the code.
///
/// #MAXATOMS in linkCells.h
///
/// The default value is 64, which allows ample padding of the linkCell
/// structure to allow for density fluctuations. Reducing it may improve
/// the efficiency of the code via improved thread utilization and
/// reduced memory footprint.
// --------------------------------------------------------------
// --------------------------------------------------------------
/// \page pg_measuring_performance Measuring Performance
///
/// CoMD implements a simple and extensible system of internal timers to
/// measure the performance profile of the code. As explained in
/// performanceTimers.c, it is easy to create additional timers and
/// associate them with code regions of specific interest. In addition,
/// the getTime() and getTick() functions can be easily reimplemented to
/// take advantage of platform specific timing resources.
///
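/// For example, a new region of interest can be bracketed with the same
/// start/stop calls used throughout CoMD.c. The timer name below is
/// hypothetical; creating it requires adding a new timer handle and label
/// as described in performanceTimers.c.
///
/// ~~~~
/// startTimer(myRegionTimer);   // hypothetical timer handle
/// doInterestingWork(sim);      // region being profiled
/// stopTimer(myRegionTimer);
/// ~~~~
///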
/// A timing report is printed at the end of each simulation.
///
/// ~~~~
/// Timings for Rank 0
/// Timer # Calls Avg/Call (s) Total (s) % Loop
/// ___________________________________________________________________
/// total 1 50.6701 50.6701 100.04
/// loop 1 50.6505 50.6505 100.00
/// timestep 1 50.6505 50.6505 100.00
/// position 10000 0.0000 0.0441 0.09
/// velocity 20000 0.0000 0.0388 0.08
/// redistribute 10001 0.0003 3.4842 6.88
/// atomHalo 10001 0.0002 2.4577 4.85
/// force 10001 0.0047 47.0856 92.96
/// eamHalo 10001 0.0001 1.0592 2.09
/// commHalo 60006 0.0000 1.7550 3.46
/// commReduce 12 0.0000 0.0003 0.00
///
/// Timing Statistics Across 8 Ranks:
/// Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s)
/// _____________________________________________________________________________
/// total 3: 50.6697 0: 50.6701 50.6699 0.0001
/// loop 0: 50.6505 4: 50.6505 50.6505 0.0000
/// timestep 0: 50.6505 4: 50.6505 50.6505 0.0000
/// position 2: 0.0437 0: 0.0441 0.0439 0.0001
/// velocity 2: 0.0380 4: 0.0392 0.0385 0.0004
/// redistribute 0: 3.4842 1: 3.7085 3.6015 0.0622
/// atomHalo 0: 2.4577 7: 2.6441 2.5780 0.0549
/// force 1: 46.8624 0: 47.0856 46.9689 0.0619
/// eamHalo 3: 0.2269 6: 1.2936 1.0951 0.3344
/// commHalo 3: 1.0803 6: 2.1856 1.9363 0.3462
/// commReduce 6: 0.0002 2: 0.0003 0.0003 0.0000
///
/// ---------------------------------------------------
/// Average atom update rate: 9.39 us/atom/task
/// ---------------------------------------------------
///
/// ~~~~
/// This report consists of two blocks. The upper block lists the absolute
/// wall clock time spent in each timer on rank 0 of the job. The lower
/// block reports minimum, maximum, average, and standard deviation of
/// times across all tasks.
/// The ranks where the minimum and maximum values occurred are also reported
/// to aid in identifying hotspots or load imbalances.
///
/// The last line of the report gives the atom update rate in
/// microseconds/atom/task. Since this quantity is normalized by both
/// the number of atoms and the number of tasks it provides a simple
/// figure of merit to compare performance between runs with different
/// numbers of atoms and different numbers of tasks. Any increase in
/// this number relative to a large number of atoms on a single task
/// represents a loss of parallel efficiency.
///
/// Choosing the problem size correctly has important implications for the
/// reported performance. Small problem sizes may run entirely in the cache
/// of some architectures, leading to very good performance results.
/// For general characterization of performance, it is probably best to
/// choose problem sizes which force the code to access main memory, even
/// though there may be strong scaling scenarios where the code is indeed
/// running mainly in cache.
///
/// *** Architecture/Configuration for above timing numbers:
/// SGI XE1300 cluster with dual-socket Intel quad-core Nehalem processors.
/// Each node has 2 Quad-Core Xeon X5550 processors running at 2.66 GHz
/// with 3 GB of memory per core.
// --------------------------------------------------------------
/// \page pg_problem_selection_and_scaling Problem Selection and Scaling
///
/// CoMD is a reference molecular dynamics simulation code as used in
/// materials science.
///
/// Problem Specification {#sec_problem_spec}
/// ======================
///
/// The reference problem is solid Copper starting from a face-centered
/// cubic (FCC) lattice. The initial thermodynamic conditions
/// (Temperature and Volume (via the lattice spacing, lat)) can be specified
/// from the command line input. The default is 600 K and standard
/// volume (lat = 3.615 Angstroms).
/// Different temperatures (e.g. T = 3000 K) and volumes can be
/// specified to melt the system and enhance the interchange of atoms
/// between domains.
///
/// The dynamics is micro-canonical (NVE = constant Number of atoms,
/// constant total system Volume, and constant total system Energy). As
/// a result, the temperature is not fixed. Rather, the temperature will
/// adjust from the initial temperature (as specified on the command line)
/// to a final temperature as the total system kinetic energy comes into
/// equilibrium with the total system potential energy.
///
/// The total size of the problem (number of atoms) is specified by the
/// number (nx, ny, nz) of FCC unit cells in the x, y, z directions: nAtoms
/// = 4 * nx * ny * nz. The default size is nx = ny = nz = 20 or 32,000 atoms.
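///
/// For example (flag letters assume the CoMD 1.1 command-line parser; see
/// \ref pg_running_comd for the authoritative list, and the binary name
/// depends on your build), the default 32,000-atom problem and a hotter,
/// slightly expanded variant intended to melt could be requested as
///
///     $ ./CoMD-serial -x 20 -y 20 -z 20 -T 600
///     $ ./CoMD-serial -x 20 -y 20 -z 20 -T 3000 -l 3.7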
///
/// The simulation models bulk copper by replicating the system in every
/// direction using periodic boundary conditions.
///
/// Two interatomic force models are available: the Lennard-Jones (LJ)
/// two-body potential (ljForce.c) and the many-body Embedded-Atom Model (EAM)
/// potential (eam.c). The LJ potential is included for comparison and
/// is a valid approximation for constant volume and uniform
/// density. The EAM potential is a more accurate model of cohesion in
/// simple metals like Copper and includes the energetics necessary to
/// model non-uniform density and free surfaces.
///
/// Scaling Studies in CoMD {#sec_scaling_studies}
/// =======================
///
/// CoMD implements a simple geometric domain decomposition to divide
/// the total problem space into domains, which are owned by MPI
/// ranks. Each domain is a single-program multiple data (SPMD)
/// partition of the larger problem.
///
/// Caution: When doing scaling studies, it is important to distinguish
/// between the problem setup phase and the problem execution phase. Both
/// are important to the workflow of doing molecular dynamics, but it
/// is the execution phase we want to quantify in the scaling studies
/// described below, since it dominates the execution time for long runs
/// (millions of time steps). For short runs (the default is 100 time steps)
/// the setup phase can be an appreciable fraction of the execution time,
/// which can lead to erroneous conclusions.
///
/// This code is instrumented with timers. The times are reported per
/// particle, and the timers for the force calculation, timestep, etc.,
/// start after the initialization phase is done.
///
/// Weak Scaling {#ssec_weak_scaling}
/// -----------
///
/// A weak scaling test fixes the amount of work per processor and
/// compares the execution time over the number of processors. Weak scaling
/// keeps the ratio of inter-processor communication (surface) to
/// intra-processor work (volume) fixed. The amount of inter-processor
/// work scales with the number of processors in the domain and O(1000)
/// atoms per domain are needed for reasonable performance.
///
/// Examples,
///
/// - Increase in processor count by 8: <br>
/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=40)
///
/// - Increase in processor count by 2: <br>
/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=20, nz=40)
///
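/// As a concrete command line for the first case above (binary name and
/// flag letters assume a CoMD 1.1 MPI build and its parser; see
/// \ref pg_running_comd):
///
///     $ mpirun -np 8  ./CoMD-mpi -i 2 -j 2 -k 2 -x 20 -y 20 -z 20
///     $ mpirun -np 64 ./CoMD-mpi -i 4 -j 4 -k 4 -x 40 -y 40 -z 40
///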
/// In general, it is wise to keep the ratio of processor count to
/// system size in each direction fixed (i.e. cubic domains): xproc_0 / nx_0 = xproc_1 /
/// nx_1, since this minimizes surface area to volume.
/// Feel free to experiment, you might learn something about
/// algorithms to optimize communication relative to work.
///
/// Strong Scaling {#ssec_strong_scaling}
/// ---------------
///
/// A strong scaling test fixes the total problem size and compares the
/// execution time for different numbers of processors. Strong scaling
/// increases the ratio of inter-processor communication (surface) to
/// intra-processor work (volume).
///
/// Examples,
///
/// - Increase in processor count by 8: <br>
/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=20)
///
/// - Increase in processor count by 2: <br>
/// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=nz=20)
///
/// The domain decomposition requires O(1000) atoms per domain and
/// begins to scale poorly for small numbers of atoms per domain.
/// Again, feel free to experiment, you might learn something here as
/// well. For example, when molecular dynamics codes were written for
/// vector supercomputers, large lists of force pairs were created for
/// the vector processor. These force lists provide a natural force
/// decomposition for early parallel computers (Fast Parallel Algorithms
/// for Short-Range Molecular Dynamics, S. J. Plimpton, J Comp Phys,
/// 117, 1-19 (1995).) Using replicated data, force decomposition can
/// scale to fewer than one atom per processor and is a natural
/// mechanism to exploit intra-processor parallelism.
///
/// For further details see for example:
/// https://support.scinet.utoronto.ca/wiki/index.php/Introduction_To_Performance
// --------------------------------------------------------------
/// \page pg_verifying_correctness Verifying Correctness
///
/// Verifying the correctness of an MD simulation is challenging.
/// Because MD is Lyapunov unstable, any small errors, even harmless
/// round-off errors, will lead to a long-term divergence in the atom
/// trajectories. Hence, comparing atom positions at the end of a run
/// is not always a useful verification technique. (Such divergences
/// are not a problem for science applications of MD since they do not
/// alter the statistical physics.) Small, single-particle errors can
/// also be difficult to detect in system-wide quantities such as the
/// kinetic or potential energy that are averaged over a large number of
/// particles.
///
/// In spite of these challenges, there are several methods which are
/// likely to catch significant errors.
///
/// Cohesive Energy {#sec_ver_cohesive_energy}
/// ===============
///
/// With a perfect lattice as the initial structure (this is the
/// default), the potential energy per atom is the cohesive energy.
/// This value should be computed correctly to many decimal places. Any
/// variation beyond the last 1 or 2 decimal places is cause for
/// investigation. The correct values for the cohesive energy are
///
/// | Potential | Cohesive Energy |
/// | :------------- | :-------------- |
/// | Lennard-Jones | -1.243619295058 |
/// | EAM (Adams) | -3.538079224691 |
/// | EAM (Mishin) | -3.539999969176 |
///
/// The \link sec_command_line_options command
/// line options \endlink documentation explains the switches used to
/// select the potential used in the simulation.
///
/// Note that the cohesive energy calculation is not sensitive to errors
/// in forces. It is also performed on a highly symmetric structure so
/// there are many errors this will not catch. Still, it is a good
/// first check.
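///
/// For example (binary name depends on your build; see
/// \ref pg_running_comd for the command line options), a default
/// Lennard-Jones run
///
///     $ ./CoMD-serial
///
/// should show a step-0 "Potential Energy" equal to the LJ value in the
/// table above, since the default initial structure is a perfect lattice.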
///
/// Energy Conservation {#sec_ver_energy_conservation}
/// ===================
///
/// A correctly implemented force kernel, with an appropriate time step
/// (the default value of 1 fs is conservative for temperatures under
/// 10,000 K), will conserve total energy over long times to 5 or more
/// digits. Any long term systematic drift in the total energy is a
/// cause for concern.
///
/// To facilitate checking energy conservation CoMD prints the final and
/// initial values of the total energy. When comparing these values, pay
/// careful attention to these details:
///
/// - It is common to observe an initial transient change in the total
/// energy. Differences in the total energy of 2-3% can be expected in
/// the first 10-100 time steps.
/// - The best way to check energy conservation is to run at least
/// several thousand steps and look at the slope of the total energy
/// ignoring at least the first one or two thousand steps. More steps
/// are even better.
/// - Set the temperature to at least several hundred K. This ensures
/// that atoms will sample a large range of configurations and expose
/// possible errors.
/// - Fluctuations in the energy can make it difficult to tell if
/// conservation is observed. Increasing the number of atoms will reduce
/// the fluctuations.
///
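/// Putting these suggestions together, one possible conservation check
/// (flag letters assume the CoMD 1.1 parser: -N sets the step count, -n the
/// print rate, -T the initial temperature; see \ref pg_running_comd) is
///
///     $ mpirun -np 8 ./CoMD-mpi -i 2 -j 2 -k 2 -x 40 -y 40 -z 40 -N 10000 -n 100 -T 600
///
/// followed by plotting the "Total Energy" column against the step number
/// and checking that the slope is flat after the initial transient.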
///
/// Particle Conservation {#sec_ver_particle_conservation}
/// =====================
///
/// The simulation should always end with the same number of particles
/// it started with. Any change is a bug. CoMD checks the initial and
/// final number of particles and prints a warning at the end of the
/// simulation if they are not equal.
///
/// Reproducibility {#sec_ver_reproducibility}
/// ===============
///
/// The same simulation run repeatedly on the same hardware should
/// produce the same result. Because parallel computing can add
/// elements of non-determinism, we do not expect perfect long-term
/// reproducibility; however, over a few hundred to a few thousand time
/// steps the energies should not exhibit run-to-run differences outside
/// the last 1 or 2 decimal places. Larger differences are a sign of
/// trouble and should be investigated. This kind of test is
/// practically the only way to detect race conditions in shared memory
/// parallelism.
///
/// Portability {#sec_ver_portability}
/// ===========
///
/// In our experience, simulations that start from the same initial
/// condition tend to produce very similar trajectories over short terms
/// (100 to 1000 time steps), even on different hardware platforms.
/// Short term differences beyond the last 1 or 2 decimal places should
/// likely be investigated.
///
/// General Principles {#sec_ver_general}
/// =======================
///
/// - Simulations run at 0 K are too trivial for verification; set
/// the initial temperature to at least several hundred K.
/// - Longer runs are better to check conservation. Compare
/// energies after initial transients are damped out.
/// - Larger runs are better to check conservation. Fluctuations in the
/// energy are averaged out.
/// - Short term (order 100 time steps) discrepancies from run-to-run
/// or platform-to-platform beyond the last one or two decimal places
/// are reason for concern. Differences in the 4th or 5th decimal place
/// are almost certainly a bug.
/// - Contact the CoMD developers (exmatex-comd@llnl.gov) if you have
/// questions about validation.
///
// --------------------------------------------------------------
/// \page pg_comd_architecture CoMD Architecture
///
/// Program Flow {#sec_program_flow}
/// ============
///
/// We have attempted to make the program flow in CoMD 1.1 as simple and
/// transparent as possible. The main program consists of three blocks:
/// prolog, main loop, and epilog.
///
/// Prolog {#ssec_flow_prolog}
/// -------
///
/// The job of the prolog is to initialize the simulation and prepare
/// for the main loop. Notable tasks in the prolog include calling
/// - initParallel() to start MPI
/// - parseCommandLine() to read the command line options
/// - initSimulation() to initialize the main data structure, SimFlatSt.
/// This includes tasks such as
/// - initEamPot() to read tabular data for the potential function
/// - initDecomposition() to set up the domain decomposition
/// - createFccLattice() to generate an initial structure for the atoms
/// - initValidate() to store initial data for a simple validation check
///
/// In CoMD 1.1 all atomic structures are internally generated so
/// there is no need to read large files with atom coordinate data.
///
/// Main Loop {#ssec_flow_main_loop}
/// ---------
///
/// The main loop calls
/// - timestep(), the integrator to update particle positions,
/// - printThings() to periodically print simulation information
///
/// The timestep() function is the heart of the code as it choreographs
/// updating the particle positions, along with computing forces
/// (computeForce()) and communicating atoms between ranks
/// (redistributeAtoms()).
///
/// Epilog {#ssec_flow_epilog}
/// -------
///
/// The epilog code handles end of run bookkeeping such as
/// - validateResult() to check validation
/// - printPerformanceResults() to print a performance summary
/// - destroySimulation() to free memory
///
/// Key Data Structures {#sec_key_data_structures}
/// ==================
///
/// Practically all data in CoMD belongs to the SimFlatSt structure.
/// This includes:
/// - BasePotentialSt A polymorphic structure for the potential model
/// - HaloExchangeSt A polymorphic structure for communicating halo data
/// - DomainSt The parallel domain decomposition
/// - LinkCellSt The link cells
/// - AtomsSt The atom coordinates and velocities
/// - SpeciesDataSt Properties of the atomic species being simulated.
///
/// Consult the individual pages for each of these structures to learn
/// more. The descriptions in haloExchange.c and initLinkCells() are
/// especially useful to understand how the atoms are communicated
/// between tasks and stored in link cells for fast pair finding.
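///
/// As an illustration of how these pieces fit together, the sketch below
/// visits every local atom through the link cells, which is the access
/// pattern used by the force kernels. Field names follow the CoMD 1.1
/// structs; doSomethingWith() is a hypothetical callback.
/// ~~~~
/// for (int iBox=0; iBox<sim->boxes->nLocalBoxes; iBox++)
///    for (int iOff=MAXATOMS*iBox, ii=0; ii<sim->boxes->nAtoms[iBox]; ii++, iOff++)
///       doSomethingWith(sim->atoms->r[iOff], sim->atoms->p[iOff]);
/// ~~~~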
// --------------------------------------------------------------
/// \page pg_optimization_targets Optimization Targets
///
/// Computation {#sec_computation}
/// ============
///
/// The computational effort of classical MD is usually highly focused
/// in the force kernel. The two force kernels supplied by CoMD are
/// eamForce() and ljForce(). Both kernels are fundamentally loops over
/// pairs of atoms with significant opportunity to exploit high levels
/// of concurrency. One potential challenge when reordering or
/// parallelizing the pair loop structure is preventing race conditions
/// that result if two concurrent pair evaluations try to simultaneously
/// increment the forces and energies on the same atom.
///
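/// As an illustrative sketch of that hazard (CoMD 1.1 itself contains no
/// OpenMP code; the pair list and helpers below are hypothetical), a naive
/// OpenMP parallelization of a pair loop must protect the per-atom
/// accumulation, for example with atomic updates or per-thread buffers:
/// ~~~~
/// #pragma omp parallel for
/// for (int ij=0; ij<nPairs; ij++)
/// {
///    int iAtom = pairList[ij].i;      // hypothetical pair list
///    real_t dfx = pairForceX(ij);     // hypothetical per-pair force term
///    #pragma omp atomic
///    fx[iAtom] += dfx;  // without the atomic, two pairs sharing iAtom race
/// }
/// ~~~~
///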
/// The supplied EAM kernel uses interpolation from tabular data to
/// evaluate functions. Hence the interpolate() function is another
/// potential optimization target. Note that the two potential files
/// distributed with CoMD have very different sizes. The Adams
/// potential (Cu_u6.eam) has 500 points per function in the table while
/// the Mishin potential (Cu01.eam.alloy) has 10,000 points per
/// function. This difference could potentially impact important
/// details such as cache miss rates.
///
/// Communication {#sec_communication}
/// =============
///
/// As the number of atoms per MPI rank decreases, the communication
/// routines will start to require a significant fraction of the
/// run time. The main communication routine in CoMD is haloExchange().
/// The halo exchange is simple nearest neighbor, point-to-point
/// communication so it should scale well to practically any number of
/// nodes.
///
/// The halo exchange in CoMD 1.1 is a very simple 6-direction
/// structured halo exchange (see haloExchange.c). Other exchange
/// algorithms can be implemented without much difficulty.
///
/// The halo exchange function is called in two very different contexts.
/// The main usage is to exchange halo particle information (see
/// initAtomHaloExchange()). This process is coordinated by the
/// redistributeAtoms() function.
///
/// In addition to the atom exchange, when using the EAM potential, a
/// halo exchange is performed in the force routine (see
/// initForceHaloExchange()).
// --------------------------------------------------------------
/// \page pg_whats_new New Features and Changes in CoMD 1.1
///
/// The main goals of the 1.1 release were to add support for MPI and to
/// improve the structure and clarity of the code. Achieving these
/// goals required considerable changes compared to the 1.0 release.
/// However, the core structure of the most computationally intensive
/// kernels (the force routines) is mostly unchanged. We believe that
/// lessons learned from optimizing 1.0 force kernels to specific
/// hardware or programming models can be quickly transferred to kernels
/// in the 1.1 release.
///
/// Significant changes in CoMD 1.1 include:
///
/// - MPI support. Both MPI and single node serial executables can be
/// built from the same source files.
///
/// - Improved modularity and code clarity. Major data structures are
/// now organized with their own structs and initialization routines.
///
/// - The build system has been simplified to use only standard
/// Makefiles instead of CMake.
///
/// - The halo exchange operation needed to communicate remote particle
/// data between MPI ranks also creates "image" particles in the
/// serial build.
///
/// - Unified force kernels for both serial and MPI builds
///
/// - The addition of remote/image atoms allows periodic boundary
/// conditions to be handled outside the force loop.
///
/// - An additional communication/data copy step to handle electron
/// density on remote/image atoms has been added to the EAM force
/// loop.
///
/// - The coordinate system has been simplified to a single global
/// coordinate system for all particles.
///
/// - Evaluation of energies and forces using Chebyshev polynomial
/// fits has been removed. Polynomial approximation of energies and
/// forces will return in a future CoMD version.
///
/// - Atomic structures are now generated internally, eliminating the
/// requirement to read, write, and distribute large atom
/// configuration files. Arbitrarily large initial structures can
/// be generated with specified initial temperature and random
/// displacements from lattice positions. Code to read/write atomic
/// positions has been removed.
///
/// - EAM potentials are now read from standard funcfl and setfl format
/// files. Voter style files are no longer supported.
///
/// - Collection of performance metrics is significantly improved.
/// Developers can easily add new timers to regions of interest. The
/// system is also designed to allow easy integration with platform
/// specific APIs for high-resolution timers, cycle counters,
/// hardware counters, etc.
///
///
/// - Hooks to in-situ analysis and visualization have been removed.
/// In-situ analysis capabilities will return in a future CoMD release.
///
/// Please contact the CoMD developers (exmatex-comd@llnl.gov) if
/// any of the deleted features negatively impacts your work. We
/// may be able to help produce a custom version that includes the code
/// you need.
// --------------------------------------------------------------
/// \page pg_md_basics MD Basics
///
/// The molecular dynamics (MD) computer simulation method is a well
/// established and important tool for the study of the dynamical
/// properties of liquids, solids, and other systems of interest in
/// Materials Science and Engineering, Chemistry and Biology. A material
/// is represented in terms of atoms and molecules. The method of MD
/// simulation involves the evaluation of the force acting on each atom
/// due to all other atoms in the system and the numerical integration
/// of the Newtonian equations of motion. Though MD was initially
/// developed to compute the equilibrium thermodynamic behavior of
/// materials (equation of state), most recent applications have used MD
/// to study non-equilibrium processes.
///
/// Wikipedia offers a basic introduction to molecular dynamics with
/// many references:
///
/// http://en.wikipedia.org/wiki/Molecular_dynamics
///
/// For a thorough treatment of MD methods, see:
/// - "Computer simulation of liquids" by M.P. Allen and D.J. Tildesley
/// (Oxford, 1989)
/// ISBN-10: 0198556454 | ISBN-13: 978-0198556459.
///
/// For an understanding of MD simulations and application to statistical mechanics:
/// - "Understanding Molecular Simulation, Second Edition: From Algorithms
/// to Applications," by D. Frenkel and B. Smit (Academic Press, 2001)
/// ISBN-10: 0122673514 | ISBN-13: 978-0122673511
/// - "Statistical and Thermal Physics: With Computer Applications," by
/// H. Gould and J. Tobochnik (Princeton, 2010)
/// ISBN-10: 0691137447 | ISBN-13: 978-0691137445
///
/// CoMD implements both the Lennard-Jones Potential (ljForce.c) and the
/// Embedded Atom Method Potential (eam.c).
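///
/// For reference, the Lennard-Jones pair energy implemented in ljForce.c
/// has the standard textbook form
/// \f[
///    U_{LJ}(r) = 4\epsilon\left[\left(\frac{\sigma}{r}\right)^{12}
///              - \left(\frac{\sigma}{r}\right)^{6}\right],
/// \f]
/// while the EAM potential adds a density-dependent embedding term
/// evaluated from tabulated functions.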
///