| /// \file |
| /// Main program |
| /// |
| /// \mainpage CoMD: A Classical Molecular Dynamics Mini-app |
| /// |
| /// CoMD is a reference implementation of typical classical molecular |
/// dynamics algorithms and workloads. It is created and maintained by
/// the Exascale Co-Design Center for Materials in Extreme Environments
/// (ExMatEx, http://codesign.lanl.gov/projects/exmatex). The code is
/// intended to serve as a vehicle for co-design by allowing others to
/// extend and/or reimplement it as needed to test the performance of
/// new architectures, programming models, etc.
| /// |
| /// The current version of CoMD is available from: |
| /// http://exmatex.github.io/CoMD |
| /// |
| /// To contact the developers of CoMD send email to: exmatex-comd@llnl.gov. |
| /// |
| /// Table of Contents |
| /// ================= |
| /// |
| /// Click on the links below to browse the CoMD documentation. |
| /// |
| /// \subpage pg_md_basics |
| /// |
| /// \subpage pg_building_comd |
| /// |
| /// \subpage pg_running_comd |
| /// |
| /// \subpage pg_measuring_performance |
| /// |
| /// \subpage pg_problem_selection_and_scaling |
| /// |
| /// \subpage pg_verifying_correctness |
| /// |
| /// \subpage pg_comd_architecture |
| /// |
| /// \subpage pg_optimization_targets |
| /// |
| /// \subpage pg_whats_new |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <strings.h> |
| #include <unistd.h> |
| #include <assert.h> |
| |
| #include "CoMDTypes.h" |
| #include "decomposition.h" |
| #include "linkCells.h" |
| #include "eam.h" |
| #include "ljForce.h" |
| #include "initAtoms.h" |
| #include "memUtils.h" |
| #include "yamlOutput.h" |
| #include "parallel.h" |
| #include "performanceTimers.h" |
| #include "mycommand.h" |
| #include "timestep.h" |
| #include "constants.h" |
| |
| #define REDIRECT_OUTPUT 0 |
| #define MIN(A,B) ((A) < (B) ? (A) : (B)) |
| |
| static SimFlat* initSimulation(Command cmd); |
| static void destroySimulation(SimFlat** ps); |
| |
| static void initSubsystems(void); |
| static void finalizeSubsystems(void); |
| |
| static BasePotential* initPotential( |
| int doeam, const char* potDir, const char* potName, const char* potType); |
| static SpeciesData* initSpecies(BasePotential* pot); |
| static Validate* initValidate(SimFlat* s); |
| static void validateResult(const Validate* val, SimFlat *sim); |
| |
| static void sumAtoms(SimFlat* s); |
| static void printThings(SimFlat* s, int iStep, double elapsedTime); |
| static void printSimulationDataYaml(FILE* file, SimFlat* s); |
| static void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]); |
| |
| |
| int main(int argc, char** argv) |
| { |
| // Prolog |
| initParallel(&argc, &argv); |
| profileStart(totalTimer); |
| initSubsystems(); |
| timestampBarrier("Starting Initialization\n"); |
| |
| #ifdef PRINT_YAML |
| yamlAppInfo(yamlFile); |
| #endif |
| yamlAppInfo(screenOut); |
| |
| Command cmd = parseCommandLine(argc, argv); |
| #ifdef PRINT_YAML |
| printCmdYaml(yamlFile, &cmd); |
| #endif |
| printCmdYaml(screenOut, &cmd); |
| |
| SimFlat* sim = initSimulation(cmd); |
| #ifdef PRINT_YAML |
| printSimulationDataYaml(yamlFile, sim); |
| #endif |
| printSimulationDataYaml(screenOut, sim); |
| |
| Validate* validate = initValidate(sim); // atom counts, energy |
| timestampBarrier("Initialization Finished\n"); |
| |
| timestampBarrier("Starting simulation\n"); |
| |
| // This is the CoMD main loop |
| const int nSteps = sim->nSteps; |
| const int printRate = sim->printRate; |
| int iStep = 0; |
| profileStart(loopTimer); |
| for (; iStep<nSteps;) |
| { |
| startTimer(commReduceTimer); |
| sumAtoms(sim); |
| stopTimer(commReduceTimer); |
| |
| printThings(sim, iStep, getElapsedTime(timestepTimer)); |
| |
| startTimer(timestepTimer); |
| timestep(sim, printRate, sim->dt); |
| stopTimer(timestepTimer); |
| |
| iStep += printRate; |
| } |
| profileStop(loopTimer); |
| |
| sumAtoms(sim); |
| printThings(sim, iStep, getElapsedTime(timestepTimer)); |
| timestampBarrier("Ending simulation\n"); |
| |
| // Epilog |
| validateResult(validate, sim); |
| profileStop(totalTimer); |
| |
| #ifdef PRINT_TIMING |
| printPerformanceResults(sim->atoms->nGlobal, sim->printRate); |
| printPerformanceResultsYaml(yamlFile); |
| #endif |
| |
| destroySimulation(&sim); |
| comdFree(validate); |
| finalizeSubsystems(); |
| |
| timestampBarrier("CoMD Ending\n"); |
| destroyParallel(); |
| |
| return 0; |
| } |
| |
/// Initializes the main CoMD data structure, SimFlat, based on command
/// line input from the user. Also performs sanity checks on the input
/// to screen out nonsensical values.
///
/// Simple data members such as the time step dt are initialized
/// directly. Substructures such as the potential, the link cells, the
/// atoms, etc., are initialized by calling additional initialization
/// functions (initPotential(), initLinkCells(), initAtoms(), etc.).
/// Initialization order is set by the natural dependencies of the
/// substructures; for example, the atoms need the link cells, so the
/// link cells must be initialized before the atoms.
| SimFlat* initSimulation(Command cmd) |
| { |
| SimFlat* sim = comdMalloc(sizeof(SimFlat)); |
| sim->nSteps = cmd.nSteps; |
| sim->printRate = cmd.printRate; |
| sim->dt = cmd.dt; |
| sim->domain = NULL; |
| sim->boxes = NULL; |
| sim->atoms = NULL; |
| sim->ePotential = 0.0; |
| sim->eKinetic = 0.0; |
| sim->atomExchange = NULL; |
| |
| sim->pot = initPotential(cmd.doeam, cmd.potDir, cmd.potName, cmd.potType); |
| real_t latticeConstant = cmd.lat; |
| if (cmd.lat < 0.0) |
| latticeConstant = sim->pot->lat; |
| |
| // ensure input parameters make sense. |
| sanityChecks(cmd, sim->pot->cutoff, latticeConstant, sim->pot->latticeType); |
| |
| sim->species = initSpecies(sim->pot); |
| |
| real3 globalExtent; |
| globalExtent[0] = cmd.nx * latticeConstant; |
| globalExtent[1] = cmd.ny * latticeConstant; |
| globalExtent[2] = cmd.nz * latticeConstant; |
| |
| sim->domain = initDecomposition( |
| cmd.xproc, cmd.yproc, cmd.zproc, globalExtent); |
| |
| sim->boxes = initLinkCells(sim->domain, sim->pot->cutoff); |
| sim->atoms = initAtoms(sim->boxes); |
| |
| // create lattice with desired temperature and displacement. |
| createFccLattice(cmd.nx, cmd.ny, cmd.nz, latticeConstant, sim); |
| setTemperature(sim, cmd.temperature); |
| randomDisplacements(sim, cmd.initialDelta); |
| |
| sim->atomExchange = initAtomHaloExchange(sim->domain, sim->boxes); |
| |
| // Forces must be computed before we call the time stepper. |
| startTimer(redistributeTimer); |
| redistributeAtoms(sim); |
| stopTimer(redistributeTimer); |
| |
| startTimer(computeForceTimer); |
| computeForce(sim); |
| stopTimer(computeForceTimer); |
| |
| kineticEnergy(sim); |
| |
| return sim; |
| } |
| |
/// Frees all data associated with *ps, frees *ps itself, and sets *ps to NULL.
| void destroySimulation(SimFlat** ps) |
| { |
| if ( ! ps ) return; |
| |
| SimFlat* s = *ps; |
| if ( ! s ) return; |
| |
| BasePotential* pot = s->pot; |
| if ( pot) pot->destroy(&pot); |
| destroyLinkCells(&(s->boxes)); |
| destroyAtoms(s->atoms); |
| destroyHaloExchange(&(s->atomExchange)); |
| comdFree(s->species); |
| comdFree(s->domain); |
| comdFree(s); |
| *ps = NULL; |
| |
| return; |
| } |
| |
| void initSubsystems(void) |
| { |
| #if REDIRECT_OUTPUT |
| freopen("testOut.txt","w",screenOut); |
| #endif |
| |
| #ifdef PRINT_YAML |
| yamlBegin(); |
| #endif |
| } |
| |
| void finalizeSubsystems(void) |
| { |
| #if REDIRECT_OUTPUT |
| fclose(screenOut); |
| #endif |
| |
| #ifdef PRINT_YAML |
| yamlEnd(); |
| #endif |
| } |
| |
/// Decides whether to create an LJ or EAM potential, based on the doeam flag.
| BasePotential* initPotential( |
| int doeam, const char* potDir, const char* potName, const char* potType) |
| { |
| BasePotential* pot = NULL; |
| |
| if (doeam) |
| pot = initEamPot(potDir, potName, potType); |
| else |
| pot = initLjPot(); |
| assert(pot); |
| return pot; |
| } |
| |
| SpeciesData* initSpecies(BasePotential* pot) |
| { |
| SpeciesData* species = comdMalloc(sizeof(SpeciesData)); |
| |
| strcpy(species->name, pot->name); |
| species->atomicNo = pot->atomicNo; |
| species->mass = pot->mass; |
| |
| return species; |
| } |
| |
| Validate* initValidate(SimFlat* sim) |
| { |
| sumAtoms(sim); |
| Validate* val = comdMalloc(sizeof(Validate)); |
| val->eTot0 = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal; |
| val->nAtoms0 = sim->atoms->nGlobal; |
| |
| if (printRank()) |
| { |
| fprintf(screenOut, "\n"); |
| printSeparator(screenOut); |
| fprintf(screenOut, "Initial energy : %14.12f, atom count : %d \n", |
| val->eTot0, val->nAtoms0); |
| fprintf(screenOut, "\n"); |
| } |
| return val; |
| } |
| |
| void validateResult(const Validate* val, SimFlat* sim) |
| { |
| if (printRank()) |
| { |
| real_t eFinal = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal; |
| |
| int nAtomsDelta = (sim->atoms->nGlobal - val->nAtoms0); |
| |
| fprintf(screenOut, "\n"); |
| fprintf(screenOut, "\n"); |
| fprintf(screenOut, "Simulation Validation:\n"); |
| |
| fprintf(screenOut, " Initial energy : %14.12f\n", val->eTot0); |
| fprintf(screenOut, " Final energy : %14.12f\n", eFinal); |
| fprintf(screenOut, " eFinal/eInitial : %f\n", eFinal/val->eTot0); |
| if ( nAtomsDelta == 0) |
| { |
| fprintf(screenOut, " Final atom count : %d, no atoms lost\n", |
| sim->atoms->nGlobal); |
| } |
| else |
| { |
| fprintf(screenOut, "#############################\n"); |
| fprintf(screenOut, "# WARNING: %6d atoms lost #\n", nAtomsDelta); |
| fprintf(screenOut, "#############################\n"); |
| } |
| } |
| } |
| |
| void sumAtoms(SimFlat* s) |
| { |
// sum atoms across all processors
| s->atoms->nLocal = 0; |
| for (int i = 0; i < s->boxes->nLocalBoxes; i++) |
| { |
| s->atoms->nLocal += s->boxes->nAtoms[i]; |
| } |
| |
| startTimer(commReduceTimer); |
| addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1); |
| stopTimer(commReduceTimer); |
| } |
| |
/// Prints the current time, energy, performance, etc., to monitor the
/// state of the running simulation. Performance per atom is scaled by
/// the number of local atoms per process; this should give consistent
/// timing assuming reasonable load balance.
| void printThings(SimFlat* s, int iStep, double elapsedTime) |
| { |
// keep track of the previous value of iStep so we can calculate the number of steps.
| static int iStepPrev = -1; |
| static int firstCall = 1; |
| |
| int nEval = iStep - iStepPrev; // gives nEval = 1 for zeroth step. |
| iStepPrev = iStep; |
| |
| if (! printRank() ) |
| return; |
| |
| if (firstCall) |
| { |
| firstCall = 0; |
| fprintf(screenOut, |
| "# Performance\n" |
| "# Loop Time(fs) Total Energy Potential Energy Kinetic Energy Temperature (us/atom) # Atoms\n"); |
| fflush(screenOut); |
| } |
| |
| real_t time = iStep*s->dt; |
| real_t eTotal = (s->ePotential+s->eKinetic) / s->atoms->nGlobal; |
| real_t eK = s->eKinetic / s->atoms->nGlobal; |
| real_t eU = s->ePotential / s->atoms->nGlobal; |
| real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5); |
| |
| double timePerAtom = 1.0e6*elapsedTime/(double)(nEval*s->atoms->nLocal); |
| #ifndef PRINT_TIMING |
| timePerAtom = 0.0; |
| #endif |
| |
| fprintf(screenOut, " %6d %10.2f %18.12f %18.12f %18.12f %12.4f %10.4f %12d\n", |
| iStep, time, eTotal, eU, eK, Temp, timePerAtom, s->atoms->nGlobal); |
| } |
| |
| /// Print information about the simulation in a format that is (mostly) |
| /// YAML compliant. |
| void printSimulationDataYaml(FILE* file, SimFlat* s) |
| { |
| // All ranks get maxOccupancy |
| int maxOcc = maxOccupancy(s->boxes); |
| |
| // Only rank 0 prints |
| if (! printRank()) |
| return; |
| |
| fprintf(file,"Simulation data: \n"); |
| fprintf(file," Total atoms : %d\n", |
| s->atoms->nGlobal); |
| fprintf(file," Min global bounds : [ %14.10f, %14.10f, %14.10f ]\n", |
| s->domain->globalMin[0], s->domain->globalMin[1], s->domain->globalMin[2]); |
| fprintf(file," Max global bounds : [ %14.10f, %14.10f, %14.10f ]\n", |
| s->domain->globalMax[0], s->domain->globalMax[1], s->domain->globalMax[2]); |
| printSeparator(file); |
| fprintf(file,"Decomposition data: \n"); |
| fprintf(file," Processors : %6d,%6d,%6d\n", |
| s->domain->procGrid[0], s->domain->procGrid[1], s->domain->procGrid[2]); |
| fprintf(file," Local boxes : %6d,%6d,%6d = %8d\n", |
| s->boxes->gridSize[0], s->boxes->gridSize[1], s->boxes->gridSize[2], |
| s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]); |
| fprintf(file," Box size : [ %14.10f, %14.10f, %14.10f ]\n", |
| s->boxes->boxSize[0], s->boxes->boxSize[1], s->boxes->boxSize[2]); |
| fprintf(file," Box factor : [ %14.10f, %14.10f, %14.10f ] \n", |
| s->boxes->boxSize[0]/s->pot->cutoff, |
| s->boxes->boxSize[1]/s->pot->cutoff, |
| s->boxes->boxSize[2]/s->pot->cutoff); |
| fprintf(file, " Max Link Cell Occupancy: %d of %d\n", |
| maxOcc, MAXATOMS); |
| printSeparator(file); |
| fprintf(file,"Potential data: \n"); |
| s->pot->print(file, s->pot); |
| |
| // Memory footprint diagnostics |
| int perAtomSize = 10*sizeof(real_t)+2*sizeof(int); |
float mbPerAtom = (float)perAtomSize/1024/1024; // cast avoids integer division
| float totalMemLocal = (float)(perAtomSize*s->atoms->nLocal)/1024/1024; |
| float totalMemGlobal = (float)(perAtomSize*s->atoms->nGlobal)/1024/1024; |
| |
| int nLocalBoxes = s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]; |
| int nTotalBoxes = (s->boxes->gridSize[0]+2)*(s->boxes->gridSize[1]+2)*(s->boxes->gridSize[2]+2); |
| float paddedMemLocal = (float) nLocalBoxes*(perAtomSize*MAXATOMS)/1024/1024; |
| float paddedMemTotal = (float) nTotalBoxes*(perAtomSize*MAXATOMS)/1024/1024; |
| |
| printSeparator(file); |
| fprintf(file,"Memory data: \n"); |
| fprintf(file, " Intrinsic atom footprint = %4d B/atom \n", perAtomSize); |
| fprintf(file, " Total atom footprint = %7.3f MB (%6.2f MB/node)\n", totalMemGlobal, totalMemLocal); |
| fprintf(file, " Link cell atom footprint = %7.3f MB/node\n", paddedMemLocal); |
| fprintf(file, " Link cell atom footprint = %7.3f MB/node (including halo cell data\n", paddedMemTotal); |
| |
| fflush(file); |
| } |
| |
| /// Check that the user input meets certain criteria. |
| void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]) |
| { |
| int failCode = 0; |
| |
| // Check that domain grid matches number of ranks. (fail code 1) |
| int nProcs = cmd.xproc * cmd.yproc * cmd.zproc; |
| if (nProcs != getNRanks()) |
| { |
| failCode |= 1; |
| if (printRank() ) |
| fprintf(screenOut, |
| "\nNumber of MPI ranks must match xproc * yproc * zproc\n"); |
| } |
| |
// Check whether simulation is too small (fail code 2)
| double minx = 2*cutoff*cmd.xproc; |
| double miny = 2*cutoff*cmd.yproc; |
| double minz = 2*cutoff*cmd.zproc; |
| double sizex = cmd.nx*latticeConst; |
| double sizey = cmd.ny*latticeConst; |
| double sizez = cmd.nz*latticeConst; |
| |
| if ( sizex < minx || sizey < miny || sizez < minz) |
| { |
| failCode |= 2; |
| if (printRank()) |
| fprintf(screenOut,"\nSimulation too small.\n" |
| " Increase the number of unit cells to make the simulation\n" |
| " at least (%3.2f, %3.2f. %3.2f) Ansgstroms in size\n", |
| minx, miny, minz); |
| } |
| |
| // Check for supported lattice structure (fail code 4) |
| if (strcasecmp(latticeType, "FCC") != 0) |
| { |
| failCode |= 4; |
| if ( printRank() ) |
| fprintf(screenOut, |
| "\nOnly FCC Lattice type supported, not %s. Fatal Error.\n", |
| latticeType); |
| } |
| int checkCode = failCode; |
| bcastParallel(&checkCode, sizeof(int), 0); |
| // This assertion can only fail if different tasks failed different |
| // sanity checks. That should not be possible. |
| assert(checkCode == failCode); |
| |
| if (failCode != 0) |
| exit(failCode); |
| } |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_building_comd Building CoMD |
| /// |
| /// CoMD is written with portability in mind and should compile using |
| /// practically any compiler that implements the C99 standard. You will |
| /// need to create a Makefile by copying the sample provided with the |
| /// distribution (Makefile.vanilla). |
| /// |
| /// $ cp Makefile.vanilla Makefile |
| /// |
| /// and use the make command to build the code |
| /// |
| /// $ make |
| /// |
| /// The sample Makefile will compile the code on many platforms. See |
| /// comments in Makefile.vanilla for information about specifying the |
| /// name of the C compiler, and/or additional compiler switches that |
| /// might be necessary for your platform. |
| /// |
| /// The main options available in the Makefile are toggling single/double |
| /// precision and enabling/disabling MPI. In the event MPI is not |
| /// available, setting the DO_MPI flag to OFF will create a purely |
| /// serial build (you will likely also need to change the setting of |
| /// CC). |
| /// |
| /// The makefile should handle all the dependency checking needed, via |
| /// makedepend. |
| /// |
| /// 'make clean' removes the object and dependency files. |
| /// |
| /// 'make distclean' additionally removes the executable file and the |
| /// documentation files. |
| /// |
| /// Other build options |
| /// ------------------- |
| /// |
| /// Various other options are made available by \#define arguments within |
| /// some of the source files. |
| /// |
| /// #REDIRECT_OUTPUT in CoMD.c |
| /// |
| /// Setting this to 1 will redirect all screen output to a file, |
| /// currently set to 'testOut.txt'. |
| /// |
| /// #POT_SHIFT in ljForce.c |
| /// |
| /// This is set to 1.0 by default, and shifts the values of the cohesive |
| /// energy given by the Lennard-Jones potential so it is zero at the |
| /// cutoff radius. This setting improves energy conservation |
| /// step-to-step as it reduces the noise generated by atoms crossing the |
| /// cutoff threshold. However, it does not affect the long-term energy |
| /// conservation of the code. |
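///
/// For reference, POT_SHIFT selects the standard truncated-and-shifted
/// form of the Lennard-Jones pair energy,
/// \f[
///    u(r) = 4\epsilon\left[\left(\frac{\sigma}{r}\right)^{12}
///         - \left(\frac{\sigma}{r}\right)^{6}\right] - u_c,
/// \f]
/// where \f$ u_c \f$ is POT_SHIFT times the unshifted pair energy
/// evaluated at the cutoff radius, so the pair energy vanishes at the
/// cutoff.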
| /// |
| /// #MAXATOMS in linkCells.h |
| /// |
| /// The default value is 64, which allows ample padding of the linkCell |
| /// structure to allow for density fluctuations. Reducing it may improve |
| /// the efficiency of the code via improved thread utilization and |
| /// reduced memory footprint. |
| |
| // -------------------------------------------------------------- |
| |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_measuring_performance Measuring Performance |
| /// |
| /// CoMD implements a simple and extensible system of internal timers to |
| /// measure the performance profile of the code. As explained in |
| /// performanceTimers.c, it is easy to create additional timers and |
| /// associate them with code regions of specific interest. In addition, |
| /// the getTime() and getTick() functions can be easily reimplemented to |
| /// take advantage of platform specific timing resources. |
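///
/// For example, to time a new code region, add a handle to the timer
/// enumeration in performanceTimers.h (the name myKernelTimer below is
/// hypothetical) and bracket the region with the same calls used
/// throughout CoMD.c:
///
/// ~~~~
/// startTimer(myKernelTimer);
/// myKernel(sim);            // region of interest
/// stopTimer(myKernelTimer);
/// ~~~~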
| /// |
| /// A timing report is printed at the end of each simulation. |
| /// |
| /// ~~~~ |
| /// Timings for Rank 0 |
| /// Timer # Calls Avg/Call (s) Total (s) % Loop |
| /// ___________________________________________________________________ |
| /// total 1 50.6701 50.6701 100.04 |
| /// loop 1 50.6505 50.6505 100.00 |
| /// timestep 1 50.6505 50.6505 100.00 |
| /// position 10000 0.0000 0.0441 0.09 |
| /// velocity 20000 0.0000 0.0388 0.08 |
| /// redistribute 10001 0.0003 3.4842 6.88 |
| /// atomHalo 10001 0.0002 2.4577 4.85 |
| /// force 10001 0.0047 47.0856 92.96 |
| /// eamHalo 10001 0.0001 1.0592 2.09 |
| /// commHalo 60006 0.0000 1.7550 3.46 |
| /// commReduce 12 0.0000 0.0003 0.00 |
| /// |
| /// Timing Statistics Across 8 Ranks: |
| /// Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s) |
| /// _____________________________________________________________________________ |
| /// total 3: 50.6697 0: 50.6701 50.6699 0.0001 |
| /// loop 0: 50.6505 4: 50.6505 50.6505 0.0000 |
| /// timestep 0: 50.6505 4: 50.6505 50.6505 0.0000 |
| /// position 2: 0.0437 0: 0.0441 0.0439 0.0001 |
| /// velocity 2: 0.0380 4: 0.0392 0.0385 0.0004 |
| /// redistribute 0: 3.4842 1: 3.7085 3.6015 0.0622 |
| /// atomHalo 0: 2.4577 7: 2.6441 2.5780 0.0549 |
| /// force 1: 46.8624 0: 47.0856 46.9689 0.0619 |
| /// eamHalo 3: 0.2269 6: 1.2936 1.0951 0.3344 |
| /// commHalo 3: 1.0803 6: 2.1856 1.9363 0.3462 |
| /// commReduce 6: 0.0002 2: 0.0003 0.0003 0.0000 |
| /// |
| /// --------------------------------------------------- |
| /// Average atom update rate: 9.39 us/atom/task |
| /// --------------------------------------------------- |
| /// |
| /// ~~~~ |
| /// This report consists of two blocks. The upper block lists the absolute |
| /// wall clock time spent in each timer on rank 0 of the job. The lower |
| /// block reports minimum, maximum, average, and standard deviation of |
| /// times across all tasks. |
/// The ranks where the minimum and maximum values occurred are also reported
| /// to aid in identifying hotspots or load imbalances. |
| /// |
| /// The last line of the report gives the atom update rate in |
| /// microseconds/atom/task. Since this quantity is normalized by both |
| /// the number of atoms and the number of tasks it provides a simple |
| /// figure of merit to compare performance between runs with different |
/// numbers of atoms and different numbers of tasks. Any increase in
/// this number relative to a run with a large number of atoms on a
/// single task represents a loss of parallel efficiency.
| /// |
| /// Choosing the problem size correctly has important implications for the |
| /// reported performance. Small problem sizes may run entirely in the cache |
| /// of some architectures, leading to very good performance results. |
| /// For general characterization of performance, it is probably best to |
| /// choose problem sizes which force the code to access main memory, even |
| /// though there may be strong scaling scenarios where the code is indeed |
| /// running mainly in cache. |
| /// |
| /// *** Architecture/Configuration for above timing numbers: |
| /// SGI XE1300 cluster with dual-socket Intel quad-core Nehalem processors. |
/// Each node has 2 Quad-Core Xeon X5550 processors running at 2.66 GHz
| /// with 3 GB of memory per core. |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_problem_selection_and_scaling Problem Selection and Scaling |
| /// |
/// CoMD is a reference implementation of the classical molecular
/// dynamics simulation methods used in materials science.
| /// |
| /// Problem Specification {#sec_problem_spec} |
| /// ====================== |
| /// |
| /// The reference problem is solid Copper starting from a face-centered |
| /// cubic (FCC) lattice. The initial thermodynamic conditions |
/// (temperature, and volume via the lattice spacing, lat) can be
/// specified from the command line input. The default is 600 K and
/// standard volume (lat = 3.615 Angstroms).
/// Different temperatures (e.g., T = 3000 K) and volumes can be
| /// specified to melt the system and enhance the interchange of atoms |
| /// between domains. |
| /// |
| /// The dynamics is micro-canonical (NVE = constant Number of atoms, |
| /// constant total system Volume, and constant total system Energy). As |
| /// a result, the temperature is not fixed. Rather, the temperature will |
| /// adjust from the initial temperature (as specified on the command line) |
| /// to a final temperature as the total system kinetic energy comes into |
| /// equilibrium with the total system potential energy. |
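///
/// For reference, the instantaneous temperature reported during a run
/// is obtained from the kinetic energy by equipartition,
/// T = 2 E_kinetic / (3 N kB), which appears in printThings() as
///
/// ~~~~
/// real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5);
/// ~~~~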
| /// |
| /// The total size of the problem (number of atoms) is specified by the |
| /// number (nx, ny, nz) of FCC unit cells in the x, y, z directions: nAtoms |
| /// = 4 * nx * ny * nz. The default size is nx = ny = nz = 20 or 32,000 atoms. |
| /// |
/// The simulation models bulk copper by replicating the simulation
/// box in every direction using periodic boundary conditions.
| /// |
| /// Two interatomic force models are available: the Lennard-Jones (LJ) |
| /// two-body potential (ljForce.c) and the many-body Embedded-Atom Model (EAM) |
| /// potential (eam.c). The LJ potential is included for comparison and |
| /// is a valid approximation for constant volume and uniform |
| /// density. The EAM potential is a more accurate model of cohesion in |
| /// simple metals like Copper and includes the energetics necessary to |
| /// model non-uniform density and free surfaces. |
| /// |
| /// Scaling Studies in CoMD {#sec_scaling_studies} |
| /// ======================= |
| /// |
| /// CoMD implements a simple geometric domain decomposition to divide |
| /// the total problem space into domains, which are owned by MPI |
| /// ranks. Each domain is a single-program multiple data (SPMD) |
| /// partition of the larger problem. |
| /// |
| /// Caution: When doing scaling studies, it is important to distinguish |
| /// between the problem setup phase and the problem execution phase. Both |
| /// are important to the workflow of doing molecular dynamics, but it |
/// is the execution phase we want to quantify in the scaling studies
/// described below, since it dominates the execution time for long runs
/// (millions of time steps). The problem setup can be an appreciable
/// fraction of the execution time for short runs (the default is 100
/// time steps), which can lead to erroneous conclusions.
| /// |
/// The code is instrumented with timers. Times are reported per
/// particle, and the timers for the force calculation, timestep, etc.,
/// start after the initialization phase is done.
| /// |
| /// Weak Scaling {#ssec_weak_scaling} |
| /// ----------- |
| /// |
/// A weak scaling test fixes the amount of work per processor and
/// compares the execution time as the number of processors grows. Weak
/// scaling keeps the ratio of inter-processor communication (surface) to
/// intra-processor work (volume) fixed. The amount of inter-processor
/// work scales with the number of processors in the domain, and O(1000)
/// atoms per domain are needed for reasonable performance.
| /// |
| /// Examples, |
| /// |
| /// - Increase in processor count by 8: <br> |
| /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=40) |
| /// |
| /// - Increase in processor count by 2: <br> |
| /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=20, nz=40) |
| /// |
/// In general, it is wise to keep the ratio of processor count to
/// system size in each direction fixed (i.e., cubic domains): xproc_0 /
/// nx_0 = xproc_1 / nx_1, since this minimizes surface area to volume.
/// Feel free to experiment; you might learn something about
/// algorithms to optimize communication relative to work.
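///
/// For example, the first weak-scaling pair above might be launched as
/// follows (executable and option names are shown for illustration;
/// see \ref pg_running_comd for the authoritative option list):
///
/// $ mpirun -np 8  ./CoMD-mpi -i 2 -j 2 -k 2 -x 20 -y 20 -z 20
/// $ mpirun -np 64 ./CoMD-mpi -i 4 -j 4 -k 4 -x 40 -y 40 -z 40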
| /// |
| /// Strong Scaling {#ssec_strong_scaling} |
| /// --------------- |
| /// |
| /// A strong scaling test fixes the total problem size and compares the |
| /// execution time for different numbers of processors. Strong scaling |
| /// increases the ratio of inter-processor communication (surface) to |
| /// intra-processor work (volume). |
| /// |
| /// Examples, |
| /// |
| /// - Increase in processor count by 8: <br> |
| /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=20) |
| /// |
| /// - Increase in processor count by 2: <br> |
| /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=nz=20) |
| /// |
| /// The domain decomposition requires O(1000) atoms per domain and |
| /// begins to scale poorly for small numbers of atoms per domain. |
/// Again, feel free to experiment; you might learn something here as
/// well. For example, when molecular dynamics codes were written for
| /// vector supercomputers, large lists of force pairs were created for |
| /// the vector processor. These force lists provide a natural force |
| /// decomposition for early parallel computers (Fast Parallel Algorithms |
| /// for Short-Range Molecular Dynamics, S. J. Plimpton, J Comp Phys, |
| /// 117, 1-19 (1995).) Using replicated data, force decomposition can |
| /// scale to fewer than one atom per processor and is a natural |
| /// mechanism to exploit intra-processor parallelism. |
| /// |
| /// For further details see for example: |
| /// https://support.scinet.utoronto.ca/wiki/index.php/Introduction_To_Performance |
| |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_verifying_correctness Verifying Correctness |
| /// |
| /// Verifying the correctness of an MD simulation is challenging. |
| /// Because MD is Lyapunov unstable, any small errors, even harmless |
| /// round-off errors, will lead to a long-term divergence in the atom |
| /// trajectories. Hence, comparing atom positions at the end of a run |
| /// is not always a useful verification technique. (Such divergences |
| /// are not a problem for science applications of MD since they do not |
| /// alter the statistical physics.) Small, single-particle errors can |
| /// also be difficult to detect in system-wide quantities such as the |
| /// kinetic or potential energy that are averaged over a large number of |
| /// particles. |
| /// |
| /// In spite of these challenges, there are several methods which are |
| /// likely to catch significant errors. |
| /// |
| /// Cohesive Energy {#sec_ver_cohesive_energy} |
| /// =============== |
| /// |
| /// With a perfect lattice as the initial structure (this is the |
| /// default), the potential energy per atom is the cohesive energy. |
| /// This value should be computed correctly to many decimal places. Any |
| /// variation beyond the last 1 or 2 decimal places is cause for |
| /// investigation. The correct values for the cohesive energy are |
| /// |
| /// | Potential | Cohesive Energy | |
| /// | :------------- | :-------------- | |
| /// | Lennard-Jones | -1.243619295058 | |
| /// | EAM (Adams) | -3.538079224691 | |
| /// | EAM (Mishin) | -3.539999969176 | |
| /// |
| /// The \link sec_command_line_options command |
| /// line options \endlink documentation explains the switches used to |
| /// select the potential used in the simulation. |
| /// |
| /// Note that the cohesive energy calculation is not sensitive to errors |
| /// in forces. It is also performed on a highly symmetric structure so |
| /// there are many errors this will not catch. Still, it is a good |
| /// first check. |
| /// |
| /// Energy Conservation {#sec_ver_energy_conservation} |
| /// =================== |
| /// |
| /// A correctly implemented force kernel, with an appropriate time step |
| /// (the default value of 1 fs is conservative for temperatures under |
| /// 10,000K) will conserve total energy over long times to 5 or more |
| /// digits. Any long term systematic drift in the total energy is a |
| /// cause for concern. |
| /// |
/// To facilitate checking energy conservation, CoMD prints the final and
| /// initial values of the total energy. When comparing these values, pay |
| /// careful attention to these details: |
| /// |
| /// - It is common to observe an initial transient change in the total |
| /// energy. Differences in the total energy of 2-3% can be expected in |
| /// the first 10-100 time steps. |
| /// - The best way to check energy conservation is to run at least |
| /// several thousand steps and look at the slope of the total energy |
| /// ignoring at least the first one or two thousand steps. More steps |
| /// are even better. |
| /// - Set the temperature to at least several hundred K. This ensures |
| /// that atoms will sample a large range of configurations and expose |
| /// possible errors. |
| /// - Fluctuations in the energy can make it difficult to tell if |
| /// conservation is observed. Increasing the number of atoms will reduce |
| /// the fluctuations. |
| /// |
| /// |
| /// Particle Conservation {#sec_ver_particle_conservation} |
| /// ===================== |
| /// |
| /// The simulation should always end with the same number of particles |
| /// it started with. Any change is a bug. CoMD checks the initial and |
| /// final number of particles and prints a warning at the end of the |
| /// simulation if they are not equal. |
| /// |
| /// Reproducibility {#sec_ver_reproducibility} |
| /// =============== |
| /// |
| /// The same simulation run repeatedly on the same hardware should |
/// produce the same result. Because parallel computing can add
/// elements of non-determinism, we do not expect perfect long-term
/// reproducibility. However, over a few hundred to a few thousand time
| /// steps the energies should not exhibit run-to-run differences outside |
| /// the last 1 or 2 decimal places. Larger differences are a sign of |
| /// trouble and should be investigated. This kind of test is |
| /// practically the only way to detect race conditions in shared memory |
| /// parallelism. |
| /// |
| /// Portability {#sec_ver_portability} |
| /// =========== |
| /// |
| /// In our experience, simulations that start from the same initial |
| /// condition tend to produce very similar trajectories over short terms |
/// (100 to 1000 time steps), even on different hardware platforms.
| /// Short term differences beyond the last 1 or 2 decimal places should |
| /// likely be investigated. |
| /// |
| /// General Principles {#sec_ver_general} |
| /// ======================= |
| /// |
/// - Simulations run at 0 K are too trivial for verification; set
/// the initial temperature to at least several hundred K.
| /// - Longer runs are better to check conservation. Compare |
| /// energies after initial transients are damped out. |
| /// - Larger runs are better to check conservation. Fluctuations in the |
| /// energy are averaged out. |
/// - Short-term (order 100 time steps) discrepancies from run to run
/// or platform to platform beyond the last one or two decimal places
/// are reason for concern. Differences in the 4th or 5th decimal
/// place are almost certainly a bug.
| /// - Contact the CoMD developers (exmatex-comd@llnl.gov) if you have |
| /// questions about validation. |
| /// |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_comd_architecture CoMD Architecture |
| /// |
| /// Program Flow {#sec_program_flow} |
| /// ============ |
| /// |
| /// We have attempted to make the program flow in CoMD 1.1 as simple and |
| /// transparent as possible. The main program consists of three blocks: |
| /// prolog, main loop, and epilog. |
| /// |
| /// Prolog {#ssec_flow_prolog} |
| /// ------- |
| /// |
| /// The job of the prolog is to initialize the simulation and prepare |
| /// for the main loop. Notable tasks in the prolog include calling |
| /// - initParallel() to start MPI |
| /// - parseCommandLine() to read the command line options |
| /// - initSimulation() to initialize the main data structure, SimFlatSt. |
| /// This includes tasks such as |
| /// - initEamPot() to read tabular data for the potential function |
| /// - initDecomposition() to set up the domain decomposition |
| /// - createFccLattice() to generate an initial structure for the atoms |
| /// - initValidate() to store initial data for a simple validation check |
| /// |
| /// In CoMD 1.1 all atomic structures are internally generated so |
| /// there is no need to read large files with atom coordinate data. |
| /// |
| /// Main Loop {#ssec_flow_main_loop} |
| /// --------- |
| /// |
| /// The main loop calls |
| /// - timestep(), the integrator to update particle positions, |
/// - printThings() to periodically print simulation information
| /// |
| /// The timestep() function is the heart of the code as it choreographs |
| /// updating the particle positions, along with computing forces |
| /// (computeForce()) and communicating atoms between ranks |
| /// (redistributeAtoms()). |
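///
/// In outline, each step in timestep() follows the velocity Verlet
/// pattern (a simplified sketch with argument lists elided; see
/// timestep.c for the actual signatures):
///
/// ~~~~
/// advanceVelocity(...);    // half-step velocity update
/// advancePosition(...);    // full-step position update
/// redistributeAtoms(...);  // move atoms to owning ranks, refresh halos
/// computeForce(...);       // forces at the new positions
/// advanceVelocity(...);    // second half-step velocity update
/// ~~~~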
| /// |
| /// Epilog {#ssec_flow_epilog} |
| /// ------- |
| /// |
/// The epilog code handles end-of-run bookkeeping such as
| /// - validateResult() to check validation |
| /// - printPerformanceResults() to print a performance summary |
| /// - destroySimulation() to free memory |
| /// |
| /// Key Data Structures {#sec_key_data_structures} |
| /// ================== |
| /// |
| /// Practically all data in CoMD belongs to the SimFlatSt structure. |
| /// This includes: |
| /// - BasePotentialSt A polymorphic structure for the potential model |
/// - HaloExchangeSt A polymorphic structure for communicating halo data
| /// - DomainSt The parallel domain decomposition |
| /// - LinkCellSt The link cells |
| /// - AtomsSt The atom coordinates and velocities |
| /// - SpeciesDataSt Properties of the atomic species being simulated. |
| /// |
| /// Consult the individual pages for each of these structures to learn |
| /// more. The descriptions in haloExchange.c and initLinkCells() are |
/// especially useful to understand how the atoms are communicated
| /// between tasks and stored in link cells for fast pair finding. |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_optimization_targets Optimization Targets |
| /// |
| /// Computation {#sec_computation} |
| /// ============ |
| /// |
| /// The computational effort of classical MD is usually highly focused |
| /// in the force kernel. The two force kernels supplied by CoMD are |
| /// eamForce() and ljForce(). Both kernels are fundamentally loops over |
| /// pairs of atoms with significant opportunity to exploit high levels |
| /// of concurrency. One potential challenge when reordering or |
| /// parallelizing the pair loop structure is preventing race conditions |
| /// that result if two concurrent pair evaluations try to simultaneously |
| /// increment the forces and energies on the same atom. |
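///
/// The following hypothetical OpenMP sketch (neither this function nor
/// OpenMP appears in CoMD 1.1) illustrates the hazard and the simplest,
/// though rarely fastest, remedy of atomic updates:
///
/// ~~~~
/// #include <omp.h>
///
/// // Accumulate precomputed pair forces fij into per-atom forces.
/// void accumulateForces(int nPairs, const int (*pairs)[2],
///                       const double (*fij)[3], double (*force)[3])
/// {
///    #pragma omp parallel for
///    for (int p = 0; p < nPairs; ++p)
///    {
///       int i = pairs[p][0];
///       int j = pairs[p][1];
///       for (int d = 0; d < 3; ++d)
///       {
///          // Without the atomics, threads handling pairs that share
///          // an atom would race on these read-modify-writes.
///          #pragma omp atomic
///          force[i][d] += fij[p][d];
///          #pragma omp atomic
///          force[j][d] -= fij[p][d];
///       }
///    }
/// }
/// ~~~~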
| /// |
| /// The supplied EAM kernel uses interpolation from tabular data to |
| /// evaluate functions. Hence the interpolate() function is another |
| /// potential optimization target. Note that the two potential files |
| /// distributed with CoMD have very different sizes. The Adams |
| /// potential (Cu_u6.eam) has 500 points per function in the table while |
| /// the Mishin potential (Cu01.eam.alloy) has 10,000 points per |
| /// function. This difference could potentially impact important |
| /// details such as cache miss rates. |
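///
/// The following hypothetical sketch shows the basic uniform-grid table
/// lookup (linear interpolation for brevity; the real interpolate() in
/// eam.c differs in interpolation order and endpoint handling):
///
/// ~~~~
/// typedef struct
/// {
///    double x0;       // abscissa of the first table entry
///    double invDx;    // inverse of the uniform table spacing
///    int n;           // number of table entries
///    double* values;  // tabulated function values
/// } Table;
///
/// double tableEval(const Table* t, double x)
/// {
///    double r = (x - t->x0) * t->invDx;  // fractional table index
///    int ii = (int)r;
///    if (ii < 0) ii = 0;                 // clamp to the table range
///    if (ii > t->n - 2) ii = t->n - 2;
///    double frac = r - ii;
///    return t->values[ii] + frac*(t->values[ii+1] - t->values[ii]);
/// }
/// ~~~~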
| /// |
| /// Communication {#sec_communication} |
| /// ============= |
| /// |
| /// As the number of atoms per MPI rank decreases, the communication |
| /// routines will start to require a significant fraction of the |
| /// run time. The main communication routine in CoMD is haloExchange(). |
| /// The halo exchange is simple nearest neighbor, point-to-point |
| /// communication so it should scale well to practically any number of |
| /// nodes. |
| /// |
| /// The halo exchange in CoMD 1.1 is a very simple 6-direction |
| /// structured halo exchange (see haloExchange.c). Other exchange |
| /// algorithms can be implemented without much difficulty. |
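///
/// In outline, one exchange sweeps the three coordinate axes in turn,
/// sending to and receiving from both neighbors along each axis (the
/// function names below are hypothetical; haloExchange.c defines the
/// real buffer load/unload and communication routines):
///
/// ~~~~
/// for (int axis = 0; axis < 3; ++axis)  // x, y, z in turn
///    for (int dir = 0; dir < 2; ++dir)  // minus face, then plus face
///    {
///       int nSend = loadFaceBuffer(sendBuf, axis, dir);   // pack border cells
///       int nRecv = exchangeWithNeighbor(sendBuf, nSend,
///                                        recvBuf, axis, dir);
///       unloadFaceBuffer(recvBuf, nRecv, axis, dir);      // fill halo cells
///    }
/// ~~~~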
| /// |
| /// The halo exchange function is called in two very different contexts. |
| /// The main usage is to exchange halo particle information (see |
| /// initAtomHaloExchange()). This process is coordinated by the |
| /// redistributeAtoms() function. |
| /// |
| /// In addition to the atom exchange, when using the EAM potential, a |
| /// halo exchange is performed in the force routine (see |
| /// initForceHaloExchange()). |
| |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_whats_new New Features and Changes in CoMD 1.1 |
| /// |
| /// The main goals of the 1.1 release were to add support for MPI and to |
| /// improve the structure and clarity of the code. Achieving these |
| /// goals required considerable changes compared to the 1.0 release. |
| /// However, the core structure of the most computationally intensive |
| /// kernels (the force routines) is mostly unchanged. We believe that |
| /// lessons learned from optimizing 1.0 force kernels to specific |
| /// hardware or programming models can be quickly transferred to kernels |
| /// in the 1.1 release. |
| /// |
| /// Significant changes in CoMD 1.1 include: |
| /// |
| /// - MPI support. Both MPI and single node serial executables can be |
| /// built from the same source files. |
| /// |
| /// - Improved modularity and code clarity. Major data structures are |
| /// now organized with their own structs and initialization routines. |
| /// |
| /// - The build system has been simplified to use only standard |
| /// Makefiles instead of CMake. |
| /// |
| /// - The halo exchange operation needed to communicate remote particle |
| /// data between MPI ranks also creates "image" particles in the |
| /// serial build. |
| /// |
/// - Unified force kernels for both serial and MPI builds.
| /// |
| /// - The addition of remote/image atoms allows periodic boundary |
| /// conditions to be handled outside the force loop. |
| /// |
| /// - An additional communication/data copy step to handle electron |
| /// density on remote/image atoms has been added to the EAM force |
| /// loop. |
| /// |
| /// - The coordinate system has been simplified to a single global |
| /// coordinate system for all particles. |
| /// |
/// - Evaluation of energies and forces using Chebyshev polynomial
/// fits has been removed. Polynomial approximation of energies and
| /// forces will return in a future CoMD version. |
| /// |
| /// - Atomic structures are now generated internally, eliminating the |
| /// requirement to read, write, and distribute large atom |
| /// configuration files. Arbitrarily large initial structures can |
| /// be generated with specified initial temperature and random |
| /// displacements from lattice positions. Code to read/write atomic |
| /// positions has been removed. |
| /// |
| /// - EAM potentials are now read from standard funcfl and setfl format |
| /// files. Voter style files are no longer supported. |
| /// |
| /// - Collection of performance metrics is significantly improved. |
| /// Developers can easily add new timers to regions of interest. The |
| /// system is also designed to allow easy integration with platform |
| /// specific API's to high resolution timers, cycle counters, |
| /// hardware counters, etc. |
| /// |
| /// |
| /// - Hooks to in-situ analysis and visualization have been removed. |
| /// In-situ analysis capabilities will return in a future CoMD release. |
| /// |
/// Please contact the CoMD developers (exmatex-comd@llnl.gov) if
/// any of the deleted features negatively impacts your work. We
| /// may be able to help produce a custom version that includes the code |
| /// you need. |
| |
| |
| // -------------------------------------------------------------- |
| |
| |
| /// \page pg_md_basics MD Basics |
| /// |
| /// The molecular dynamics (MD) computer simulation method is a well |
| /// established and important tool for the study of the dynamical |
| /// properties of liquids, solids, and other systems of interest in |
| /// Materials Science and Engineering, Chemistry and Biology. A material |
| /// is represented in terms of atoms and molecules. The method of MD |
| /// simulation involves the evaluation of the force acting on each atom |
| /// due to all other atoms in the system and the numerical integration |
| /// of the Newtonian equations of motion. Though MD was initially |
| /// developed to compute the equilibrium thermodynamic behavior of |
| /// materials (equation of state), most recent applications have used MD |
| /// to study non-equilibrium processes. |
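///
/// In symbols, MD numerically integrates Newton's second law for each
/// of the N atoms,
/// \f[
///    m_i \, \ddot{\vec{r}}_i = \vec{F}_i
///        = -\nabla_{\vec{r}_i} U(\vec{r}_1, \ldots, \vec{r}_N),
/// \f]
/// where U is the interatomic potential energy (Lennard-Jones or EAM
/// in CoMD).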
| /// |
/// Wikipedia offers a basic introduction to molecular dynamics with
| /// many references: |
| /// |
| /// http://en.wikipedia.org/wiki/Molecular_dynamics |
| /// |
| /// For a thorough treatment of MD methods, see: |
| /// - "Computer simulation of liquids" by M.P. Allen and D.J. Tildesley |
| /// (Oxford, 1989) |
| /// ISBN-10: 0198556454 | ISBN-13: 978-0198556459. |
| /// |
| /// For an understanding of MD simulations and application to statistical mechanics: |
| /// - "Understanding Molecular Simulation, Second Edition: From Algorithms |
| /// to Applications," by D. Frenkel and B. Smit (Academic Press, 2001) |
| /// ISBN-10: 0122673514 | ISBN-13: 978-0122673511 |
| /// - "Statistical and Thermal Physics: With Computer Applications," by |
| /// H. Gould and J. Tobochnik (Princeton, 2010) |
| /// ISBN-10: 0691137447 | ISBN-13: 978-0691137445 |
| /// |
| /// CoMD implements both the Lennard-Jones Potential (ljForce.c) and the |
| /// Embedded Atom Method Potential (eam.c). |
| /// |