blob: 83d4734119ee6c36a4d2888fb342f63c19003adb [file] [log] [blame]
/// \file
/// Performance timer functions.
///
/// Use the timer functionality to collect timing and number of calls
/// information for chosen computations (such as force calls) and
/// communication (such as sends, receives, reductions). Timing results
/// are reported at the end of the run showing overall timings and
/// statistics of timings across ranks.
///
/// A new timer can be added as follows:
/// -# add new handle to the TimerHandle in performanceTimers.h
/// -# provide a corresponding name in timerName
///
/// Note that the order of the handles and names must be the
/// same. This order also determines the order in which the timers are
/// printed. Names can contain leading spaces to show a hierarchical
/// ordering. Timers with zero calls are omitted from the report.
///
/// Raw timer data is obtained from the getTime() and getTick()
/// functions. The supplied portable versions of these functions can be
/// replaced with platform specific versions for improved accuracy or
/// lower latency.
/// \see TimerHandle
/// \see getTime
/// \see getTick
///
#include "performanceTimers.h"
#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <math.h>
#include "performanceTimers.h"
#include "mytype.h"
#include "parallel.h"
#include "yamlOutput.h"
static uint64_t getTime(void);
static double getTick(void);
static void timerStats(void);
/// You must add timer name in same order as enum in .h file.
/// Leading spaces can be specified to show a hierarchy of timers.
char* timerName[numberOfTimers] = {
"total",
"loop",
"timestep",
" position",
" velocity",
" redistribute",
" atomHalo",
" force",
" eamHalo",
"commHalo",
"commReduce"
};
/// Timer data collected. Also facilitates computing averages and
/// statistics.
typedef struct TimersSt
{
uint64_t start; //!< call start time
uint64_t total; //!< current total time
uint64_t count; //!< current call count
uint64_t elapsed; //!< lap time
int minRank; //!< rank with min value
int maxRank; //!< rank with max value
double minValue; //!< min over ranks
double maxValue; //!< max over ranks
double average; //!< average over ranks
double stdev; //!< stdev across ranks
} Timers;
/// Global timing data collected.
typedef struct TimerGlobalSt
{
double atomRate; //!< average time (us) per atom per rank
double atomAllRate; //!< average time (us) per atom
double atomsPerUSec; //!< average atoms per time (us)
} TimerGlobal;
static Timers perfTimer[numberOfTimers];
static TimerGlobal perfGlobal;
void profileStart(const enum TimerHandle handle)
{
perfTimer[handle].start = getTime();
}
void profileStop(const enum TimerHandle handle)
{
perfTimer[handle].count += 1;
uint64_t delta = getTime() - perfTimer[handle].start;
perfTimer[handle].total += delta;
perfTimer[handle].elapsed += delta;
}
/// \details
/// Return elapsed time (in seconds) since last call with this handle
/// and clear for next lap.
double getElapsedTime(const enum TimerHandle handle)
{
double etime = getTick() * (double)perfTimer[handle].elapsed;
perfTimer[handle].elapsed = 0;
return etime;
}
/// \details
/// The report contains two blocks. The upper block is performance
/// information for the printRank. The lower block is statistical
/// information over all ranks.
void printPerformanceResults(int nGlobalAtoms, int printRate)
{
// Collect timer statistics overall and across ranks
timerStats();
if (!printRank())
return;
// only print timers with non-zero values.
double tick = getTick();
double loopTime = perfTimer[loopTimer].total*tick;
fprintf(screenOut, "\n\nTimings for Rank %d\n", getMyRank());
fprintf(screenOut, " Timer # Calls Avg/Call (s) Total (s) %% Loop\n");
fprintf(screenOut, "___________________________________________________________________\n");
for (int ii=0; ii<numberOfTimers; ++ii)
{
double totalTime = perfTimer[ii].total*tick;
if (perfTimer[ii].count > 0)
fprintf(screenOut, "%-16s%12"PRIu64" %8.4f %8.4f %8.2f\n",
timerName[ii],
perfTimer[ii].count,
totalTime/(double)perfTimer[ii].count,
totalTime,
totalTime/loopTime*100.0);
}
fprintf(screenOut, "\nTiming Statistics Across %d Ranks:\n", getNRanks());
fprintf(screenOut, " Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s)\n");
fprintf(screenOut, "_____________________________________________________________________________\n");
for (int ii = 0; ii < numberOfTimers; ++ii)
{
if (perfTimer[ii].count > 0)
fprintf(screenOut, "%-16s%6d:%10.4f %6d:%10.4f %10.4f %10.4f\n",
timerName[ii],
perfTimer[ii].minRank, perfTimer[ii].minValue*tick,
perfTimer[ii].maxRank, perfTimer[ii].maxValue*tick,
perfTimer[ii].average*tick, perfTimer[ii].stdev*tick);
}
double atomsPerTask = nGlobalAtoms/(real_t)getNRanks();
perfGlobal.atomRate = perfTimer[timestepTimer].average * tick * 1e6 /
(atomsPerTask * perfTimer[timestepTimer].count * printRate);
perfGlobal.atomAllRate = perfTimer[timestepTimer].average * tick * 1e6 /
(nGlobalAtoms * perfTimer[timestepTimer].count * printRate);
perfGlobal.atomsPerUSec = 1.0 / perfGlobal.atomAllRate;
fprintf(screenOut, "\n---------------------------------------------------\n");
fprintf(screenOut, " Average atom update rate: %6.2f us/atom/task\n", perfGlobal.atomRate);
fprintf(screenOut, "---------------------------------------------------\n\n");
fprintf(screenOut, "\n---------------------------------------------------\n");
fprintf(screenOut, " Average all atom update rate: %6.2f us/atom\n", perfGlobal.atomAllRate);
fprintf(screenOut, "---------------------------------------------------\n\n");
fprintf(screenOut, "\n---------------------------------------------------\n");
fprintf(screenOut, " Average atom rate: %6.2f atoms/us\n", perfGlobal.atomsPerUSec);
fprintf(screenOut, "---------------------------------------------------\n\n");
}
void printPerformanceResultsYaml(FILE* file)
{
if (! printRank())
return;
double tick = getTick();
double loopTime = perfTimer[loopTimer].total*tick;
fprintf(file,"\nPerformance Results:\n");
fprintf(file, " TotalRanks: %d\n", getNRanks());
fprintf(file, " ReportingTimeUnits: seconds\n");
fprintf(file, "Performance Results For Rank %d:\n", getMyRank());
for (int ii = 0; ii < numberOfTimers; ii++)
{
if (perfTimer[ii].count > 0)
{
double totalTime = perfTimer[ii].total*tick;
fprintf(file, " Timer: %s\n", timerName[ii]);
fprintf(file, " CallCount: %"PRIu64"\n", perfTimer[ii].count);
fprintf(file, " AvgPerCall: %8.4f\n", totalTime/(double)perfTimer[ii].count);
fprintf(file, " Total: %8.4f\n", totalTime);
fprintf(file, " PercentLoop: %8.2f\n", totalTime/loopTime*100);
}
}
fprintf(file, "Performance Results Across Ranks:\n");
for (int ii = 0; ii < numberOfTimers; ii++)
{
if (perfTimer[ii].count > 0)
{
fprintf(file, " Timer: %s\n", timerName[ii]);
fprintf(file, " MinRank: %d\n", perfTimer[ii].minRank);
fprintf(file, " MinTime: %8.4f\n", perfTimer[ii].minValue*tick);
fprintf(file, " MaxRank: %d\n", perfTimer[ii].maxRank);
fprintf(file, " MaxTime: %8.4f\n", perfTimer[ii].maxValue*tick);
fprintf(file, " AvgTime: %8.4f\n", perfTimer[ii].average*tick);
fprintf(file, " StdevTime: %8.4f\n", perfTimer[ii].stdev*tick);
}
}
fprintf(file,"Performance Global Update Rates:\n");
fprintf(file, " AtomUpdateRate:\n");
fprintf(file, " AverageRate: %6.2f\n", perfGlobal.atomRate);
fprintf(file, " Units: us/atom/task\n");
fprintf(file, " AllAtomUpdateRate:\n");
fprintf(file, " AverageRate: %6.2f\n", perfGlobal.atomAllRate);
fprintf(file, " Units: us/atom\n");
fprintf(file, " AtomRate:\n");
fprintf(file, " AverageRate: %6.2f\n", perfGlobal.atomsPerUSec);
fprintf(file, " Units: atoms/us\n");
fprintf(file, "\n");
}
/// Returns current time as a 64-bit integer. This portable version
/// returns the number of microseconds since mindight, Jamuary 1, 1970.
/// Hence, timing data will have a resolution of 1 microsecond.
/// Platforms with access to calls with lower latency or higher
/// resolution (such as a cycle counter) may wish to replace this
/// implementation and change the conversion factor in getTick as
/// appropriate.
/// \see getTick for the conversion factor between the integer time
/// units of this function and seconds.
static uint64_t getTime(void)
{
struct timeval ptime;
uint64_t t = 0;
gettimeofday(&ptime, (struct timezone *)NULL);
t = ((uint64_t)1000000)*(uint64_t)ptime.tv_sec + (uint64_t)ptime.tv_usec;
return t;
}
/// Returns the factor for converting the integer time reported by
/// getTime into seconds. The portable getTime returns values in units
/// of microseconds so the conversion is simply 1e-6.
/// \see getTime
static double getTick(void)
{
double seconds_per_cycle = 1.0e-6;
return seconds_per_cycle;
}
/// Collect timer statistics across ranks.
void timerStats(void)
{
double sendBuf[numberOfTimers], recvBuf[numberOfTimers];
// Determine average of each timer across ranks
for (int ii = 0; ii < numberOfTimers; ii++)
sendBuf[ii] = (double)perfTimer[ii].total;
addDoubleParallel(sendBuf, recvBuf, numberOfTimers);
for (int ii = 0; ii < numberOfTimers; ii++)
perfTimer[ii].average = recvBuf[ii] / (double)getNRanks();
// Determine min and max across ranks and which rank
RankReduceData reduceSendBuf[numberOfTimers], reduceRecvBuf[numberOfTimers];
for (int ii = 0; ii < numberOfTimers; ii++)
{
reduceSendBuf[ii].val = (double)perfTimer[ii].total;
reduceSendBuf[ii].rank = getMyRank();
}
minRankDoubleParallel(reduceSendBuf, reduceRecvBuf, numberOfTimers);
for (int ii = 0; ii < numberOfTimers; ii++)
{
perfTimer[ii].minValue = reduceRecvBuf[ii].val;
perfTimer[ii].minRank = reduceRecvBuf[ii].rank;
}
maxRankDoubleParallel(reduceSendBuf, reduceRecvBuf, numberOfTimers);
for (int ii = 0; ii < numberOfTimers; ii++)
{
perfTimer[ii].maxValue = reduceRecvBuf[ii].val;
perfTimer[ii].maxRank = reduceRecvBuf[ii].rank;
}
// Determine standard deviation
for (int ii = 0; ii < numberOfTimers; ii++)
{
double temp = (double)perfTimer[ii].total - perfTimer[ii].average;
sendBuf[ii] = temp * temp;
}
addDoubleParallel(sendBuf, recvBuf, numberOfTimers);
for (int ii = 0; ii < numberOfTimers; ii++)
{
perfTimer[ii].stdev = sqrt(recvBuf[ii] / (double) getNRanks());
}
}