//@HEADER
// ************************************************************************
//
// HPCCG: Simple Conjugate Gradient Benchmark Code
// Copyright (2006) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER
// Changelog
//
// Version 0.3
// - Added timing of setup time for sparse MV
// - Corrected percentages reported for sparse MV with overhead
//
/////////////////////////////////////////////////////////////////////////
// Main routine of a program that either generates a sparse linear
// system for an nx x ny x nz local sub-block on each process, or reads
// a sparse matrix, right hand side vector, exact solution vector and
// initial guess from a file in HPC format. The program then calls the
// HPCCG conjugate gradient solver to solve the problem, and then
// prints results.
// Calling sequence:
// test_HPCCG nx ny nz              (generate the linear system)
// test_HPCCG linear_system_file    (read the linear system from a file)
// Routines called:
// generate_matrix - Generates the linear system from nx, ny and nz
// read_HPC_row - Reads in linear system
// mytimer - Timing routine (compile with -DWALL to get wall clock
// times)
// HPCCG - CG Solver
// compute_residual - Compares HPCCG solution to known solution.
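// Example invocations (executable name and inputs are illustrative and
// may differ for your build):
//   test_HPCCG 50 50 50        Mode 1: generate a 50 x 50 x 50 local sub-block per process
//   test_HPCCG my_system.dat   Mode 2: read the system from an HPC-format data file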
#include <iostream>
using std::cout;
using std::cerr;
using std::endl;
#include <cstdio>
#include <cstdlib>
#include <cctype>
#include <cassert>
#include <string>
#include <cmath>
#ifdef USING_MPI
#include <mpi.h> // If this routine is compiled with -DUSING_MPI
// then include mpi.h
#include "make_local_matrix.hpp" // Also include this function
#endif
#ifdef USING_OMP
#include <omp.h>
#endif
#include "generate_matrix.hpp"
#include "read_HPC_row.hpp"
#include "mytimer.hpp"
#include "HPC_sparsemv.hpp"
#include "compute_residual.hpp"
#include "HPCCG.hpp"
#include "HPC_Sparse_Matrix.hpp"
#include "dump_matlab_matrix.hpp"
#include "YAML_Element.hpp"
#include "YAML_Doc.hpp"
#undef DEBUG
int main(int argc, char *argv[])
{
HPC_Sparse_Matrix *A;
double *x, *b, *xexact;
double norm, d;
int ierr = 0;
int i, j;
int ione = 1;
double times[7];
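// Timing slots, as interpreted by the reporting code below:
// times[0] = total CG time, times[1] = DDOT, times[2] = WAXPBY,
// times[3] = SPARSEMV, times[4] = MPI_Allreduce inside DDOT,
// times[5] = boundary exchange, times[6] = local matrix setup (set below).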
double t6 = 0.0;
int nx = 0, ny = 0, nz = 0; // Remain 0 when the linear system is read from a file
#ifdef USING_MPI
MPI_Init(&argc, &argv);
int size, rank; // Number of MPI processes, My process ID
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// if (size < 100) cout << "Process "<<rank<<" of "<<size<<" is alive." <<endl;
#else
int size = 1; // Serial case (not using MPI)
int rank = 0;
#endif
#ifdef DEBUG
if (rank==0)
{
int junk = 0;
cout << "Press enter to continue"<< endl;
std::cin >> junk;
}
#ifdef USING_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
#endif
if(argc != 2 && argc!=4) {
if (rank==0)
cerr << "Usage:" << endl
<< "Mode 1: " << argv[0] << " nx ny nz" << endl
<< " where nx, ny and nz are the local sub-block dimensions, or" << endl
<< "Mode 2: " << argv[0] << " HPC_data_file " << endl
<< " where HPC_data_file is a globally accessible file containing matrix data." << endl;
exit(1);
}
if (argc==4)
{
nx = atoi(argv[1]);
ny = atoi(argv[2]);
nz = atoi(argv[3]);
generate_matrix(nx, ny, nz, &A, &x, &b, &xexact);
}
else
{
read_HPC_row(argv[1], &A, &x, &b, &xexact);
}
bool dump_matrix = false;
if (dump_matrix && size<=4) dump_matlab_matrix(A, rank);
#ifdef USING_MPI
// Transform matrix indices from global to local values.
// Define number of columns for the local matrix.
t6 = mytimer(); make_local_matrix(A); t6 = mytimer() - t6;
times[6] = t6;
#endif
double t1 = mytimer(); // Initialize it (if needed)
int niters = 0;
double normr = 0.0;
int max_iter = 150;
double tolerance = 0.0; // Set tolerance to zero to make all runs do max_iter iterations
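// Run the CG solve. On return, niters holds the iteration count, normr
// the final residual norm, and times[] the kernel timings (as assumed
// from how these values are reported below).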
ierr = HPCCG( A, b, x, max_iter, tolerance, niters, normr, times);
if (ierr) cerr << "Error in call to CG: " << ierr << ".\n" << endl;
#ifdef USING_MPI
double t4 = times[4];
double t4min = 0.0;
double t4max = 0.0;
double t4avg = 0.0;
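// Reduce the per-rank DDOT MPI_Allreduce time to its min, max and mean;
// the spread across ranks is a rough indicator of load imbalance and
// synchronization cost, and is reported as "DDOT Timing Variations".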
MPI_Allreduce(&t4, &t4min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&t4, &t4max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(&t4, &t4avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
t4avg = t4avg/((double) size);
#endif
// initialize YAML doc
if (rank==0) // Only PE 0 needs to compute and report timing results
{
double fniters = niters;
double fnrow = A->total_nrow;
double fnnz = A->total_nnz;
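// The flop-count constants below follow the usual CG accounting
// (two dot products, three WAXPBY updates and one sparse MV per
// iteration, at 2 flops per vector entry or matrix nonzero).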
double fnops_ddot = fniters*4*fnrow;
double fnops_waxpby = fniters*6*fnrow;
double fnops_sparsemv = fniters*2*fnnz;
double fnops = fnops_ddot+fnops_waxpby+fnops_sparsemv;
YAML_Doc doc("hpccg", "1.0");
doc.add("Parallelism","");
#ifdef USING_MPI
doc.get("Parallelism")->add("Number of MPI ranks",size);
#else
doc.get("Parallelism")->add("MPI not enabled","");
#endif
#ifdef USING_OMP
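// Determine the OpenMP thread count by entering a parallel region;
// every thread stores the same value, so the last write wins harmlessly.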
int nthreads = 1;
#pragma omp parallel
nthreads = omp_get_num_threads();
doc.get("Parallelism")->add("Number of OpenMP threads",nthreads);
#else
doc.get("Parallelism")->add("OpenMP not enabled","");
#endif
doc.add("Dimensions","");
doc.get("Dimensions")->add("nx",nx);
doc.get("Dimensions")->add("ny",ny);
doc.get("Dimensions")->add("nz",nz);
doc.add("Number of iterations: ", niters);
doc.add("Final residual: ", normr);
#ifdef PERFORMACE_SUM
doc.add("********** Performance Summary (times in sec) ***********","");
doc.add("Time Summary","");
doc.get("Time Summary")->add("Total ",times[0]);
doc.get("Time Summary")->add("DDOT ",times[1]);
doc.get("Time Summary")->add("WAXPBY ",times[2]);
doc.get("Time Summary")->add("SPARSEMV",times[3]);
doc.add("FLOPS Summary","");
doc.get("FLOPS Summary")->add("Total ",fnops);
doc.get("FLOPS Summary")->add("DDOT ",fnops_ddot);
doc.get("FLOPS Summary")->add("WAXPBY ",fnops_waxpby);
doc.get("FLOPS Summary")->add("SPARSEMV",fnops_sparsemv);
doc.add("MFLOPS Summary","");
doc.get("MFLOPS Summary")->add("Total ",fnops/times[0]/1.0E6);
doc.get("MFLOPS Summary")->add("DDOT ",fnops_ddot/times[1]/1.0E6);
doc.get("MFLOPS Summary")->add("WAXPBY ",fnops_waxpby/times[2]/1.0E6);
doc.get("MFLOPS Summary")->add("SPARSEMV",fnops_sparsemv/(times[3])/1.0E6);
#endif
#ifdef USING_MPI
doc.add("DDOT Timing Variations","");
doc.get("DDOT Timing Variations")->add("Min DDOT MPI_Allreduce time",t4min);
doc.get("DDOT Timing Variations")->add("Max DDOT MPI_Allreduce time",t4max);
doc.get("DDOT Timing Variations")->add("Avg DDOT MPI_Allreduce time",t4avg);
double totalSparseMVTime = times[3] + times[5]+ times[6];
doc.add("SPARSEMV OVERHEADS","");
doc.get("SPARSEMV OVERHEADS")->add("SPARSEMV MFLOPS W OVERHEAD",fnops_sparsemv/(totalSparseMVTime)/1.0E6);
doc.get("SPARSEMV OVERHEADS")->add("SPARSEMV PARALLEL OVERHEAD Time", (times[5]+times[6]));
doc.get("SPARSEMV OVERHEADS")->add("SPARSEMV PARALLEL OVERHEAD Pct", (times[5]+times[6])/totalSparseMVTime*100.0);
doc.get("SPARSEMV OVERHEADS")->add("SPARSEMV PARALLEL OVERHEAD Setup Time", (times[6]));
doc.get("SPARSEMV OVERHEADS")->add("SPARSEMV PARALLEL OVERHEAD Setup Pct", (times[6])/totalSparseMVTime*100.0);
doc.get("SPARSEMV OVERHEADS")->add("SPARSEMV PARALLEL OVERHEAD Bdry Exch Time", (times[5]));
doc.get("SPARSEMV OVERHEADS")->add("SPARSEMV PARALLEL OVERHEAD Bdry Exch Pct", (times[5])/totalSparseMVTime*100.0);
#endif
std::string yaml = doc.generateYAML();
cout << yaml;
}
// Compute difference between known exact solution and computed solution
// All processors are needed here.
double residual = 0;
// if ((ierr = compute_residual(A->local_nrow, x, xexact, &residual)))
// cerr << "Error in call to compute_residual: " << ierr << ".\n" << endl;
// if (rank==0)
// cout << "Difference between computed and exact = "
// << residual << ".\n" << endl;
// Finish up
#ifdef USING_MPI
MPI_Finalize();
#endif
return 0 ;
}