MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/driver.hpp - third_party/llvm-test-suite - Git at Google

 #ifndef _driver_hpp_
 #define _driver_hpp_

 //@HEADER
 // ************************************************************************
 //
 // MiniFE: Simple Finite Element Assembly and Solve
 // Copyright (2006-2013) Sandia Corporation
 //
 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 // license for use of this work by or on behalf of the U.S. Government.
 //
 // This library is free software; you can redistribute it and/or modify
 // it under the terms of the GNU Lesser General Public License as
 // published by the Free Software Foundation; either version 2.1 of the
 // License, or (at your option) any later version.
 //
 // This library is distributed in the hope that it will be useful, but
 // WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 // Lesser General Public License for more details.
 //
 // You should have received a copy of the GNU Lesser General Public
 // License along with this library; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 // USA
 //
 // ************************************************************************
 //@HEADER

 #include <cstddef>
 #include <cmath>
 #include <cstdlib>
 #include <iostream>
 #include <sstream>
 #include <iomanip>

 #include <box_utils.hpp>
 #include <Vector.hpp>

 #ifdef MINIFE_CSR_MATRIX
 #include <CSRMatrix.hpp>
 #elif defined(MINIFE_ELL_MATRIX)
 #include <ELLMatrix.hpp>
 #else
 #include <CSRMatrix.hpp>
 #endif

 #include <simple_mesh_description.hpp>

 #include <SparseMatrix_functions.hpp>

 #include <generate_matrix_structure.hpp>
 #include <assemble_FE_data.hpp>

 #include <verify_solution.hpp>

 #include <compute_matrix_stats.hpp>
 #include <make_local_matrix.hpp>
 #include <imbalance.hpp>
 #include <cg_solve.hpp>
 #if MINIFE_KERNELS != 0
 #include <time_kernels.hpp>
 #endif
 #include <outstream.hpp>
 #include <utils.hpp>
 #include <mytimer.hpp>
 #include <YAML_Doc.hpp>

 #ifdef HAVE_MPI
 #include <mpi.h>
 #endif

 #define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \
 {                                   \
   /*if (myproc==0) {                  \
     std::cout.width(30);            \
     std::cout << msg;               \
     std::cout.flush();              \
   }*/                                 \
   timer_type rtf_t0 = mytimer();    \
   fn;                               \
   time_inc = mytimer() - rtf_t0;    \
   time_total += time_inc;           \
   /*if (myproc==0) {                  \
     std::cout << time_inc << "s, total time: " << time_total << std::endl; \
   }*/                                 \
 }

 //This program assembles finite-element matrices into a global matrix and
 //vector, then solves the linear-system using Conjugate Gradients.
 //Each finite-element is a hexahedron with 8 vertex-nodes.
 //
 //Notes:
 //- In finite-element terms, the box dimensions are in elements, not nodes.
 //  In other words, a 2x2x2 box describes 8 elements, each of which has 8 nodes,
 //  so it is a 3x3x3 node domain (27 nodes).
 //  The assembled linear system will have 1 equation for each finite element node.
 //
 //- The coordinate origin is at the corner of the global box where x=0,
 //  y=0, z=0, and the box extends along the positive x-axis, positive y-axis,
 //  and the positive z-axis.
 //
 //- Some aspects of matrix-structure generation and finite-element assembly
 //  are convenient to do using global node identifiers.
 //  A global identifier for each node is obtained from coordinates plus
 //  global box dimensions. See the function 'get_id' in box_utils.hpp.
 //
 //- Each node corresponds to a row in the matrix. The RCB partitioning method
 //  we use to split the global box among processors results in some
 //  processors owning non-contiguous blocks of global node identifiers.
 //  Since it is convenient for matrices and vectors to store contiguously-
 //  numbered blocks of rows, we map global node identifiers to a separate
 //  space of row numbers such that each processor's nodes correspond to a
 //  contiguous block of row numbers.
 //

 namespace miniFE {

 template<typename Scalar,
          typename LocalOrdinal,
          typename GlobalOrdinal>
 int
 driver(const Box& global_box, Box& my_box,
        Parameters& params, YAML_Doc& ydoc)
 {
   int global_nx = global_box[0][1];
   int global_ny = global_box[1][1];
   int global_nz = global_box[2][1];

   int numprocs = 1, myproc = 0;
 #ifdef HAVE_MPI
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
 #endif

   if (params.load_imbalance > 0) {
     add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
   }

   float largest_imbalance = 0, std_dev = 0;
   compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
                                    std_dev, ydoc, true);


   //Create a representation of the mesh:
   //Note that 'simple_mesh_description' is a virtual or conceptual
   //mesh that doesn't actually store mesh data.
 #ifdef TIME_IT
   if (myproc==0) {
     std::cout.width(30);
     std::cout << "creating/filling mesh...";
     std::cout.flush();
   }
 #endif

   timer_type t_start = mytimer();
   timer_type t0 = mytimer();

   simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);

   timer_type mesh_fill = mytimer() - t0;
   timer_type t_total = mytimer() - t_start;

 #ifdef TIME_IT
   if (myproc==0) {
     std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
   }
 #endif

   //next we will generate the matrix structure.

   //Declare matrix object:

 #if defined(MINIFE_ELL_MATRIX)
   typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
 #else
   typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
 #endif

   MatrixType A;

   timer_type gen_structure;
   RUN_TIMED_FUNCTION("generating matrix structure...",
                      generate_matrix_structure(mesh, A),
                      gen_structure, t_total);

   GlobalOrdinal local_nrows = A.rows.size();
   GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;

   Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
   Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);

   //Assemble finite-element sub-matrices and sub-vectors into the global
   //linear system:

   timer_type fe_assembly;
   RUN_TIMED_FUNCTION("assembling FE data...",
                      assemble_FE_data(mesh, A, b, params),
                      fe_assembly, t_total);

   if (myproc == 0) {
     ydoc.add("Matrix structure generation","");
     ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
     ydoc.add("FE assembly","");
     ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
   }

 #ifdef MINIFE_DEBUG
   write_matrix("A_prebc.mtx", A);
   write_vector("b_prebc.vec", b);
 #endif

   //Now apply dirichlet boundary-conditions
   //(Apply the 0-valued surfaces first, then the 1-valued surface last.)

   timer_type dirbc_time;
   RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
             impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total);
   RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
             impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total);

 #ifdef MINIFE_DEBUG
   write_matrix("A.mtx", A);
   write_vector("b.vec", b);
 #endif

   //Transform global indices to local, set up communication information:

   timer_type make_local_time;
   RUN_TIMED_FUNCTION("making matrix indices local...",
                      make_local_matrix(A),
                      make_local_time, t_total);

 #ifdef MINIFE_DEBUG
   write_matrix("A_local.mtx", A);
   write_vector("b_local.vec", b);
 #endif

   size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);

   //Prepare to perform conjugate gradient solve:

   LocalOrdinal max_iters = 200;
   LocalOrdinal num_iters = 0;
   typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
   magnitude rnorm = 0;
   magnitude tol = std::numeric_limits<magnitude>::epsilon();

   timer_type cg_times[NUM_TIMERS];

   typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;

   t_total = mytimer() - t_start;

   bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;

   int verify_result = 0;

 #if MINIFE_KERNELS != 0
   if (myproc==0) {
     std::cout.width(30);
     std::cout << "Starting kernel timing loops ..." << std::endl;
   }

   max_iters = 500;
   x.coefs[0] = 0.9;
   if (matvec_with_comm_overlap) {
     time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
   }
   else {
     time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
   }
   num_iters = max_iters;
   std::string title("Kernel timings");
 #else
   if (myproc==0) {
     std::cout << "Starting CG solver ... " << std::endl;
   }

   if (matvec_with_comm_overlap) {
 #ifdef MINIFE_CSR_MATRIX
     rearrange_matrix_local_external(A);
     cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
            num_iters, rnorm, cg_times);
 #else
     std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
 #endif
   }
   else {
     cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
            num_iters, rnorm, cg_times);
     if (myproc == 0) {
       std::cout << "Final Resid Norm: " << rnorm << std::endl;
     }

     if (params.verify_solution > 0) {
       double tolerance = 0.06;
       bool verify_whole_domain = false;
   #ifdef MINIFE_DEBUG
       verify_whole_domain = true;
   #endif
       if (myproc == 0) {
         if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
         else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
       }
       verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
     }
   }

 #ifdef MINIFE_DEBUG
   write_vector("x.vec", x);
 #endif
   std::string title("CG solve");
 #endif

   if (myproc == 0) {
     ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
     ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
     ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
     ydoc.add(title,"");
     ydoc.get(title)->add("Iterations",num_iters);
     ydoc.get(title)->add("Final Resid Norm",rnorm);

     GlobalOrdinal global_nrows = global_nx;
     global_nrows *= global_ny*global_nz;

     //flops-per-mv, flops-per-dot, flops-per-waxpy:
     double mv_flops = global_nnz*2.0;
     double dot_flops = global_nrows*2.0;
     double waxpy_flops = global_nrows*3.0;

 #if MINIFE_KERNELS == 0
 //if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
 //there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
     mv_flops *= (num_iters+1);
     dot_flops *= (2*num_iters);
     waxpy_flops *= (3*num_iters+2);
 #else
 //if MINIFE_KERNELS then we did one of each operation per iteration.
     mv_flops *= num_iters;
     dot_flops *= num_iters;
     waxpy_flops *= num_iters;
 #endif

     double total_flops = mv_flops + dot_flops + waxpy_flops;

     double mv_mflops = -1;
     if (cg_times[MATVEC] > 1.e-4)
       mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);

     double dot_mflops = -1;
     if (cg_times[DOT] > 1.e-4)
       dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);

     double waxpy_mflops = -1;
     if (cg_times[WAXPY] > 1.e-4)
       waxpy_mflops = 1.e-6 *  (waxpy_flops/cg_times[WAXPY]);

     double total_mflops = -1;
     if (cg_times[TOTAL] > 1.e-4)
       total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);

     ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
     ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
     if (waxpy_mflops >= 0)
       ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
     else
       ydoc.get(title)->add("WAXPY Mflops","inf");

     ydoc.get(title)->add("DOT Time",cg_times[DOT]);
     ydoc.get(title)->add("DOT Flops",dot_flops);
     if (dot_mflops >= 0)
       ydoc.get(title)->add("DOT Mflops",dot_mflops);
     else
       ydoc.get(title)->add("DOT Mflops","inf");

     ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
     ydoc.get(title)->add("MATVEC Flops",mv_flops);
     if (mv_mflops >= 0)
       ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
     else
       ydoc.get(title)->add("MATVEC Mflops","inf");

 #ifdef MINIFE_FUSED
     ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
     ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
     if (mv_mflops >= 0)
       ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
     else
       ydoc.get(title)->add("MATVECDOT Mflops","inf");
 #endif

 #if MINIFE_KERNELS == 0
     ydoc.get(title)->add("Total","");
     ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
     ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
     if (total_mflops >= 0)
       ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
     else
       ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
     ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
 #endif
   }

   return verify_result;
 }

 }//namespace miniFE

 #endif
	#ifndef _driver_hpp_
	#define _driver_hpp_

	//@HEADER
	// ************************************************************************
	//
	// MiniFE: Simple Finite Element Assembly and Solve
	// Copyright (2006-2013) Sandia Corporation
	//
	// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
	// license for use of this work by or on behalf of the U.S. Government.
	//
	// This library is free software; you can redistribute it and/or modify
	// it under the terms of the GNU Lesser General Public License as
	// published by the Free Software Foundation; either version 2.1 of the
	// License, or (at your option) any later version.
	//
	// This library is distributed in the hope that it will be useful, but
	// WITHOUT ANY WARRANTY; without even the implied warranty of
	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	// Lesser General Public License for more details.
	//
	// You should have received a copy of the GNU Lesser General Public
	// License along with this library; if not, write to the Free Software
	// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	// USA
	//
	// ************************************************************************
	//@HEADER

	#include <cstddef>
	#include <cmath>
	#include <cstdlib>
	#include <iostream>
	#include <sstream>
	#include <iomanip>

	#include <box_utils.hpp>
	#include <Vector.hpp>

	#ifdef MINIFE_CSR_MATRIX
	#include <CSRMatrix.hpp>
	#elif defined(MINIFE_ELL_MATRIX)
	#include <ELLMatrix.hpp>
	#else
	#include <CSRMatrix.hpp>
	#endif

	#include <simple_mesh_description.hpp>

	#include <SparseMatrix_functions.hpp>

	#include <generate_matrix_structure.hpp>
	#include <assemble_FE_data.hpp>

	#include <verify_solution.hpp>

	#include <compute_matrix_stats.hpp>
	#include <make_local_matrix.hpp>
	#include <imbalance.hpp>
	#include <cg_solve.hpp>
	#if MINIFE_KERNELS != 0
	#include <time_kernels.hpp>
	#endif
	#include <outstream.hpp>
	#include <utils.hpp>
	#include <mytimer.hpp>
	#include <YAML_Doc.hpp>

	#ifdef HAVE_MPI
	#include <mpi.h>
	#endif

	#define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \
	{ \
	/*if (myproc==0) { \
	std::cout.width(30); \
	std::cout << msg; \
	std::cout.flush(); \
	}*/ \
	timer_type rtf_t0 = mytimer(); \
	fn; \
	time_inc = mytimer() - rtf_t0; \
	time_total += time_inc; \
	/*if (myproc==0) { \
	std::cout << time_inc << "s, total time: " << time_total << std::endl; \
	}*/ \
	}

	//This program assembles finite-element matrices into a global matrix and
	//vector, then solves the linear-system using Conjugate Gradients.
	//Each finite-element is a hexahedron with 8 vertex-nodes.
	//
	//Notes:
	//- In finite-element terms, the box dimensions are in elements, not nodes.
	// In other words, a 2x2x2 box describes 8 elements, each of which has 8 nodes,
	// so it is a 3x3x3 node domain (27 nodes).
	// The assembled linear system will have 1 equation for each finite element node.
	//
	//- The coordinate origin is at the corner of the global box where x=0,
	// y=0, z=0, and the box extends along the positive x-axis, positive y-axis,
	// and the positive z-axis.
	//
	//- Some aspects of matrix-structure generation and finite-element assembly
	// are convenient to do using global node identifiers.
	// A global identifier for each node is obtained from coordinates plus
	// global box dimensions. See the function 'get_id' in box_utils.hpp.
	//
	//- Each node corresponds to a row in the matrix. The RCB partitioning method
	// we use to split the global box among processors results in some
	// processors owning non-contiguous blocks of global node identifiers.
	// Since it is convenient for matrices and vectors to store contiguously-
	// numbered blocks of rows, we map global node identifiers to a separate
	// space of row numbers such that each processor's nodes correspond to a
	// contiguous block of row numbers.
	//

	namespace miniFE {

	template<typename Scalar,
	typename LocalOrdinal,
	typename GlobalOrdinal>
	int
	driver(const Box& global_box, Box& my_box,
	Parameters& params, YAML_Doc& ydoc)
	{
	int global_nx = global_box[0][1];
	int global_ny = global_box[1][1];
	int global_nz = global_box[2][1];

	int numprocs = 1, myproc = 0;
	#ifdef HAVE_MPI
	MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
	MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
	#endif

	if (params.load_imbalance > 0) {
	add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
	}

	float largest_imbalance = 0, std_dev = 0;
	compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
	std_dev, ydoc, true);


	//Create a representation of the mesh:
	//Note that 'simple_mesh_description' is a virtual or conceptual
	//mesh that doesn't actually store mesh data.
	#ifdef TIME_IT
	if (myproc==0) {
	std::cout.width(30);
	std::cout << "creating/filling mesh...";
	std::cout.flush();
	}
	#endif

	timer_type t_start = mytimer();
	timer_type t0 = mytimer();

	simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);

	timer_type mesh_fill = mytimer() - t0;
	timer_type t_total = mytimer() - t_start;

	#ifdef TIME_IT
	if (myproc==0) {
	std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
	}
	#endif

	//next we will generate the matrix structure.

	//Declare matrix object:

	#if defined(MINIFE_ELL_MATRIX)
	typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
	#else
	typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
	#endif

	MatrixType A;

	timer_type gen_structure;
	RUN_TIMED_FUNCTION("generating matrix structure...",
	generate_matrix_structure(mesh, A),
	gen_structure, t_total);

	GlobalOrdinal local_nrows = A.rows.size();
	GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;

	Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
	Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);

	//Assemble finite-element sub-matrices and sub-vectors into the global
	//linear system:

	timer_type fe_assembly;
	RUN_TIMED_FUNCTION("assembling FE data...",
	assemble_FE_data(mesh, A, b, params),
	fe_assembly, t_total);

	if (myproc == 0) {
	ydoc.add("Matrix structure generation","");
	ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
	ydoc.add("FE assembly","");
	ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
	}

	#ifdef MINIFE_DEBUG
	write_matrix("A_prebc.mtx", A);
	write_vector("b_prebc.vec", b);
	#endif

	//Now apply dirichlet boundary-conditions
	//(Apply the 0-valued surfaces first, then the 1-valued surface last.)

	timer_type dirbc_time;
	RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
	impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total);
	RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
	impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total);

	#ifdef MINIFE_DEBUG
	write_matrix("A.mtx", A);
	write_vector("b.vec", b);
	#endif

	//Transform global indices to local, set up communication information:

	timer_type make_local_time;
	RUN_TIMED_FUNCTION("making matrix indices local...",
	make_local_matrix(A),
	make_local_time, t_total);

	#ifdef MINIFE_DEBUG
	write_matrix("A_local.mtx", A);
	write_vector("b_local.vec", b);
	#endif

	size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);

	//Prepare to perform conjugate gradient solve:

	LocalOrdinal max_iters = 200;
	LocalOrdinal num_iters = 0;
	typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
	magnitude rnorm = 0;
	magnitude tol = std::numeric_limits<magnitude>::epsilon();

	timer_type cg_times[NUM_TIMERS];

	typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;

	t_total = mytimer() - t_start;

	bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;

	int verify_result = 0;

	#if MINIFE_KERNELS != 0
	if (myproc==0) {
	std::cout.width(30);
	std::cout << "Starting kernel timing loops ..." << std::endl;
	}

	max_iters = 500;
	x.coefs[0] = 0.9;
	if (matvec_with_comm_overlap) {
	time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
	}
	else {
	time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
	}
	num_iters = max_iters;
	std::string title("Kernel timings");
	#else
	if (myproc==0) {
	std::cout << "Starting CG solver ... " << std::endl;
	}

	if (matvec_with_comm_overlap) {
	#ifdef MINIFE_CSR_MATRIX
	rearrange_matrix_local_external(A);
	cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
	num_iters, rnorm, cg_times);
	#else
	std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
	#endif
	}
	else {
	cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
	num_iters, rnorm, cg_times);
	if (myproc == 0) {
	std::cout << "Final Resid Norm: " << rnorm << std::endl;
	}

	if (params.verify_solution > 0) {
	double tolerance = 0.06;
	bool verify_whole_domain = false;
	#ifdef MINIFE_DEBUG
	verify_whole_domain = true;
	#endif
	if (myproc == 0) {
	if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
	else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
	}
	verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
	}
	}

	#ifdef MINIFE_DEBUG
	write_vector("x.vec", x);
	#endif
	std::string title("CG solve");
	#endif

	if (myproc == 0) {
	ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
	ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
	ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
	ydoc.add(title,"");
	ydoc.get(title)->add("Iterations",num_iters);
	ydoc.get(title)->add("Final Resid Norm",rnorm);

	GlobalOrdinal global_nrows = global_nx;
	global_nrows = global_nyglobal_nz;

	//flops-per-mv, flops-per-dot, flops-per-waxpy:
	double mv_flops = global_nnz*2.0;
	double dot_flops = global_nrows*2.0;
	double waxpy_flops = global_nrows*3.0;

	#if MINIFE_KERNELS == 0
	//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
	//there were num_iters+1 matvecs, num_iters2 dots, and num_iters3+2 waxpys.
	mv_flops *= (num_iters+1);
	dot_flops = (2num_iters);
	waxpy_flops = (3num_iters+2);
	#else
	//if MINIFE_KERNELS then we did one of each operation per iteration.
	mv_flops *= num_iters;
	dot_flops *= num_iters;
	waxpy_flops *= num_iters;
	#endif

	double total_flops = mv_flops + dot_flops + waxpy_flops;

	double mv_mflops = -1;
	if (cg_times[MATVEC] > 1.e-4)
	mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);

	double dot_mflops = -1;
	if (cg_times[DOT] > 1.e-4)
	dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);

	double waxpy_mflops = -1;
	if (cg_times[WAXPY] > 1.e-4)
	waxpy_mflops = 1.e-6 * (waxpy_flops/cg_times[WAXPY]);

	double total_mflops = -1;
	if (cg_times[TOTAL] > 1.e-4)
	total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);

	ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
	ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
	if (waxpy_mflops >= 0)
	ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
	else
	ydoc.get(title)->add("WAXPY Mflops","inf");

	ydoc.get(title)->add("DOT Time",cg_times[DOT]);
	ydoc.get(title)->add("DOT Flops",dot_flops);
	if (dot_mflops >= 0)
	ydoc.get(title)->add("DOT Mflops",dot_mflops);
	else
	ydoc.get(title)->add("DOT Mflops","inf");

	ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
	ydoc.get(title)->add("MATVEC Flops",mv_flops);
	if (mv_mflops >= 0)
	ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
	else
	ydoc.get(title)->add("MATVEC Mflops","inf");

	#ifdef MINIFE_FUSED
	ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
	ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
	if (mv_mflops >= 0)
	ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
	else
	ydoc.get(title)->add("MATVECDOT Mflops","inf");
	#endif

	#if MINIFE_KERNELS == 0
	ydoc.get(title)->add("Total","");
	ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
	ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
	if (total_mflops >= 0)
	ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
	else
	ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
	ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
	#endif
	}

	return verify_result;
	}

	}//namespace miniFE

	#endif