| #ifndef _driver_hpp_ |
| #define _driver_hpp_ |
| |
| //@HEADER |
| // ************************************************************************ |
| // |
| // MiniFE: Simple Finite Element Assembly and Solve |
| // Copyright (2006-2013) Sandia Corporation |
| // |
| // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive |
| // license for use of this work by or on behalf of the U.S. Government. |
| // |
| // This library is free software; you can redistribute it and/or modify |
| // it under the terms of the GNU Lesser General Public License as |
| // published by the Free Software Foundation; either version 2.1 of the |
| // License, or (at your option) any later version. |
| // |
| // This library is distributed in the hope that it will be useful, but |
| // WITHOUT ANY WARRANTY; without even the implied warranty of |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| // Lesser General Public License for more details. |
| // |
| // You should have received a copy of the GNU Lesser General Public |
| // License along with this library; if not, write to the Free Software |
| // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
| // USA |
| // |
| // ************************************************************************ |
| //@HEADER |
| |
| #include <cstddef> |
| #include <cmath> |
| #include <cstdlib> |
| #include <iostream> |
| #include <sstream> |
| #include <iomanip> |
| |
| #include <box_utils.hpp> |
| #include <Vector.hpp> |
| |
| #ifdef MINIFE_CSR_MATRIX |
| #include <CSRMatrix.hpp> |
| #elif defined(MINIFE_ELL_MATRIX) |
| #include <ELLMatrix.hpp> |
| #else |
| #include <CSRMatrix.hpp> |
| #endif |
| |
| #include <simple_mesh_description.hpp> |
| |
| #include <SparseMatrix_functions.hpp> |
| |
| #include <generate_matrix_structure.hpp> |
| #include <assemble_FE_data.hpp> |
| |
| #include <verify_solution.hpp> |
| |
| #include <compute_matrix_stats.hpp> |
| #include <make_local_matrix.hpp> |
| #include <imbalance.hpp> |
| #include <cg_solve.hpp> |
| #if MINIFE_KERNELS != 0 |
| #include <time_kernels.hpp> |
| #endif |
| #include <outstream.hpp> |
| #include <utils.hpp> |
| #include <mytimer.hpp> |
| #include <YAML_Doc.hpp> |
| |
| #ifdef HAVE_MPI |
| #include <mpi.h> |
| #endif |
| |
| #define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \ |
| { \ |
| /*if (myproc==0) { \ |
| std::cout.width(30); \ |
| std::cout << msg; \ |
| std::cout.flush(); \ |
| }*/ \ |
| timer_type rtf_t0 = mytimer(); \ |
| fn; \ |
| time_inc = mytimer() - rtf_t0; \ |
| time_total += time_inc; \ |
| /*if (myproc==0) { \ |
| std::cout << time_inc << "s, total time: " << time_total << std::endl; \ |
| }*/ \ |
| } |
| |
| //This program assembles finite-element matrices into a global matrix and |
| //vector, then solves the linear-system using Conjugate Gradients. |
| //Each finite-element is a hexahedron with 8 vertex-nodes. |
| // |
| //Notes: |
| //- In finite-element terms, the box dimensions are in elements, not nodes. |
| // In other words, a 2x2x2 box describes 8 elements, each of which has 8 nodes, |
| // so it is a 3x3x3 node domain (27 nodes). |
| // The assembled linear system will have 1 equation for each finite element node. |
| // |
| //- The coordinate origin is at the corner of the global box where x=0, |
| // y=0, z=0, and the box extends along the positive x-axis, positive y-axis, |
| // and the positive z-axis. |
| // |
| //- Some aspects of matrix-structure generation and finite-element assembly |
| // are convenient to do using global node identifiers. |
| // A global identifier for each node is obtained from coordinates plus |
| // global box dimensions. See the function 'get_id' in box_utils.hpp. |
| // |
| //- Each node corresponds to a row in the matrix. The RCB partitioning method |
| // we use to split the global box among processors results in some |
| // processors owning non-contiguous blocks of global node identifiers. |
| // Since it is convenient for matrices and vectors to store contiguously- |
| // numbered blocks of rows, we map global node identifiers to a separate |
| // space of row numbers such that each processor's nodes correspond to a |
| // contiguous block of row numbers. |
| // |
| |
| namespace miniFE { |
| |
| template<typename Scalar, |
| typename LocalOrdinal, |
| typename GlobalOrdinal> |
| int |
| driver(const Box& global_box, Box& my_box, |
| Parameters& params, YAML_Doc& ydoc) |
| { |
| int global_nx = global_box[0][1]; |
| int global_ny = global_box[1][1]; |
| int global_nz = global_box[2][1]; |
| |
| int numprocs = 1, myproc = 0; |
| #ifdef HAVE_MPI |
| MPI_Comm_size(MPI_COMM_WORLD, &numprocs); |
| MPI_Comm_rank(MPI_COMM_WORLD, &myproc); |
| #endif |
| |
| if (params.load_imbalance > 0) { |
| add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc); |
| } |
| |
| float largest_imbalance = 0, std_dev = 0; |
| compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance, |
| std_dev, ydoc, true); |
| |
| |
| //Create a representation of the mesh: |
| //Note that 'simple_mesh_description' is a virtual or conceptual |
| //mesh that doesn't actually store mesh data. |
| #ifdef TIME_IT |
| if (myproc==0) { |
| std::cout.width(30); |
| std::cout << "creating/filling mesh..."; |
| std::cout.flush(); |
| } |
| #endif |
| |
| timer_type t_start = mytimer(); |
| timer_type t0 = mytimer(); |
| |
| simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box); |
| |
| timer_type mesh_fill = mytimer() - t0; |
| timer_type t_total = mytimer() - t_start; |
| |
| #ifdef TIME_IT |
| if (myproc==0) { |
| std::cout << mesh_fill << "s, total time: " << t_total << std::endl; |
| } |
| #endif |
| |
| //next we will generate the matrix structure. |
| |
| //Declare matrix object: |
| |
| #if defined(MINIFE_ELL_MATRIX) |
| typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType; |
| #else |
| typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType; |
| #endif |
| |
| MatrixType A; |
| |
| timer_type gen_structure; |
| RUN_TIMED_FUNCTION("generating matrix structure...", |
| generate_matrix_structure(mesh, A), |
| gen_structure, t_total); |
| |
| GlobalOrdinal local_nrows = A.rows.size(); |
| GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1; |
| |
| Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows); |
| Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows); |
| |
| //Assemble finite-element sub-matrices and sub-vectors into the global |
| //linear system: |
| |
| timer_type fe_assembly; |
| RUN_TIMED_FUNCTION("assembling FE data...", |
| assemble_FE_data(mesh, A, b, params), |
| fe_assembly, t_total); |
| |
| if (myproc == 0) { |
| ydoc.add("Matrix structure generation",""); |
| ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure); |
| ydoc.add("FE assembly",""); |
| ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly); |
| } |
| |
| #ifdef MINIFE_DEBUG |
| write_matrix("A_prebc.mtx", A); |
| write_vector("b_prebc.vec", b); |
| #endif |
| |
| //Now apply dirichlet boundary-conditions |
| //(Apply the 0-valued surfaces first, then the 1-valued surface last.) |
| |
| timer_type dirbc_time; |
| RUN_TIMED_FUNCTION("imposing Dirichlet BC...", |
| impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total); |
| RUN_TIMED_FUNCTION("imposing Dirichlet BC...", |
| impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total); |
| |
| #ifdef MINIFE_DEBUG |
| write_matrix("A.mtx", A); |
| write_vector("b.vec", b); |
| #endif |
| |
| //Transform global indices to local, set up communication information: |
| |
| timer_type make_local_time; |
| RUN_TIMED_FUNCTION("making matrix indices local...", |
| make_local_matrix(A), |
| make_local_time, t_total); |
| |
| #ifdef MINIFE_DEBUG |
| write_matrix("A_local.mtx", A); |
| write_vector("b_local.vec", b); |
| #endif |
| |
| size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc); |
| |
| //Prepare to perform conjugate gradient solve: |
| |
| LocalOrdinal max_iters = 200; |
| LocalOrdinal num_iters = 0; |
| typedef typename TypeTraits<Scalar>::magnitude_type magnitude; |
| magnitude rnorm = 0; |
| magnitude tol = std::numeric_limits<magnitude>::epsilon(); |
| |
| timer_type cg_times[NUM_TIMERS]; |
| |
| typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType; |
| |
| t_total = mytimer() - t_start; |
| |
| bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1; |
| |
| int verify_result = 0; |
| |
| #if MINIFE_KERNELS != 0 |
| if (myproc==0) { |
| std::cout.width(30); |
| std::cout << "Starting kernel timing loops ..." << std::endl; |
| } |
| |
| max_iters = 500; |
| x.coefs[0] = 0.9; |
| if (matvec_with_comm_overlap) { |
| time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times); |
| } |
| else { |
| time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times); |
| } |
| num_iters = max_iters; |
| std::string title("Kernel timings"); |
| #else |
| if (myproc==0) { |
| std::cout << "Starting CG solver ... " << std::endl; |
| } |
| |
| if (matvec_with_comm_overlap) { |
| #ifdef MINIFE_CSR_MATRIX |
| rearrange_matrix_local_external(A); |
| cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol, |
| num_iters, rnorm, cg_times); |
| #else |
| std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl; |
| #endif |
| } |
| else { |
| cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol, |
| num_iters, rnorm, cg_times); |
| if (myproc == 0) { |
| std::cout << "Final Resid Norm: " << rnorm << std::endl; |
| } |
| |
| if (params.verify_solution > 0) { |
| double tolerance = 0.06; |
| bool verify_whole_domain = false; |
| #ifdef MINIFE_DEBUG |
| verify_whole_domain = true; |
| #endif |
| if (myproc == 0) { |
| if (verify_whole_domain) std::cout << "verifying solution..." << std::endl; |
| else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl; |
| } |
| verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain); |
| } |
| } |
| |
| #ifdef MINIFE_DEBUG |
| write_vector("x.vec", x); |
| #endif |
| std::string title("CG solve"); |
| #endif |
| |
| if (myproc == 0) { |
| ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name()); |
| ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name()); |
| ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name()); |
| ydoc.add(title,""); |
| ydoc.get(title)->add("Iterations",num_iters); |
| ydoc.get(title)->add("Final Resid Norm",rnorm); |
| |
| GlobalOrdinal global_nrows = global_nx; |
| global_nrows *= global_ny*global_nz; |
| |
| //flops-per-mv, flops-per-dot, flops-per-waxpy: |
| double mv_flops = global_nnz*2.0; |
| double dot_flops = global_nrows*2.0; |
| double waxpy_flops = global_nrows*3.0; |
| |
| #if MINIFE_KERNELS == 0 |
| //if MINIFE_KERNELS == 0 then we did a CG solve, and in that case |
| //there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys. |
| mv_flops *= (num_iters+1); |
| dot_flops *= (2*num_iters); |
| waxpy_flops *= (3*num_iters+2); |
| #else |
| //if MINIFE_KERNELS then we did one of each operation per iteration. |
| mv_flops *= num_iters; |
| dot_flops *= num_iters; |
| waxpy_flops *= num_iters; |
| #endif |
| |
| double total_flops = mv_flops + dot_flops + waxpy_flops; |
| |
| double mv_mflops = -1; |
| if (cg_times[MATVEC] > 1.e-4) |
| mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]); |
| |
| double dot_mflops = -1; |
| if (cg_times[DOT] > 1.e-4) |
| dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]); |
| |
| double waxpy_mflops = -1; |
| if (cg_times[WAXPY] > 1.e-4) |
| waxpy_mflops = 1.e-6 * (waxpy_flops/cg_times[WAXPY]); |
| |
| double total_mflops = -1; |
| if (cg_times[TOTAL] > 1.e-4) |
| total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]); |
| |
| ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]); |
| ydoc.get(title)->add("WAXPY Flops",waxpy_flops); |
| if (waxpy_mflops >= 0) |
| ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops); |
| else |
| ydoc.get(title)->add("WAXPY Mflops","inf"); |
| |
| ydoc.get(title)->add("DOT Time",cg_times[DOT]); |
| ydoc.get(title)->add("DOT Flops",dot_flops); |
| if (dot_mflops >= 0) |
| ydoc.get(title)->add("DOT Mflops",dot_mflops); |
| else |
| ydoc.get(title)->add("DOT Mflops","inf"); |
| |
| ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]); |
| ydoc.get(title)->add("MATVEC Flops",mv_flops); |
| if (mv_mflops >= 0) |
| ydoc.get(title)->add("MATVEC Mflops",mv_mflops); |
| else |
| ydoc.get(title)->add("MATVEC Mflops","inf"); |
| |
| #ifdef MINIFE_FUSED |
| ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]); |
| ydoc.get(title)->add("MATVECDOT Flops",mv_flops); |
| if (mv_mflops >= 0) |
| ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops); |
| else |
| ydoc.get(title)->add("MATVECDOT Mflops","inf"); |
| #endif |
| |
| #if MINIFE_KERNELS == 0 |
| ydoc.get(title)->add("Total",""); |
| ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]); |
| ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops); |
| if (total_mflops >= 0) |
| ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops); |
| else |
| ydoc.get(title)->get("Total")->add("Total CG Mflops","inf"); |
| ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters); |
| #endif |
| } |
| |
| return verify_result; |
| } |
| |
| }//namespace miniFE |
| |
| #endif |
| |