MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.c - third_party/llvm-test-suite - Git at Google

 //------------------------------------------------------------------------------------------------------------------------------
 // Samuel Williams
 // SWWilliams@lbl.gov
 // Lawrence Berkeley National Lab
 //------------------------------------------------------------------------------------------------------------------------------
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <math.h>
 //------------------------------------------------------------------------------------------------------------------------------
 #ifdef __MPI
 #include <mpi.h>
 #endif
 //------------------------------------------------------------------------------------------------------------------------------
 #include "timer.h"
 #include "defines.h"
 #include "box.h"
 #include "mg.h"
 #include "operators.h"
 #include "solver.h"
 //------------------------------------------------------------------------------------------------------------------------------
 int create_subdomain(subdomain_type * box, int subdomain_low_i, int subdomain_low_j, int subdomain_low_k,
                                        int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k,
                                        int numGrids, int ghosts, int numLevels){
   int level;
   uint64_t memory_allocated=0;
   box->numLevels=numLevels;
   box->ghosts=ghosts;
   box->low.i = subdomain_low_i;
   box->low.j = subdomain_low_j;
   box->low.k = subdomain_low_k;
   box->dim.i = subdomain_dim_i;
   box->dim.j = subdomain_dim_j;
   box->dim.k = subdomain_dim_k;
   posix_memalign((void**)&(box->levels),64,numLevels*sizeof(box_type));
   memory_allocated += numLevels*sizeof(box_type);
   for(level=0;level<numLevels;level++){
                           int numGridsActual = numGrids;
     if(level == (numLevels-1))numGridsActual+= IterativeSolver_NumGrids(); // the bottom solver may need a few auxillary grids...
     memory_allocated += create_box(&box->levels[level],numGridsActual,subdomain_low_i>>level,subdomain_low_j>>level,subdomain_low_k>>level,
                                                                       subdomain_dim_i>>level,subdomain_dim_j>>level,subdomain_dim_k>>level,ghosts);
   }
   return(memory_allocated);
 }


 void destroy_subdomain(subdomain_type * box){
   int level;
   for(level=0;level<box->numLevels;level++){
     destroy_box(&box->levels[level]);
   }
   free(box->levels);
 }


 //------------------------------------------------------------------------------------------------------------------------------
 int calculate_neighboring_subdomain_index(domain_type *domain, int bi, int bj, int bk, int di, int dj, int dk){
   int ni,nj,nk;
   ni=(bi+domain->subdomains_per_rank_in.i+di)%domain->subdomains_per_rank_in.i;
   nj=(bj+domain->subdomains_per_rank_in.j+dj)%domain->subdomains_per_rank_in.j;
   nk=(bk+domain->subdomains_per_rank_in.k+dk)%domain->subdomains_per_rank_in.k;
   int index = ni + nj*domain->subdomains_per_rank_in.i + nk*domain->subdomains_per_rank_in.i*domain->subdomains_per_rank_in.j;
   return(index);
 }

 int calculate_neighboring_subdomain_rank(domain_type * domain, int bi, int bj, int bk, int di, int dj, int dk, int ri, int rj, int rk){
   if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){
     if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) <                        0 )return(-1);
     if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) >= domain->subdomains_in.i )return(-1);
   }
   if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){
     if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) <                        0 )return(-1);
     if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) >= domain->subdomains_in.j )return(-1);
   }
   if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){
     if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) <                        0 )return(-1);
     if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) >= domain->subdomains_in.k )return(-1);
   }
   // if didn't walk off domain, calculate new rank for periodic boundaries...
   if((bi+di)<0)ri--;if((bi+di)>=domain->subdomains_per_rank_in.i)ri++;ri=(ri+domain->ranks_in.i)%domain->ranks_in.i;
   if((bj+dj)<0)rj--;if((bj+dj)>=domain->subdomains_per_rank_in.j)rj++;rj=(rj+domain->ranks_in.j)%domain->ranks_in.j;
   if((bk+dk)<0)rk--;if((bk+dk)>=domain->subdomains_per_rank_in.k)rk++;rk=(rk+domain->ranks_in.k)%domain->ranks_in.k;
   int rank = ri + rj*domain->ranks_in.i + rk*domain->ranks_in.i*domain->ranks_in.j;
   return(rank);
 }


 //------------------------------------------------------------------------------------------------------------------------------
 int create_domain(domain_type * domain,
               int subdomain_dim_i,  int subdomain_dim_j,  int subdomain_dim_k,
               int subdomains_per_rank_in_i, int subdomains_per_rank_in_j, int subdomains_per_rank_in_k,
               int ranks_in_i,      int ranks_in_j,      int ranks_in_k,
               int rank,
               int *boundary_conditions,
               int numGrids, int ghosts, int numLevels
              ){
   int level;
   int  i, j, k;
   int di,dj,dk;
   // FIX ghosts = array (varies with level)
   domain->rank = rank;
   if(domain->rank==0){printf("creating domain...       ");fflush(stdout);}
   //FIX, limit ghosts based on coarsest problem size
   if(ghosts>(subdomain_dim_i>>(numLevels-1))){if(domain->rank==0)printf("#ghosts(%d)>bottom grid size(%d)\n",ghosts,subdomain_dim_i>>(numLevels-1));exit(0);}

   if( (subdomain_dim_i!=subdomain_dim_j)||(subdomain_dim_j!=subdomain_dim_k)||(subdomain_dim_i!=subdomain_dim_k) ){
   if(domain->rank==0)printf("subdomain_dim's must be equal\n");exit(0);
   }
   uint64_t memory_allocated =0;
   // processes are laid out in x, then y, then z
   //int ranks = ranks_in_i*ranks_in_j*ranks_in_k;
   int my_k_rank = (rank                                                              ) / (ranks_in_i*ranks_in_j);
   int my_j_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank)                          ) / (ranks_in_i           );
   int my_i_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank) - (ranks_in_i*my_j_rank) );
   //printf("%2d: (%2d,%2d,%2d)\n",domain->rank,my_k_rank,my_j_rank,my_i_rank);


   domain->ranks_in.i               = ranks_in_i;
   domain->ranks_in.j               = ranks_in_j;
   domain->ranks_in.k               = ranks_in_k;
   domain->subdomains_per_rank_in.i = subdomains_per_rank_in_i;
   domain->subdomains_per_rank_in.j = subdomains_per_rank_in_j;
   domain->subdomains_per_rank_in.k = subdomains_per_rank_in_k;
   domain->subdomains_in.i          = subdomains_per_rank_in_i*ranks_in_i;
   domain->subdomains_in.j          = subdomains_per_rank_in_j*ranks_in_j;
   domain->subdomains_in.k          = subdomains_per_rank_in_k*ranks_in_k;
   domain->subdomains_per_rank      = subdomains_per_rank_in_i*subdomains_per_rank_in_j*subdomains_per_rank_in_k;
   posix_memalign((void**)&(domain->subdomains),64,domain->subdomains_per_rank*sizeof(subdomain_type));
   memory_allocated+=domain->subdomains_per_rank*sizeof(subdomain_type);

   domain->dim.i                = domain->subdomains_in.i * subdomain_dim_i;
   domain->dim.j                = domain->subdomains_in.j * subdomain_dim_j;
   domain->dim.k                = domain->subdomains_in.k * subdomain_dim_k;
   domain->boundary_condition.i = boundary_conditions[0];
   domain->boundary_condition.j = boundary_conditions[1];
   domain->boundary_condition.k = boundary_conditions[2];
   domain->numLevels            = numLevels;
   domain->numGrids             = numGrids;
   domain->ghosts               = ghosts;
   for(level=0;level<domain->numLevels;level++){
                                domain->h[level] = -1;
     domain->dominant_eigenvalue_of_DinvA[level] = -1;
   }

   // calculate the neighbors of this process
   for(dk=-1;dk<=1;dk++){
   for(dj=-1;dj<=1;dj++){
   for(di=-1;di<=1;di++){
     int n = 13+di+3*dj+9*dk;
     int neighbor_rank_in_i = (my_i_rank+di+ranks_in_i)%ranks_in_i;
     int neighbor_rank_in_j = (my_j_rank+dj+ranks_in_j)%ranks_in_j;
     int neighbor_rank_in_k = (my_k_rank+dk+ranks_in_k)%ranks_in_k;
     domain->rank_of_neighbor[n] = neighbor_rank_in_i + ranks_in_i*neighbor_rank_in_j + ranks_in_i*ranks_in_j*neighbor_rank_in_k;
     if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){
       if( (my_i_rank+di) <           0 )domain->rank_of_neighbor[n] = -1;
       if( (my_i_rank+di) >= ranks_in_i )domain->rank_of_neighbor[n] = -1;
     }
     if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){
       if( (my_j_rank+dj) <           0 )domain->rank_of_neighbor[n] = -1;
       if( (my_j_rank+dj) >= ranks_in_j )domain->rank_of_neighbor[n] = -1;
     }
     if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){
       if( (my_k_rank+dk) <           0 )domain->rank_of_neighbor[n] = -1;
       if( (my_k_rank+dk) >= ranks_in_k )domain->rank_of_neighbor[n] = -1;
     }
   }}}

   // subdomains within a process are laid out in i, then j, then k
   for(k=0;k<subdomains_per_rank_in_k;k++){
   for(j=0;j<subdomains_per_rank_in_j;j++){
   for(i=0;i<subdomains_per_rank_in_i;i++){
     int box = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
     int low_i = subdomain_dim_i * (i + subdomains_per_rank_in_i*my_i_rank);
     int low_j = subdomain_dim_j * (j + subdomains_per_rank_in_j*my_j_rank);
     int low_k = subdomain_dim_k * (k + subdomains_per_rank_in_k*my_k_rank);
     memory_allocated += create_subdomain(&domain->subdomains[box],low_i,low_j,low_k,subdomain_dim_i,subdomain_dim_j,subdomain_dim_k,numGrids,ghosts,numLevels);
     for(dk=-1;dk<=1;dk++){
     for(dj=-1;dj<=1;dj++){
     for(di=-1;di<=1;di++){
       int n = 13+di+3*dj+9*dk;
       domain->subdomains[box].neighbors[n].rank        = calculate_neighboring_subdomain_rank(domain,i,j,k,di,dj,dk,my_i_rank,my_j_rank,my_k_rank);
       domain->subdomains[box].neighbors[n].local_index = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk);
       // FIX boundary conditions??
       //printf("rank=%2d, box[%2d].neighbors[%3d]=(%3d,%3d)\n",domain->rank,box,n,domain->subdomains[box].neighbors[n].rank,domain->subdomains[box].neighbors[n].local_index);
     }}}
   }}}


   int FacesEdgesCorners[27] = {4,10,12,14,16,22,   1,3,5,7,9,11,15,17,19,21,23,25,   0,2,6,8,18,20,24,26,   13};
   int    faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0};
   int    edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0};
   int  corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1};

   #ifdef __MPI
   // allocate MPI send/recv buffers for the 26 neighbors....
   for(dk=-1;dk<=1;dk++){
   for(dj=-1;dj<=1;dj++){
   for(di=-1;di<=1;di++){
     int n = 13+di+3*dj+9*dk;if(n!=13){
     int bufSize = 1;
     if(di==0)bufSize*=domain->subdomains_per_rank_in.i*subdomain_dim_i;else bufSize*=ghosts;
     if(dj==0)bufSize*=domain->subdomains_per_rank_in.j*subdomain_dim_j;else bufSize*=ghosts;
     if(dk==0)bufSize*=domain->subdomains_per_rank_in.k*subdomain_dim_k;else bufSize*=ghosts;
     posix_memalign((void**)&(domain->send_buffer[n]),64,bufSize*sizeof(double));
     posix_memalign((void**)&(domain->recv_buffer[n]),64,bufSize*sizeof(double));
                         memset(domain->send_buffer[n],0,bufSize*sizeof(double));
                         memset(domain->recv_buffer[n],0,bufSize*sizeof(double));
                                       memory_allocated+=bufSize*sizeof(double);
                                       memory_allocated+=bufSize*sizeof(double);
   }}}}
   #endif


   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   // now enumerate a list of data transfers to effect a ghost zone exchange (boundary conditions are skipped)
   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   for(level=0;level<domain->numLevels;level++){
     int initialize;for(initialize=0;initialize<2;initialize++){ // initialize=0=count/malloc.  initialize=1=initialize
     int buffer=0;
     //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // Traverse list of boxes, and create MPI pack for faces, edges, and corners
     domain->bufferCopy_Pack_Start=buffer;
     int nn;
     #ifdef __MPI
     for(nn=0;nn<26;nn++){
       int n = FacesEdgesCorners[nn];
       int di = ((n/1)%3) - 1;
       int dj = ((n/3)%3) - 1;
       int dk = ((n/9)%3) - 1;

       // traverse all boxes in direction di/dj/dk... (e.g. all subdomains on the corresponding face)
       int sd_i_lo,sd_i_hi;
       int sd_j_lo,sd_j_hi;
       int sd_k_lo,sd_k_hi;
       switch(di){
         case -1:sd_i_lo=                         0;sd_i_hi=                       1;break;
         case  0:sd_i_lo=                         0;sd_i_hi=subdomains_per_rank_in_i;break;
         case  1:sd_i_lo=subdomains_per_rank_in_i-1;sd_i_hi=subdomains_per_rank_in_i;break;
       };
       switch(dj){
         case -1:sd_j_lo=                         0;sd_j_hi=                       1;break;
         case  0:sd_j_lo=                         0;sd_j_hi=subdomains_per_rank_in_j;break;
         case  1:sd_j_lo=subdomains_per_rank_in_j-1;sd_j_hi=subdomains_per_rank_in_j;break;
       };
       switch(dk){
         case -1:sd_k_lo=                         0;sd_k_hi=                       1;break;
         case  0:sd_k_lo=                         0;sd_k_hi=subdomains_per_rank_in_k;break;
         case  1:sd_k_lo=subdomains_per_rank_in_k-1;sd_k_hi=subdomains_per_rank_in_k;break;
       };
       for(k=sd_k_lo;k<sd_k_hi;k++){
       for(j=sd_j_lo;j<sd_j_hi;j++){
       for(i=sd_i_lo;i<sd_i_hi;i++){
         int off_node_exchange=1;
         if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == domain->rank)off_node_exchange=0; // same process
         if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) ==           -1)off_node_exchange=0; // domain boundary
         if(off_node_exchange){
           int sendBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
           int grid_i,grid_j,grid_k; // ijk in the source grid (relative to first ghost zone element)
           int  dim_i, dim_j, dim_k; // dimensions of face/edge/corner
           switch(di){
             case -1:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;break;
             case  0:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; break;
             case  1:grid_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;break;
           };
           switch(dj){
             case -1:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;break;
             case  0:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; break;
             case  1:grid_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;break;
           };
           switch(dk){
             case -1:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;break;
             case  0:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; break;
             case  1:grid_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;break;
           };
           int buf_i = dim_i*(i-sd_i_lo); // ijk in the target buffer
           int buf_j = dim_j*(j-sd_j_lo);
           int buf_k = dim_k*(k-sd_k_lo);
           int buf_pencil = dim_i*(sd_i_hi-sd_i_lo);
           int buf_plane  = dim_j*(sd_j_hi-sd_j_lo)*buf_pencil;

           // FIX, could be broken into sub buffers for more parallelism (i.e. pencils)
           if(initialize==1){
             domain->bufferCopies[level][buffer].dim.i        = dim_i;
             domain->bufferCopies[level][buffer].dim.j        = dim_j;
             domain->bufferCopies[level][buffer].dim.k        = dim_k;
             domain->bufferCopies[level][buffer].read.box     = sendBox;
             domain->bufferCopies[level][buffer].read.ptr     = NULL;
             domain->bufferCopies[level][buffer].read.i       = grid_i;
             domain->bufferCopies[level][buffer].read.j       = grid_j;
             domain->bufferCopies[level][buffer].read.k       = grid_k;
             domain->bufferCopies[level][buffer].read.pencil  = domain->subdomains[sendBox].levels[level].pencil;
             domain->bufferCopies[level][buffer].read.plane   = domain->subdomains[sendBox].levels[level].plane;
             domain->bufferCopies[level][buffer].write.box    = -1;
             domain->bufferCopies[level][buffer].write.ptr    = domain->send_buffer[n];
             domain->bufferCopies[level][buffer].write.i      = buf_i;
             domain->bufferCopies[level][buffer].write.j      = buf_j;
             domain->bufferCopies[level][buffer].write.k      = buf_k;
             domain->bufferCopies[level][buffer].write.pencil = buf_pencil;
             domain->bufferCopies[level][buffer].write.plane  = buf_plane;
             domain->bufferCopies[level][buffer].isFace       =   faces[n];
             domain->bufferCopies[level][buffer].isEdge       =   edges[n];
             domain->bufferCopies[level][buffer].isCorner     = corners[n];
           }
           //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in MPI[%2d,%2d,%2d] (%3d,%4d)\n",level,buffer,dim_i,dim_j,dim_k,grid_i,grid_j,grid_k,sendBox,buf_i,buf_j,buf_k,di,dj,dk,buf_pencil,buf_plane);
           buffer++;
         } // off-process neighbor
       }}} // subdomains
     } // directions
     #endif
     domain->bufferCopy_Pack_End=buffer;
     //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // Traverse list of boxes, and create local exchange/domain copies for faces, edges, and corners
     domain->bufferCopy_Local_Start=buffer;
     for(nn=0;nn<26;nn++){
       int n = FacesEdgesCorners[nn];
       int di = ((n/1)%3) - 1;
       int dj = ((n/3)%3) - 1;
       int dk = ((n/9)%3) - 1;
       for(k=0;k<subdomains_per_rank_in_k;k++){
       for(j=0;j<subdomains_per_rank_in_j;j++){
       for(i=0;i<subdomains_per_rank_in_i;i++){
         int sendBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
         int on_node_exchange = 1;
         if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) != domain->rank)on_node_exchange=0; // if == -1, then domain boundary and thus not on_node
         if(on_node_exchange){
           int recvBox = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk);
           int send_i,send_j,send_k; // ijk in the source grid (relative to first ghost zone element)
           int recv_i,recv_j,recv_k; // ijk in the destination grid (relative to first ghost zone element)
           int  dim_i, dim_j, dim_k; // dimensions of face/edge/corner
           switch(di){
             case -1:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break;
             case  0:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; recv_i=domain->subdomains[recvBox].levels[level].ghosts;break;
             case  1:send_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=0;break;
           };
           switch(dj){
             case -1:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break;
             case  0:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; recv_j=domain->subdomains[recvBox].levels[level].ghosts;break;
             case  1:send_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=0;break;
           };
           switch(dk){
             case -1:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break;
             case  0:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; recv_k=domain->subdomains[recvBox].levels[level].ghosts;break;
             case  1:send_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=0;break;
           };
           // FIX, could be broken into sub buffers for more parallelism (i.e. pencils)
           if(initialize==1){
             domain->bufferCopies[level][buffer].dim.i        = dim_i;
             domain->bufferCopies[level][buffer].dim.j        = dim_j;
             domain->bufferCopies[level][buffer].dim.k        = dim_k;
             domain->bufferCopies[level][buffer].read.box     = sendBox;
             domain->bufferCopies[level][buffer].read.ptr     = NULL;
             domain->bufferCopies[level][buffer].read.i       = send_i;
             domain->bufferCopies[level][buffer].read.j       = send_j;
             domain->bufferCopies[level][buffer].read.k       = send_k;
             domain->bufferCopies[level][buffer].read.pencil  = domain->subdomains[sendBox].levels[level].pencil;
             domain->bufferCopies[level][buffer].read.plane   = domain->subdomains[sendBox].levels[level].plane;
             domain->bufferCopies[level][buffer].write.box    = recvBox;
             domain->bufferCopies[level][buffer].write.ptr    = NULL;
             domain->bufferCopies[level][buffer].write.i      = recv_i;
             domain->bufferCopies[level][buffer].write.j      = recv_j;
             domain->bufferCopies[level][buffer].write.k      = recv_k;
             domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil;
             domain->bufferCopies[level][buffer].write.plane  = domain->subdomains[recvBox].levels[level].plane;
             domain->bufferCopies[level][buffer].isFace       =   faces[n];
             domain->bufferCopies[level][buffer].isEdge       =   edges[n];
             domain->bufferCopies[level][buffer].isCorner     = corners[n];
           }
           //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in box=%3d\n",level,buffer,dim_i,dim_j,dim_k,send_i,send_j,send_k,sendBox,recv_i,recv_j,recv_k,recvBox);
           buffer++;
         }
       }}} // all local subdomains
     } // directions
     domain->bufferCopy_Local_End=buffer;
     //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // Traverse list of boxes and their neighbors and gather data from buffers...
     // faces come from face buffers
     // edges can come from either edge buffers or face buffers
     // corners can come from either corner buffers, edge buffers or face buffers
     domain->bufferCopy_Unpack_Start=buffer;
     #ifdef __MPI
     for(nn=0;nn<26;nn++){
       int n = FacesEdgesCorners[nn];
       int di = ((n/1)%3) - 1;
       int dj = ((n/3)%3) - 1;
       int dk = ((n/9)%3) - 1;
       for(k=0;k<subdomains_per_rank_in_k;k++){
       for(j=0;j<subdomains_per_rank_in_j;j++){
       for(i=0;i<subdomains_per_rank_in_i;i++){
         int recvBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
         int off_node_exchange = 1;
         if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == domain->rank)off_node_exchange=0; // same node
         if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) ==           -1)off_node_exchange=0; // domain boundary
         if(off_node_exchange){
           int _di=0;
           int _dj=0;
           int _dk=0;
           if((i+di)<0){_di=-1;}else if((i+di)>=subdomains_per_rank_in_i){_di=1;}
           if((j+dj)<0){_dj=-1;}else if((j+dj)>=subdomains_per_rank_in_j){_dj=1;}
           if((k+dk)<0){_dk=-1;}else if((k+dk)>=subdomains_per_rank_in_k){_dk=1;}
           int _n = 13 + _di + 3*_dj + 9*_dk;
           int  dim_i, dim_j, dim_k; // dimensions of face/edge/corner
           int grid_i,grid_j,grid_k; // ijk in the destination grid (relative to first ghost zone element)
           int  buf_i, buf_j, buf_k; // ijk in the source buffer
           int buf_plane =1;
           int buf_pencil=1;
           switch(di){
             case -1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=0;                                                                                               break;
             case  0:dim_i=domain->subdomains[recvBox].levels[level].dim.i; grid_i=domain->subdomains[recvBox].levels[level].ghosts;                                                break;
             case  1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break;
           };
           switch(dj){
             case -1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=0;                                                                                               break;
             case  0:dim_j=domain->subdomains[recvBox].levels[level].dim.j; grid_j=domain->subdomains[recvBox].levels[level].ghosts;                                                break;
             case  1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break;
           };
           switch(dk){
             case -1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=0;                                                                                               break;
             case  0:dim_k=domain->subdomains[recvBox].levels[level].dim.k; grid_k=domain->subdomains[recvBox].levels[level].ghosts;                                                break;
             case  1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break;
           };
           switch(_di){
             case -1:buf_i=0;                                                buf_plane*=                         domain->subdomains[recvBox].levels[level].ghosts;buf_pencil=                         domain->subdomains[recvBox].levels[level].ghosts;break;
             case  0:buf_i=i*domain->subdomains[recvBox].levels[level].dim.i;buf_plane*=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; buf_pencil=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; break;
             case  1:buf_i=0;                                                buf_plane*=                         domain->subdomains[recvBox].levels[level].ghosts;buf_pencil=                         domain->subdomains[recvBox].levels[level].ghosts;break;
           };
           switch(_dj){
             case -1:buf_j=0;                                                buf_plane*=                         domain->subdomains[recvBox].levels[level].ghosts;break;
             case  0:buf_j=j*domain->subdomains[recvBox].levels[level].dim.j;buf_plane*=subdomains_per_rank_in_j*domain->subdomains[recvBox].levels[level].dim.j; break;
             case  1:buf_j=0;                                                buf_plane*=                         domain->subdomains[recvBox].levels[level].ghosts;break;
           };
           switch(_dk){
             case -1:buf_k=0;                                                break;
             case  0:buf_k=k*domain->subdomains[recvBox].levels[level].dim.k;break;
             case  1:buf_k=0;                                                break;
           };
           if( (i+di>=0) && (i+di<subdomains_per_rank_in_i) ){if(di<0)buf_i-=domain->subdomains[recvBox].levels[level].ghosts;else if(di>0)buf_i+=domain->subdomains[recvBox].levels[level].dim.i;}// forward by dim.i or back by ghosts
           if( (j+dj>=0) && (j+dj<subdomains_per_rank_in_j) ){if(dj<0)buf_j-=domain->subdomains[recvBox].levels[level].ghosts;else if(dj>0)buf_j+=domain->subdomains[recvBox].levels[level].dim.j;}// forward by dim.j or back by ghosts
           if( (k+dk>=0) && (k+dk<subdomains_per_rank_in_k) ){if(dk<0)buf_k-=domain->subdomains[recvBox].levels[level].ghosts;else if(dk>0)buf_k+=domain->subdomains[recvBox].levels[level].dim.k;}// forward by dim.k or back by ghosts
           // FIX, could be broken into sub buffers for more parallelism (i.e. pencils)
           if(initialize==1){
             domain->bufferCopies[level][buffer].dim.i        = dim_i;
             domain->bufferCopies[level][buffer].dim.j        = dim_j;
             domain->bufferCopies[level][buffer].dim.k        = dim_k;
             domain->bufferCopies[level][buffer].read.box     = -1;
             domain->bufferCopies[level][buffer].read.ptr     = domain->recv_buffer[_n];
             domain->bufferCopies[level][buffer].read.i       = buf_i;
             domain->bufferCopies[level][buffer].read.j       = buf_j;
             domain->bufferCopies[level][buffer].read.k       = buf_k;
             domain->bufferCopies[level][buffer].read.pencil  = buf_pencil;
             domain->bufferCopies[level][buffer].read.plane   = buf_plane;
             domain->bufferCopies[level][buffer].write.box    = recvBox;
             domain->bufferCopies[level][buffer].write.ptr    = NULL;
             domain->bufferCopies[level][buffer].write.i      = grid_i;
             domain->bufferCopies[level][buffer].write.j      = grid_j;
             domain->bufferCopies[level][buffer].write.k      = grid_k;
             domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil;
             domain->bufferCopies[level][buffer].write.plane  = domain->subdomains[recvBox].levels[level].plane;
             domain->bufferCopies[level][buffer].isFace       =   faces[n];
             domain->bufferCopies[level][buffer].isEdge       =   edges[n];
             domain->bufferCopies[level][buffer].isCorner     = corners[n];
           }
           //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from MPI[%2d,%2d,%2d] (%3d,%4d) to box=%3d at (%3d,%3d,%3d)\n",level,buffer,dim_i,dim_j,dim_k,buf_i,buf_j,buf_k,_di,_dj,_dk,buf_pencil,buf_plane,recvBox,grid_i,grid_j,grid_k);
           buffer++;
         }
       }}}
     }
     #endif
     domain->bufferCopy_Unpack_End=buffer;
     //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // At the end of initialize 0, malloc the buffers
     if(initialize==0){
       domain->bufferCopies[level] = (bufferCopy_type*)malloc(buffer*sizeof(bufferCopy_type));
       memory_allocated+=buffer*sizeof(bufferCopy_type);
     }
     //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   }}
   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


   if(domain->rank==0){
   printf("done\n");fflush(stdout);
   printf("  %d x %d x %d (per subdomain)\n",subdomain_dim_i,subdomain_dim_j,subdomain_dim_k);
   printf("  %d x %d x %d (per process)\n",subdomains_per_rank_in_i*subdomain_dim_i,subdomains_per_rank_in_j*subdomain_dim_j,subdomains_per_rank_in_k*subdomain_dim_k);
   printf("  %d x %d x %d (overall)\n",ranks_in_i*subdomains_per_rank_in_i*subdomain_dim_i,ranks_in_j*subdomains_per_rank_in_j*subdomain_dim_j,ranks_in_k*subdomains_per_rank_in_k*subdomain_dim_k);
   printf("  %d-deep ghost zones\n",ghosts);
   printf("  allocated %llu MB\n",memory_allocated>>20);
   fflush(stdout);
   }
   return(memory_allocated);
 }


 void destroy_domain(domain_type * domain){
   if(domain->rank==0){printf("deallocating domain...   ");fflush(stdout);}
   int box;for(box=0;box<domain->subdomains_per_rank;box++){
     destroy_subdomain(&domain->subdomains[box]);
   }
   free(domain->subdomains);
   // FIX, free buffer_copies, etc...
   if(domain->rank==0){printf("done\n");fflush(stdout);}
 }


 //------------------------------------------------------------------------------------------------------------------------------
 void print_timing(domain_type *domain){
   #ifdef TIMING
   int level,numLevels = domain->numLevels;
   uint64_t _timeStart=CycleTime();sleep(1);uint64_t _timeEnd=CycleTime();
   double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart);
          SecondsPerCycle = SecondsPerCycle/(double)domain->MGSolves_performed; // prints average performance per MGSolve

   if(domain->rank!=0)return;

   uint64_t total;
           printf("                       ");for(level=0;level<(numLevels  );level++){printf("%12d ",level);}printf("\n");
           printf("                       ");for(level=0;level<(numLevels  );level++){printf("%10d^3 ",domain->subdomains[0].levels[0].dim.i>>level);}printf("       total\n");
   total=0;printf("smooth                 ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)       domain->cycles.smooth[level]);total+=       domain->cycles.smooth[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("residual               ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)     domain->cycles.residual[level]);total+=     domain->cycles.residual[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("restriction            ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)  domain->cycles.restriction[level]);total+=  domain->cycles.restriction[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("interpolation          ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.interpolation[level]);total+=domain->cycles.interpolation[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("applyOp                ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)     domain->cycles.apply_op[level]);total+=     domain->cycles.apply_op[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("BLAS1                  ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)        domain->cycles.blas1[level]);total+=        domain->cycles.blas1[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("BLAS3                  ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)        domain->cycles.blas3[level]);total+=        domain->cycles.blas3[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("communication          ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.communication[level]);total+=domain->cycles.communication[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("  local exchange       ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)    domain->cycles.grid2grid[level]);total+=    domain->cycles.grid2grid[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   #ifdef __MPI
   total=0;printf("  pack MPI buffers     ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)         domain->cycles.pack[level]);total+=         domain->cycles.pack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("  unpack MPI buffers   ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)       domain->cycles.unpack[level]);total+=       domain->cycles.unpack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("  MPI_Isend            ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)         domain->cycles.send[level]);total+=         domain->cycles.send[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("  MPI_Irecv            ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)         domain->cycles.recv[level]);total+=         domain->cycles.recv[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("  MPI_Waitall          ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)         domain->cycles.wait[level]);total+=         domain->cycles.wait[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   total=0;printf("  MPI_collectives      ");for(level=0;level<(numLevels  );level++){printf("%12.6f ",SecondsPerCycle*(double)  domain->cycles.collectives[level]);total+=  domain->cycles.collectives[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
   #endif
   total=0;printf("--------------         ");for(level=0;level<(numLevels+1);level++){printf("------------ ");}printf("\n");
   total=0;printf("Total by level         ");for(level=0;level<(numLevels  );level++){printf("%12.6f " ,SecondsPerCycle*(double)domain->cycles.Total[level]);total+=domain->cycles.Total[level];}
                                                                                      printf("%12.6f\n",SecondsPerCycle*(double)total);
   printf("\n");
   printf( "  Total time in MGBuild   %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGBuild);
   printf( "  Total time in MGSolve   %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGSolve);
   printf("              \" v-cycles  %12.6f\n",SecondsPerCycle*(double)domain->cycles.vcycles);
   printf( "      number of v-cycles  %12d\n"  ,domain->vcycles_performed/domain->MGSolves_performed);
   printf( "Bottom solver iterations  %12d\n"  ,domain->Krylov_iterations/domain->MGSolves_performed);
   #if defined(__USE_CABICGSTAB) || defined(__USE_CACG)
   printf( "     formations of G[][]  %12d\n"  ,domain->CAKrylov_formations_of_G/domain->MGSolves_performed);
   #endif
   printf("\n\n");fflush(stdout);
   #endif
 }


 //------------------------------------------------------------------------------------------------------------------------------
 void MGResetTimers(domain_type * domain){
   int level;
   for(level=0;level<10;level++){
   domain->cycles.smooth[level]        = 0;
   domain->cycles.apply_op[level]      = 0;
   domain->cycles.residual[level]      = 0;
   domain->cycles.communication[level] = 0;
   domain->cycles.restriction[level]   = 0;
   domain->cycles.interpolation[level] = 0;
   domain->cycles.blas1[level]         = 0;
   domain->cycles.blas3[level]         = 0;
   domain->cycles.pack[level]          = 0;
   domain->cycles.unpack[level]        = 0;
   domain->cycles.grid2grid[level]     = 0;
   domain->cycles.recv[level]          = 0;
   domain->cycles.send[level]          = 0;
   domain->cycles.wait[level]          = 0;
   domain->cycles.collectives[level]   = 0;
   domain->cycles.Total[level]         = 0;
   }
   domain->cycles.vcycles              = 0;
   domain->cycles.MGSolve              = 0;
   domain->vcycles_performed           = 0;
   domain->MGSolves_performed          = 0;
   domain->Krylov_iterations           = 0;
   domain->CAKrylov_formations_of_G    = 0;
 }
 //------------------------------------------------------------------------------------------------------------------------------
 void MGBuild(domain_type * domain, double a, double b, double h0){
   int box,level,numLevels = domain->numLevels;

   if(domain->rank==0){printf("MGBuild...                      ");fflush(stdout);}

   // initialize timers...
   MGResetTimers(domain);
   domain->cycles.MGBuild = 0;

   uint64_t _timeStartMGBuild = CycleTime();


   // calculate hLevel for all levels/boxes
   for(level=0;level<numLevels;level++){
     double hLevel = h0 * (double)(1<<level);
     domain->h[level] = hLevel;
     for(box=0;box<domain->subdomains_per_rank;box++){
       domain->subdomains[box].levels[level].h = hLevel;
   }}

   // alias all red/black masks to box0's - performance tweak to avoid having a single global list of RedBlack masks as a function of box size
   //for(level=0;level<numLevels-1;level++){
   for(level=0;level<numLevels;level++){
     for(box=1;box<domain->subdomains_per_rank;box++){
       domain->subdomains[box].levels[level].RedBlack_64bMask = domain->subdomains[0].levels[level].RedBlack_64bMask;
       domain->subdomains[box].levels[level].RedBlack_FP[0]   = domain->subdomains[0].levels[level].RedBlack_FP[0];
       domain->subdomains[box].levels[level].RedBlack_FP[1]   = domain->subdomains[0].levels[level].RedBlack_FP[1];
   }}


   // form all restrictions of alpha[] for all boxes...
   for(level=0;level<numLevels-1;level++){      restriction(domain,level,__alpha,__alpha);}
   for(level=0;level<numLevels  ;level++){exchange_boundary(domain,level,__alpha ,1,1,1);} // FIX, only necessary if CA version


   // form all restrictions of beta_*[] for all boxes...
   #if 0
   #warning cell centered beta's are first restricted then projected for faces
   for(level=0;level<numLevels-1;level++){restriction(domain,level,__beta,__beta);}
   for(level=0;level<numLevels;level++){
     exchange_boundary(domain,level,__beta,1,1,1);
     project_cell_to_face(domain,level,__beta,__beta_i,0);
     project_cell_to_face(domain,level,__beta,__beta_j,1);
     project_cell_to_face(domain,level,__beta,__beta_k,2);
   }
   #else
   #warning cell centered beta's are first projected to faces, then restricted to coarsest grid
      exchange_boundary(domain,0,__beta,1,1,1);
   project_cell_to_face(domain,0,__beta,__beta_i,0);
   project_cell_to_face(domain,0,__beta,__beta_j,1);
   project_cell_to_face(domain,0,__beta,__beta_k,2);
   for(level=0;level<numLevels;level++){
                          exchange_boundary(domain,level,__beta_i,1,1,1); // must communicate betas before precomputing lambda (i.e. need high values of beta from next subdomain)
                          exchange_boundary(domain,level,__beta_j,1,1,1);
                          exchange_boundary(domain,level,__beta_k,1,1,1);
     if(level<numLevels-1)restriction_betas(domain,level,level+1);
   }
   #endif


   // form all restrictions of lambda[] for all boxes...
   for(level=0;level<numLevels;level++){   rebuild_lambda(domain,level,a,b);}
   for(level=0;level<numLevels;level++){exchange_boundary(domain,level,__lambda,1,1,1);} // FIX, only necessary if CA version


 /*
   // find eigenvalue_max using the power method...
   // probably won't converge, so use slop factor(*1.1) in chebyshev smoother
   if(domain->rank==0){printf("\n");fflush(stdout);}
   for(level=0;level<numLevels;level++){
     int k;
     int bk   = __ee;              // arbitrary... just reusing a grid...
     int Abk = __temp;             // arbitrary... just reusing a grid...
     noise(domain,level,bk,-1,1);  // careful... an initial guess of all 1's gives a*alpha*I+b*beta*(1-1) = {a*alpha}'s
     double eigenvalue_max = 0.0;
     for(k=0;k<10;k++){
       apply_op(domain,level,Abk,bk,a,b); // eigenvalue of D^{-1}A == lambda A bk
       mul_grids(domain,level,Abk,1.0,Abk,__lambda);
       eigenvalue_max = dot(domain,level,bk,Abk)/dot(domain,level,bk,bk);
       double den = sqrt(dot(domain,level,Abk,Abk)); // normalize bk+1 by the magnitude of Abk
       scale_grid(domain,level,bk,1.0/den,Abk); // i.e. bk+1 = Abk / ||Abk||
     }
     if(domain->rank==0){printf("  level=%2d, eigenvalue_max ~= %13.3f\n",level,eigenvalue_max);fflush(stdout);}
     domain->dominant_eigenvalue_of_DinvA[level] = eigenvalue_max;
   }
   // find eigenvalue_min using the power method...
   if(domain->rank==0){printf("\n");fflush(stdout);}
   for(level=0;level<numLevels;level++){
     int k;
     int bk   = __ee;              // arbitrary... just reusing a grid...
     int Abk = __temp;             // arbitrary... just reusing a grid...
     noise(domain,level,bk,-1,1);  // careful... an initial guess of all 1's gives a*alpha*I+b*beta*(1-1) = {a*alpha}'s
     double eigenvalue_min = 0.0;
     for(k=0;k<10;k++){
       apply_op(domain,level,Abk,bk,a,b); // eigenvalue of D^{-1}A == lambda A bk
       mul_grids(domain,level,Abk,1.0,Abk,__lambda);
       add_grids(domain,level,Abk,1.0,Abk,-domain->dominant_eigenvalue_of_DinvA[level],bk); // Abk = (A-ev_max I)bk = Abk - ev_max*bk
       eigenvalue_min = dot(domain,level,bk,Abk)/dot(domain,level,bk,bk);
       double den = sqrt(dot(domain,level,Abk,Abk)); // normalize bk+1 by the magnitude of Abk
       scale_grid(domain,level,bk,1.0/den,Abk); // i.e. bk+1 = Abk / ||Abk||
     }
     eigenvalue_min += domain->dominant_eigenvalue_of_DinvA[level];
     if(domain->rank==0){printf("  level=%2d, eigenvalue_min ~= %13.3f\n",level,eigenvalue_min);fflush(stdout);}
   }
 */


   uint64_t delta = (CycleTime()-_timeStartMGBuild);
   domain->cycles.MGBuild += delta;

   if(domain->rank==0){printf("done\n");fflush(stdout);}
 }


 //------------------------------------------------------------------------------------------------------------------------------
 void MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm){
   domain->MGSolves_performed++;
   int level;
   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   int e_id = __u;
   int R_id = __f_minus_Av;
   int box,v;
   int numLevels  = domain->numLevels;
   #ifdef __TEST_MG_CONVERGENCE
   int maxVCycles       = 20;
   #else
   int maxVCycles       = 10;
   #endif

   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   if(domain->rank==0){printf("MGSolve...                      ");fflush(stdout);}
   uint64_t _timeStartMGSolve = CycleTime();

   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   // make initial guess for e (=0) and setup the RHS
   #if 1
    zero_grid(domain,0,e_id);                // ee = 0
   scale_grid(domain,0,R_id,1.0,F_id);       // R_id = F_id
   #else
    mul_grids(domain,0,e_id,1.0,F_id,__lambda);  // e_id = F_id*lambda
   scale_grid(domain,0,R_id,1.0,F_id);           // R_id = F_id
   #endif

   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   // now do v-cycles to calculate the correction...
   uint64_t _timeStartVCycle = CycleTime();
   for(v=0;v<maxVCycles;v++){
     domain->vcycles_performed++;

     // down the v-cycle...
     for(level=0;level<(domain->numLevels-1);level++){
       uint64_t _LevelStart = CycleTime();
       #ifdef __FUSION_RESIDUAL_RESTRICTION
       smooth(domain,level,e_id,R_id,a,b);
       residual_and_restriction(domain,level,e_id,R_id,level+1,R_id,a,b);
       zero_grid(domain,level+1,e_id);
       #else
            smooth(domain,level,e_id,R_id,a,b);
          residual(domain,level,__temp,e_id,R_id,a,b);
       restriction(domain,level,R_id,__temp);
         zero_grid(domain,level+1,e_id);
       #endif
       domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart);
     } // down


     // bottom solve...
     uint64_t _timeBottomStart = CycleTime();
     level = domain->numLevels-1;
     IterativeSolver(domain,level,e_id,R_id,a,b,__DEFAULT_BOTTOM_NORM);
     domain->cycles.Total[level] += (uint64_t)(CycleTime()-_timeBottomStart);


     // back up the v-cycle...
     for(level=(domain->numLevels-2);level>=0;level--){
       uint64_t _LevelStart = CycleTime();
       #ifdef __LINEAR_INTERPOLATION
          interpolation_linear(domain,level,1.0,e_id,e_id);
       #else
        interpolation_constant(domain,level,1.0,e_id,e_id);
       #endif
       smooth(domain,level,e_id,R_id,a,b);
       domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart);
     } // up


     // now calculate the norm of the residual...
     #if defined(__PRINT_NORM) || defined(__TEST_MG_CONVERGENCE)
       uint64_t _timeStart = CycleTime();
       level = 0;
       residual(domain,level,__temp,e_id,F_id,a,b);
       #if 1
       #warning Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria...
       mul_grids(domain,level,__temp,1.0,__temp,__lambda); /// <<< precondition by D^{-1} ???
       #endif
       double norm_of_residual = norm(domain,level,__temp);
       uint64_t _timeNorm = CycleTime();
       domain->cycles.Total[0] += (uint64_t)(_timeNorm-_timeStart);
       #if defined(__PRINT_NORM)
       if(domain->rank==0){
         if(v==0)printf("\n");
                 printf("v-cycle=%2d, norm=%22.20f (%12.6e)\n",v+1,norm_of_residual,norm_of_residual);
       }
       #endif
       #if defined(__TEST_MG_CONVERGENCE)
       if(norm_of_residual<desired_mg_norm)break;
       #endif
     #endif

   } // maxVCycles
   domain->cycles.vcycles += (uint64_t)(CycleTime()-_timeStartVCycle);
   domain->cycles.MGSolve += (uint64_t)(CycleTime()-_timeStartMGSolve);
   //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   if(domain->rank==0){printf("done\n");fflush(stdout);}
 }