blob: b5d8cb90c2a6c36a66fdb1bceaf557d8e43de1e0 [file] [log] [blame]
//------------------------------------------------------------------------------------------------------------------------------
// Samuel Williams
// SWWilliams@lbl.gov
// Lawrence Berkeley National Lab
//------------------------------------------------------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
//------------------------------------------------------------------------------------------------------------------------------
#ifdef __MPI
#include <mpi.h>
#endif
//------------------------------------------------------------------------------------------------------------------------------
#include "timer.h"
#include "defines.h"
#include "box.h"
#include "mg.h"
#include "operators.h"
#include "solver.h"
//------------------------------------------------------------------------------------------------------------------------------
int create_subdomain(subdomain_type * box, int subdomain_low_i, int subdomain_low_j, int subdomain_low_k,
int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k,
int numGrids, int ghosts, int numLevels){
int level;
uint64_t memory_allocated=0;
box->numLevels=numLevels;
box->ghosts=ghosts;
box->low.i = subdomain_low_i;
box->low.j = subdomain_low_j;
box->low.k = subdomain_low_k;
box->dim.i = subdomain_dim_i;
box->dim.j = subdomain_dim_j;
box->dim.k = subdomain_dim_k;
posix_memalign((void**)&(box->levels),64,numLevels*sizeof(box_type));
memory_allocated += numLevels*sizeof(box_type);
for(level=0;level<numLevels;level++){
int numGridsActual = numGrids;
if(level == (numLevels-1))numGridsActual+= IterativeSolver_NumGrids(); // the bottom solver may need a few auxillary grids...
memory_allocated += create_box(&box->levels[level],numGridsActual,subdomain_low_i>>level,subdomain_low_j>>level,subdomain_low_k>>level,
subdomain_dim_i>>level,subdomain_dim_j>>level,subdomain_dim_k>>level,ghosts);
}
return(memory_allocated);
}
void destroy_subdomain(subdomain_type * box){
int level;
for(level=0;level<box->numLevels;level++){
destroy_box(&box->levels[level]);
}
free(box->levels);
}
//------------------------------------------------------------------------------------------------------------------------------
int calculate_neighboring_subdomain_index(domain_type *domain, int bi, int bj, int bk, int di, int dj, int dk){
int ni,nj,nk;
ni=(bi+domain->subdomains_per_rank_in.i+di)%domain->subdomains_per_rank_in.i;
nj=(bj+domain->subdomains_per_rank_in.j+dj)%domain->subdomains_per_rank_in.j;
nk=(bk+domain->subdomains_per_rank_in.k+dk)%domain->subdomains_per_rank_in.k;
int index = ni + nj*domain->subdomains_per_rank_in.i + nk*domain->subdomains_per_rank_in.i*domain->subdomains_per_rank_in.j;
return(index);
}
int calculate_neighboring_subdomain_rank(domain_type * domain, int bi, int bj, int bk, int di, int dj, int dk, int ri, int rj, int rk){
if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){
if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) < 0 )return(-1);
if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) >= domain->subdomains_in.i )return(-1);
}
if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){
if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) < 0 )return(-1);
if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) >= domain->subdomains_in.j )return(-1);
}
if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){
if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) < 0 )return(-1);
if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) >= domain->subdomains_in.k )return(-1);
}
// if didn't walk off domain, calculate new rank for periodic boundaries...
if((bi+di)<0)ri--;if((bi+di)>=domain->subdomains_per_rank_in.i)ri++;ri=(ri+domain->ranks_in.i)%domain->ranks_in.i;
if((bj+dj)<0)rj--;if((bj+dj)>=domain->subdomains_per_rank_in.j)rj++;rj=(rj+domain->ranks_in.j)%domain->ranks_in.j;
if((bk+dk)<0)rk--;if((bk+dk)>=domain->subdomains_per_rank_in.k)rk++;rk=(rk+domain->ranks_in.k)%domain->ranks_in.k;
int rank = ri + rj*domain->ranks_in.i + rk*domain->ranks_in.i*domain->ranks_in.j;
return(rank);
}
//------------------------------------------------------------------------------------------------------------------------------
int create_domain(domain_type * domain,
int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k,
int subdomains_per_rank_in_i, int subdomains_per_rank_in_j, int subdomains_per_rank_in_k,
int ranks_in_i, int ranks_in_j, int ranks_in_k,
int rank,
int *boundary_conditions,
int numGrids, int ghosts, int numLevels
){
int level;
int i, j, k;
int di,dj,dk;
// FIX ghosts = array (varies with level)
domain->rank = rank;
if(domain->rank==0){printf("creating domain... ");fflush(stdout);}
//FIX, limit ghosts based on coarsest problem size
if(ghosts>(subdomain_dim_i>>(numLevels-1))){if(domain->rank==0)printf("#ghosts(%d)>bottom grid size(%d)\n",ghosts,subdomain_dim_i>>(numLevels-1));exit(0);}
if( (subdomain_dim_i!=subdomain_dim_j)||(subdomain_dim_j!=subdomain_dim_k)||(subdomain_dim_i!=subdomain_dim_k) ){
if(domain->rank==0)printf("subdomain_dim's must be equal\n");exit(0);
}
uint64_t memory_allocated =0;
// processes are laid out in x, then y, then z
//int ranks = ranks_in_i*ranks_in_j*ranks_in_k;
int my_k_rank = (rank ) / (ranks_in_i*ranks_in_j);
int my_j_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank) ) / (ranks_in_i );
int my_i_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank) - (ranks_in_i*my_j_rank) );
//printf("%2d: (%2d,%2d,%2d)\n",domain->rank,my_k_rank,my_j_rank,my_i_rank);
domain->ranks_in.i = ranks_in_i;
domain->ranks_in.j = ranks_in_j;
domain->ranks_in.k = ranks_in_k;
domain->subdomains_per_rank_in.i = subdomains_per_rank_in_i;
domain->subdomains_per_rank_in.j = subdomains_per_rank_in_j;
domain->subdomains_per_rank_in.k = subdomains_per_rank_in_k;
domain->subdomains_in.i = subdomains_per_rank_in_i*ranks_in_i;
domain->subdomains_in.j = subdomains_per_rank_in_j*ranks_in_j;
domain->subdomains_in.k = subdomains_per_rank_in_k*ranks_in_k;
domain->subdomains_per_rank = subdomains_per_rank_in_i*subdomains_per_rank_in_j*subdomains_per_rank_in_k;
posix_memalign((void**)&(domain->subdomains),64,domain->subdomains_per_rank*sizeof(subdomain_type));
memory_allocated+=domain->subdomains_per_rank*sizeof(subdomain_type);
domain->dim.i = domain->subdomains_in.i * subdomain_dim_i;
domain->dim.j = domain->subdomains_in.j * subdomain_dim_j;
domain->dim.k = domain->subdomains_in.k * subdomain_dim_k;
domain->boundary_condition.i = boundary_conditions[0];
domain->boundary_condition.j = boundary_conditions[1];
domain->boundary_condition.k = boundary_conditions[2];
domain->numLevels = numLevels;
domain->numGrids = numGrids;
domain->ghosts = ghosts;
for(level=0;level<domain->numLevels;level++){
domain->h[level] = -1;
domain->dominant_eigenvalue_of_DinvA[level] = -1;
}
// calculate the neighbors of this process
for(dk=-1;dk<=1;dk++){
for(dj=-1;dj<=1;dj++){
for(di=-1;di<=1;di++){
int n = 13+di+3*dj+9*dk;
int neighbor_rank_in_i = (my_i_rank+di+ranks_in_i)%ranks_in_i;
int neighbor_rank_in_j = (my_j_rank+dj+ranks_in_j)%ranks_in_j;
int neighbor_rank_in_k = (my_k_rank+dk+ranks_in_k)%ranks_in_k;
domain->rank_of_neighbor[n] = neighbor_rank_in_i + ranks_in_i*neighbor_rank_in_j + ranks_in_i*ranks_in_j*neighbor_rank_in_k;
if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){
if( (my_i_rank+di) < 0 )domain->rank_of_neighbor[n] = -1;
if( (my_i_rank+di) >= ranks_in_i )domain->rank_of_neighbor[n] = -1;
}
if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){
if( (my_j_rank+dj) < 0 )domain->rank_of_neighbor[n] = -1;
if( (my_j_rank+dj) >= ranks_in_j )domain->rank_of_neighbor[n] = -1;
}
if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){
if( (my_k_rank+dk) < 0 )domain->rank_of_neighbor[n] = -1;
if( (my_k_rank+dk) >= ranks_in_k )domain->rank_of_neighbor[n] = -1;
}
}}}
// subdomains within a process are laid out in i, then j, then k
for(k=0;k<subdomains_per_rank_in_k;k++){
for(j=0;j<subdomains_per_rank_in_j;j++){
for(i=0;i<subdomains_per_rank_in_i;i++){
int box = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
int low_i = subdomain_dim_i * (i + subdomains_per_rank_in_i*my_i_rank);
int low_j = subdomain_dim_j * (j + subdomains_per_rank_in_j*my_j_rank);
int low_k = subdomain_dim_k * (k + subdomains_per_rank_in_k*my_k_rank);
memory_allocated += create_subdomain(&domain->subdomains[box],low_i,low_j,low_k,subdomain_dim_i,subdomain_dim_j,subdomain_dim_k,numGrids,ghosts,numLevels);
for(dk=-1;dk<=1;dk++){
for(dj=-1;dj<=1;dj++){
for(di=-1;di<=1;di++){
int n = 13+di+3*dj+9*dk;
domain->subdomains[box].neighbors[n].rank = calculate_neighboring_subdomain_rank(domain,i,j,k,di,dj,dk,my_i_rank,my_j_rank,my_k_rank);
domain->subdomains[box].neighbors[n].local_index = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk);
// FIX boundary conditions??
//printf("rank=%2d, box[%2d].neighbors[%3d]=(%3d,%3d)\n",domain->rank,box,n,domain->subdomains[box].neighbors[n].rank,domain->subdomains[box].neighbors[n].local_index);
}}}
}}}
int FacesEdgesCorners[27] = {4,10,12,14,16,22, 1,3,5,7,9,11,15,17,19,21,23,25, 0,2,6,8,18,20,24,26, 13};
int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0};
int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0};
int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1};
#ifdef __MPI
// allocate MPI send/recv buffers for the 26 neighbors....
for(dk=-1;dk<=1;dk++){
for(dj=-1;dj<=1;dj++){
for(di=-1;di<=1;di++){
int n = 13+di+3*dj+9*dk;if(n!=13){
int bufSize = 1;
if(di==0)bufSize*=domain->subdomains_per_rank_in.i*subdomain_dim_i;else bufSize*=ghosts;
if(dj==0)bufSize*=domain->subdomains_per_rank_in.j*subdomain_dim_j;else bufSize*=ghosts;
if(dk==0)bufSize*=domain->subdomains_per_rank_in.k*subdomain_dim_k;else bufSize*=ghosts;
posix_memalign((void**)&(domain->send_buffer[n]),64,bufSize*sizeof(double));
posix_memalign((void**)&(domain->recv_buffer[n]),64,bufSize*sizeof(double));
memset(domain->send_buffer[n],0,bufSize*sizeof(double));
memset(domain->recv_buffer[n],0,bufSize*sizeof(double));
memory_allocated+=bufSize*sizeof(double);
memory_allocated+=bufSize*sizeof(double);
}}}}
#endif
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// now enumerate a list of data transfers to effect a ghost zone exchange (boundary conditions are skipped)
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
for(level=0;level<domain->numLevels;level++){
int initialize;for(initialize=0;initialize<2;initialize++){ // initialize=0=count/malloc. initialize=1=initialize
int buffer=0;
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Traverse list of boxes, and create MPI pack for faces, edges, and corners
domain->bufferCopy_Pack_Start=buffer;
int nn;
#ifdef __MPI
for(nn=0;nn<26;nn++){
int n = FacesEdgesCorners[nn];
int di = ((n/1)%3) - 1;
int dj = ((n/3)%3) - 1;
int dk = ((n/9)%3) - 1;
// traverse all boxes in direction di/dj/dk... (e.g. all subdomains on the corresponding face)
int sd_i_lo,sd_i_hi;
int sd_j_lo,sd_j_hi;
int sd_k_lo,sd_k_hi;
switch(di){
case -1:sd_i_lo= 0;sd_i_hi= 1;break;
case 0:sd_i_lo= 0;sd_i_hi=subdomains_per_rank_in_i;break;
case 1:sd_i_lo=subdomains_per_rank_in_i-1;sd_i_hi=subdomains_per_rank_in_i;break;
};
switch(dj){
case -1:sd_j_lo= 0;sd_j_hi= 1;break;
case 0:sd_j_lo= 0;sd_j_hi=subdomains_per_rank_in_j;break;
case 1:sd_j_lo=subdomains_per_rank_in_j-1;sd_j_hi=subdomains_per_rank_in_j;break;
};
switch(dk){
case -1:sd_k_lo= 0;sd_k_hi= 1;break;
case 0:sd_k_lo= 0;sd_k_hi=subdomains_per_rank_in_k;break;
case 1:sd_k_lo=subdomains_per_rank_in_k-1;sd_k_hi=subdomains_per_rank_in_k;break;
};
for(k=sd_k_lo;k<sd_k_hi;k++){
for(j=sd_j_lo;j<sd_j_hi;j++){
for(i=sd_i_lo;i<sd_i_hi;i++){
int off_node_exchange=1;
if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == domain->rank)off_node_exchange=0; // same process
if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == -1)off_node_exchange=0; // domain boundary
if(off_node_exchange){
int sendBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
int grid_i,grid_j,grid_k; // ijk in the source grid (relative to first ghost zone element)
int dim_i, dim_j, dim_k; // dimensions of face/edge/corner
switch(di){
case -1:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;break;
case 0:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; break;
case 1:grid_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;break;
};
switch(dj){
case -1:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;break;
case 0:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; break;
case 1:grid_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;break;
};
switch(dk){
case -1:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;break;
case 0:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; break;
case 1:grid_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;break;
};
int buf_i = dim_i*(i-sd_i_lo); // ijk in the target buffer
int buf_j = dim_j*(j-sd_j_lo);
int buf_k = dim_k*(k-sd_k_lo);
int buf_pencil = dim_i*(sd_i_hi-sd_i_lo);
int buf_plane = dim_j*(sd_j_hi-sd_j_lo)*buf_pencil;
// FIX, could be broken into sub buffers for more parallelism (i.e. pencils)
if(initialize==1){
domain->bufferCopies[level][buffer].dim.i = dim_i;
domain->bufferCopies[level][buffer].dim.j = dim_j;
domain->bufferCopies[level][buffer].dim.k = dim_k;
domain->bufferCopies[level][buffer].read.box = sendBox;
domain->bufferCopies[level][buffer].read.ptr = NULL;
domain->bufferCopies[level][buffer].read.i = grid_i;
domain->bufferCopies[level][buffer].read.j = grid_j;
domain->bufferCopies[level][buffer].read.k = grid_k;
domain->bufferCopies[level][buffer].read.pencil = domain->subdomains[sendBox].levels[level].pencil;
domain->bufferCopies[level][buffer].read.plane = domain->subdomains[sendBox].levels[level].plane;
domain->bufferCopies[level][buffer].write.box = -1;
domain->bufferCopies[level][buffer].write.ptr = domain->send_buffer[n];
domain->bufferCopies[level][buffer].write.i = buf_i;
domain->bufferCopies[level][buffer].write.j = buf_j;
domain->bufferCopies[level][buffer].write.k = buf_k;
domain->bufferCopies[level][buffer].write.pencil = buf_pencil;
domain->bufferCopies[level][buffer].write.plane = buf_plane;
domain->bufferCopies[level][buffer].isFace = faces[n];
domain->bufferCopies[level][buffer].isEdge = edges[n];
domain->bufferCopies[level][buffer].isCorner = corners[n];
}
//else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in MPI[%2d,%2d,%2d] (%3d,%4d)\n",level,buffer,dim_i,dim_j,dim_k,grid_i,grid_j,grid_k,sendBox,buf_i,buf_j,buf_k,di,dj,dk,buf_pencil,buf_plane);
buffer++;
} // off-process neighbor
}}} // subdomains
} // directions
#endif
domain->bufferCopy_Pack_End=buffer;
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Traverse list of boxes, and create local exchange/domain copies for faces, edges, and corners
domain->bufferCopy_Local_Start=buffer;
for(nn=0;nn<26;nn++){
int n = FacesEdgesCorners[nn];
int di = ((n/1)%3) - 1;
int dj = ((n/3)%3) - 1;
int dk = ((n/9)%3) - 1;
for(k=0;k<subdomains_per_rank_in_k;k++){
for(j=0;j<subdomains_per_rank_in_j;j++){
for(i=0;i<subdomains_per_rank_in_i;i++){
int sendBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
int on_node_exchange = 1;
if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) != domain->rank)on_node_exchange=0; // if == -1, then domain boundary and thus not on_node
if(on_node_exchange){
int recvBox = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk);
int send_i,send_j,send_k; // ijk in the source grid (relative to first ghost zone element)
int recv_i,recv_j,recv_k; // ijk in the destination grid (relative to first ghost zone element)
int dim_i, dim_j, dim_k; // dimensions of face/edge/corner
switch(di){
case -1:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break;
case 0:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; recv_i=domain->subdomains[recvBox].levels[level].ghosts;break;
case 1:send_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=0;break;
};
switch(dj){
case -1:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break;
case 0:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; recv_j=domain->subdomains[recvBox].levels[level].ghosts;break;
case 1:send_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=0;break;
};
switch(dk){
case -1:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break;
case 0:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; recv_k=domain->subdomains[recvBox].levels[level].ghosts;break;
case 1:send_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=0;break;
};
// FIX, could be broken into sub buffers for more parallelism (i.e. pencils)
if(initialize==1){
domain->bufferCopies[level][buffer].dim.i = dim_i;
domain->bufferCopies[level][buffer].dim.j = dim_j;
domain->bufferCopies[level][buffer].dim.k = dim_k;
domain->bufferCopies[level][buffer].read.box = sendBox;
domain->bufferCopies[level][buffer].read.ptr = NULL;
domain->bufferCopies[level][buffer].read.i = send_i;
domain->bufferCopies[level][buffer].read.j = send_j;
domain->bufferCopies[level][buffer].read.k = send_k;
domain->bufferCopies[level][buffer].read.pencil = domain->subdomains[sendBox].levels[level].pencil;
domain->bufferCopies[level][buffer].read.plane = domain->subdomains[sendBox].levels[level].plane;
domain->bufferCopies[level][buffer].write.box = recvBox;
domain->bufferCopies[level][buffer].write.ptr = NULL;
domain->bufferCopies[level][buffer].write.i = recv_i;
domain->bufferCopies[level][buffer].write.j = recv_j;
domain->bufferCopies[level][buffer].write.k = recv_k;
domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil;
domain->bufferCopies[level][buffer].write.plane = domain->subdomains[recvBox].levels[level].plane;
domain->bufferCopies[level][buffer].isFace = faces[n];
domain->bufferCopies[level][buffer].isEdge = edges[n];
domain->bufferCopies[level][buffer].isCorner = corners[n];
}
//else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in box=%3d\n",level,buffer,dim_i,dim_j,dim_k,send_i,send_j,send_k,sendBox,recv_i,recv_j,recv_k,recvBox);
buffer++;
}
}}} // all local subdomains
} // directions
domain->bufferCopy_Local_End=buffer;
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Traverse list of boxes and their neighbors and gather data from buffers...
// faces come from face buffers
// edges can come from either edge buffers or face buffers
// corners can come from either corner buffers, edge buffers or face buffers
domain->bufferCopy_Unpack_Start=buffer;
#ifdef __MPI
for(nn=0;nn<26;nn++){
int n = FacesEdgesCorners[nn];
int di = ((n/1)%3) - 1;
int dj = ((n/3)%3) - 1;
int dk = ((n/9)%3) - 1;
for(k=0;k<subdomains_per_rank_in_k;k++){
for(j=0;j<subdomains_per_rank_in_j;j++){
for(i=0;i<subdomains_per_rank_in_i;i++){
int recvBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j;
int off_node_exchange = 1;
if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == domain->rank)off_node_exchange=0; // same node
if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == -1)off_node_exchange=0; // domain boundary
if(off_node_exchange){
int _di=0;
int _dj=0;
int _dk=0;
if((i+di)<0){_di=-1;}else if((i+di)>=subdomains_per_rank_in_i){_di=1;}
if((j+dj)<0){_dj=-1;}else if((j+dj)>=subdomains_per_rank_in_j){_dj=1;}
if((k+dk)<0){_dk=-1;}else if((k+dk)>=subdomains_per_rank_in_k){_dk=1;}
int _n = 13 + _di + 3*_dj + 9*_dk;
int dim_i, dim_j, dim_k; // dimensions of face/edge/corner
int grid_i,grid_j,grid_k; // ijk in the destination grid (relative to first ghost zone element)
int buf_i, buf_j, buf_k; // ijk in the source buffer
int buf_plane =1;
int buf_pencil=1;
switch(di){
case -1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=0; break;
case 0:dim_i=domain->subdomains[recvBox].levels[level].dim.i; grid_i=domain->subdomains[recvBox].levels[level].ghosts; break;
case 1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break;
};
switch(dj){
case -1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=0; break;
case 0:dim_j=domain->subdomains[recvBox].levels[level].dim.j; grid_j=domain->subdomains[recvBox].levels[level].ghosts; break;
case 1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break;
};
switch(dk){
case -1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=0; break;
case 0:dim_k=domain->subdomains[recvBox].levels[level].dim.k; grid_k=domain->subdomains[recvBox].levels[level].ghosts; break;
case 1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break;
};
switch(_di){
case -1:buf_i=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;buf_pencil= domain->subdomains[recvBox].levels[level].ghosts;break;
case 0:buf_i=i*domain->subdomains[recvBox].levels[level].dim.i;buf_plane*=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; buf_pencil=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; break;
case 1:buf_i=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;buf_pencil= domain->subdomains[recvBox].levels[level].ghosts;break;
};
switch(_dj){
case -1:buf_j=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;break;
case 0:buf_j=j*domain->subdomains[recvBox].levels[level].dim.j;buf_plane*=subdomains_per_rank_in_j*domain->subdomains[recvBox].levels[level].dim.j; break;
case 1:buf_j=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;break;
};
switch(_dk){
case -1:buf_k=0; break;
case 0:buf_k=k*domain->subdomains[recvBox].levels[level].dim.k;break;
case 1:buf_k=0; break;
};
if( (i+di>=0) && (i+di<subdomains_per_rank_in_i) ){if(di<0)buf_i-=domain->subdomains[recvBox].levels[level].ghosts;else if(di>0)buf_i+=domain->subdomains[recvBox].levels[level].dim.i;}// forward by dim.i or back by ghosts
if( (j+dj>=0) && (j+dj<subdomains_per_rank_in_j) ){if(dj<0)buf_j-=domain->subdomains[recvBox].levels[level].ghosts;else if(dj>0)buf_j+=domain->subdomains[recvBox].levels[level].dim.j;}// forward by dim.j or back by ghosts
if( (k+dk>=0) && (k+dk<subdomains_per_rank_in_k) ){if(dk<0)buf_k-=domain->subdomains[recvBox].levels[level].ghosts;else if(dk>0)buf_k+=domain->subdomains[recvBox].levels[level].dim.k;}// forward by dim.k or back by ghosts
// FIX, could be broken into sub buffers for more parallelism (i.e. pencils)
if(initialize==1){
domain->bufferCopies[level][buffer].dim.i = dim_i;
domain->bufferCopies[level][buffer].dim.j = dim_j;
domain->bufferCopies[level][buffer].dim.k = dim_k;
domain->bufferCopies[level][buffer].read.box = -1;
domain->bufferCopies[level][buffer].read.ptr = domain->recv_buffer[_n];
domain->bufferCopies[level][buffer].read.i = buf_i;
domain->bufferCopies[level][buffer].read.j = buf_j;
domain->bufferCopies[level][buffer].read.k = buf_k;
domain->bufferCopies[level][buffer].read.pencil = buf_pencil;
domain->bufferCopies[level][buffer].read.plane = buf_plane;
domain->bufferCopies[level][buffer].write.box = recvBox;
domain->bufferCopies[level][buffer].write.ptr = NULL;
domain->bufferCopies[level][buffer].write.i = grid_i;
domain->bufferCopies[level][buffer].write.j = grid_j;
domain->bufferCopies[level][buffer].write.k = grid_k;
domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil;
domain->bufferCopies[level][buffer].write.plane = domain->subdomains[recvBox].levels[level].plane;
domain->bufferCopies[level][buffer].isFace = faces[n];
domain->bufferCopies[level][buffer].isEdge = edges[n];
domain->bufferCopies[level][buffer].isCorner = corners[n];
}
//else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from MPI[%2d,%2d,%2d] (%3d,%4d) to box=%3d at (%3d,%3d,%3d)\n",level,buffer,dim_i,dim_j,dim_k,buf_i,buf_j,buf_k,_di,_dj,_dk,buf_pencil,buf_plane,recvBox,grid_i,grid_j,grid_k);
buffer++;
}
}}}
}
#endif
domain->bufferCopy_Unpack_End=buffer;
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// At the end of initialize 0, malloc the buffers
if(initialize==0){
domain->bufferCopies[level] = (bufferCopy_type*)malloc(buffer*sizeof(bufferCopy_type));
memory_allocated+=buffer*sizeof(bufferCopy_type);
}
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
}}
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if(domain->rank==0){
printf("done\n");fflush(stdout);
printf(" %d x %d x %d (per subdomain)\n",subdomain_dim_i,subdomain_dim_j,subdomain_dim_k);
printf(" %d x %d x %d (per process)\n",subdomains_per_rank_in_i*subdomain_dim_i,subdomains_per_rank_in_j*subdomain_dim_j,subdomains_per_rank_in_k*subdomain_dim_k);
printf(" %d x %d x %d (overall)\n",ranks_in_i*subdomains_per_rank_in_i*subdomain_dim_i,ranks_in_j*subdomains_per_rank_in_j*subdomain_dim_j,ranks_in_k*subdomains_per_rank_in_k*subdomain_dim_k);
printf(" %d-deep ghost zones\n",ghosts);
printf(" allocated %llu MB\n",memory_allocated>>20);
fflush(stdout);
}
return(memory_allocated);
}
void destroy_domain(domain_type * domain){
if(domain->rank==0){printf("deallocating domain... ");fflush(stdout);}
int box;for(box=0;box<domain->subdomains_per_rank;box++){
destroy_subdomain(&domain->subdomains[box]);
}
free(domain->subdomains);
// FIX, free buffer_copies, etc...
if(domain->rank==0){printf("done\n");fflush(stdout);}
}
//------------------------------------------------------------------------------------------------------------------------------
void print_timing(domain_type *domain){
#ifdef TIMING
int level,numLevels = domain->numLevels;
uint64_t _timeStart=CycleTime();sleep(1);uint64_t _timeEnd=CycleTime();
double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart);
SecondsPerCycle = SecondsPerCycle/(double)domain->MGSolves_performed; // prints average performance per MGSolve
if(domain->rank!=0)return;
uint64_t total;
printf(" ");for(level=0;level<(numLevels );level++){printf("%12d ",level);}printf("\n");
printf(" ");for(level=0;level<(numLevels );level++){printf("%10d^3 ",domain->subdomains[0].levels[0].dim.i>>level);}printf(" total\n");
total=0;printf("smooth ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.smooth[level]);total+= domain->cycles.smooth[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf("residual ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.residual[level]);total+= domain->cycles.residual[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf("restriction ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.restriction[level]);total+= domain->cycles.restriction[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf("interpolation ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.interpolation[level]);total+=domain->cycles.interpolation[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf("applyOp ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.apply_op[level]);total+= domain->cycles.apply_op[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf("BLAS1 ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.blas1[level]);total+= domain->cycles.blas1[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf("BLAS3 ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.blas3[level]);total+= domain->cycles.blas3[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf("communication ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.communication[level]);total+=domain->cycles.communication[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf(" local exchange ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.grid2grid[level]);total+= domain->cycles.grid2grid[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
#ifdef __MPI
total=0;printf(" pack MPI buffers ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.pack[level]);total+= domain->cycles.pack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf(" unpack MPI buffers ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.unpack[level]);total+= domain->cycles.unpack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf(" MPI_Isend ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.send[level]);total+= domain->cycles.send[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf(" MPI_Irecv ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.recv[level]);total+= domain->cycles.recv[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf(" MPI_Waitall ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.wait[level]);total+= domain->cycles.wait[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
total=0;printf(" MPI_collectives ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.collectives[level]);total+= domain->cycles.collectives[level];}printf("%12.6f\n",SecondsPerCycle*(double)total);
#endif
total=0;printf("-------------- ");for(level=0;level<(numLevels+1);level++){printf("------------ ");}printf("\n");
total=0;printf("Total by level ");for(level=0;level<(numLevels );level++){printf("%12.6f " ,SecondsPerCycle*(double)domain->cycles.Total[level]);total+=domain->cycles.Total[level];}
printf("%12.6f\n",SecondsPerCycle*(double)total);
printf("\n");
printf( " Total time in MGBuild %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGBuild);
printf( " Total time in MGSolve %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGSolve);
printf(" \" v-cycles %12.6f\n",SecondsPerCycle*(double)domain->cycles.vcycles);
printf( " number of v-cycles %12d\n" ,domain->vcycles_performed/domain->MGSolves_performed);
printf( "Bottom solver iterations %12d\n" ,domain->Krylov_iterations/domain->MGSolves_performed);
#if defined(__USE_CABICGSTAB) || defined(__USE_CACG)
printf( " formations of G[][] %12d\n" ,domain->CAKrylov_formations_of_G/domain->MGSolves_performed);
#endif
printf("\n\n");fflush(stdout);
#endif
}
//------------------------------------------------------------------------------------------------------------------------------
void MGResetTimers(domain_type * domain){
int level;
for(level=0;level<10;level++){
domain->cycles.smooth[level] = 0;
domain->cycles.apply_op[level] = 0;
domain->cycles.residual[level] = 0;
domain->cycles.communication[level] = 0;
domain->cycles.restriction[level] = 0;
domain->cycles.interpolation[level] = 0;
domain->cycles.blas1[level] = 0;
domain->cycles.blas3[level] = 0;
domain->cycles.pack[level] = 0;
domain->cycles.unpack[level] = 0;
domain->cycles.grid2grid[level] = 0;
domain->cycles.recv[level] = 0;
domain->cycles.send[level] = 0;
domain->cycles.wait[level] = 0;
domain->cycles.collectives[level] = 0;
domain->cycles.Total[level] = 0;
}
domain->cycles.vcycles = 0;
domain->cycles.MGSolve = 0;
domain->vcycles_performed = 0;
domain->MGSolves_performed = 0;
domain->Krylov_iterations = 0;
domain->CAKrylov_formations_of_G = 0;
}
//------------------------------------------------------------------------------------------------------------------------------
void MGBuild(domain_type * domain, double a, double b, double h0){
int box,level,numLevels = domain->numLevels;
if(domain->rank==0){printf("MGBuild... ");fflush(stdout);}
// initialize timers...
MGResetTimers(domain);
domain->cycles.MGBuild = 0;
uint64_t _timeStartMGBuild = CycleTime();
// calculate hLevel for all levels/boxes
for(level=0;level<numLevels;level++){
double hLevel = h0 * (double)(1<<level);
domain->h[level] = hLevel;
for(box=0;box<domain->subdomains_per_rank;box++){
domain->subdomains[box].levels[level].h = hLevel;
}}
// alias all red/black masks to box0's - performance tweak to avoid having a single global list of RedBlack masks as a function of box size
//for(level=0;level<numLevels-1;level++){
for(level=0;level<numLevels;level++){
for(box=1;box<domain->subdomains_per_rank;box++){
domain->subdomains[box].levels[level].RedBlack_64bMask = domain->subdomains[0].levels[level].RedBlack_64bMask;
domain->subdomains[box].levels[level].RedBlack_FP[0] = domain->subdomains[0].levels[level].RedBlack_FP[0];
domain->subdomains[box].levels[level].RedBlack_FP[1] = domain->subdomains[0].levels[level].RedBlack_FP[1];
}}
// form all restrictions of alpha[] for all boxes...
for(level=0;level<numLevels-1;level++){ restriction(domain,level,__alpha,__alpha);}
for(level=0;level<numLevels ;level++){exchange_boundary(domain,level,__alpha ,1,1,1);} // FIX, only necessary if CA version
// form all restrictions of beta_*[] for all boxes...
#if 0
#warning cell centered beta's are first restricted then projected for faces
for(level=0;level<numLevels-1;level++){restriction(domain,level,__beta,__beta);}
for(level=0;level<numLevels;level++){
exchange_boundary(domain,level,__beta,1,1,1);
project_cell_to_face(domain,level,__beta,__beta_i,0);
project_cell_to_face(domain,level,__beta,__beta_j,1);
project_cell_to_face(domain,level,__beta,__beta_k,2);
}
#else
#warning cell centered beta's are first projected to faces, then restricted to coarsest grid
exchange_boundary(domain,0,__beta,1,1,1);
project_cell_to_face(domain,0,__beta,__beta_i,0);
project_cell_to_face(domain,0,__beta,__beta_j,1);
project_cell_to_face(domain,0,__beta,__beta_k,2);
for(level=0;level<numLevels;level++){
exchange_boundary(domain,level,__beta_i,1,1,1); // must communicate betas before precomputing lambda (i.e. need high values of beta from next subdomain)
exchange_boundary(domain,level,__beta_j,1,1,1);
exchange_boundary(domain,level,__beta_k,1,1,1);
if(level<numLevels-1)restriction_betas(domain,level,level+1);
}
#endif
// form all restrictions of lambda[] for all boxes...
for(level=0;level<numLevels;level++){ rebuild_lambda(domain,level,a,b);}
for(level=0;level<numLevels;level++){exchange_boundary(domain,level,__lambda,1,1,1);} // FIX, only necessary if CA version
/*
// find eigenvalue_max using the power method...
// probably won't converge, so use slop factor(*1.1) in chebyshev smoother
if(domain->rank==0){printf("\n");fflush(stdout);}
for(level=0;level<numLevels;level++){
int k;
int bk = __ee; // arbitrary... just reusing a grid...
int Abk = __temp; // arbitrary... just reusing a grid...
noise(domain,level,bk,-1,1); // careful... an initial guess of all 1's gives a*alpha*I+b*beta*(1-1) = {a*alpha}'s
double eigenvalue_max = 0.0;
for(k=0;k<10;k++){
apply_op(domain,level,Abk,bk,a,b); // eigenvalue of D^{-1}A == lambda A bk
mul_grids(domain,level,Abk,1.0,Abk,__lambda);
eigenvalue_max = dot(domain,level,bk,Abk)/dot(domain,level,bk,bk);
double den = sqrt(dot(domain,level,Abk,Abk)); // normalize bk+1 by the magnitude of Abk
scale_grid(domain,level,bk,1.0/den,Abk); // i.e. bk+1 = Abk / ||Abk||
}
if(domain->rank==0){printf(" level=%2d, eigenvalue_max ~= %13.3f\n",level,eigenvalue_max);fflush(stdout);}
domain->dominant_eigenvalue_of_DinvA[level] = eigenvalue_max;
}
// find eigenvalue_min using the power method...
if(domain->rank==0){printf("\n");fflush(stdout);}
for(level=0;level<numLevels;level++){
int k;
int bk = __ee; // arbitrary... just reusing a grid...
int Abk = __temp; // arbitrary... just reusing a grid...
noise(domain,level,bk,-1,1); // careful... an initial guess of all 1's gives a*alpha*I+b*beta*(1-1) = {a*alpha}'s
double eigenvalue_min = 0.0;
for(k=0;k<10;k++){
apply_op(domain,level,Abk,bk,a,b); // eigenvalue of D^{-1}A == lambda A bk
mul_grids(domain,level,Abk,1.0,Abk,__lambda);
add_grids(domain,level,Abk,1.0,Abk,-domain->dominant_eigenvalue_of_DinvA[level],bk); // Abk = (A-ev_max I)bk = Abk - ev_max*bk
eigenvalue_min = dot(domain,level,bk,Abk)/dot(domain,level,bk,bk);
double den = sqrt(dot(domain,level,Abk,Abk)); // normalize bk+1 by the magnitude of Abk
scale_grid(domain,level,bk,1.0/den,Abk); // i.e. bk+1 = Abk / ||Abk||
}
eigenvalue_min += domain->dominant_eigenvalue_of_DinvA[level];
if(domain->rank==0){printf(" level=%2d, eigenvalue_min ~= %13.3f\n",level,eigenvalue_min);fflush(stdout);}
}
*/
uint64_t delta = (CycleTime()-_timeStartMGBuild);
domain->cycles.MGBuild += delta;
if(domain->rank==0){printf("done\n");fflush(stdout);}
}
//------------------------------------------------------------------------------------------------------------------------------
void MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm){
domain->MGSolves_performed++;
int level;
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int e_id = __u;
int R_id = __f_minus_Av;
int box,v;
int numLevels = domain->numLevels;
#ifdef __TEST_MG_CONVERGENCE
int maxVCycles = 20;
#else
int maxVCycles = 10;
#endif
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if(domain->rank==0){printf("MGSolve... ");fflush(stdout);}
uint64_t _timeStartMGSolve = CycleTime();
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// make initial guess for e (=0) and setup the RHS
#if 1
zero_grid(domain,0,e_id); // ee = 0
scale_grid(domain,0,R_id,1.0,F_id); // R_id = F_id
#else
mul_grids(domain,0,e_id,1.0,F_id,__lambda); // e_id = F_id*lambda
scale_grid(domain,0,R_id,1.0,F_id); // R_id = F_id
#endif
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// now do v-cycles to calculate the correction...
uint64_t _timeStartVCycle = CycleTime();
for(v=0;v<maxVCycles;v++){
domain->vcycles_performed++;
// down the v-cycle...
for(level=0;level<(domain->numLevels-1);level++){
uint64_t _LevelStart = CycleTime();
#ifdef __FUSION_RESIDUAL_RESTRICTION
smooth(domain,level,e_id,R_id,a,b);
residual_and_restriction(domain,level,e_id,R_id,level+1,R_id,a,b);
zero_grid(domain,level+1,e_id);
#else
smooth(domain,level,e_id,R_id,a,b);
residual(domain,level,__temp,e_id,R_id,a,b);
restriction(domain,level,R_id,__temp);
zero_grid(domain,level+1,e_id);
#endif
domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart);
} // down
// bottom solve...
uint64_t _timeBottomStart = CycleTime();
level = domain->numLevels-1;
IterativeSolver(domain,level,e_id,R_id,a,b,__DEFAULT_BOTTOM_NORM);
domain->cycles.Total[level] += (uint64_t)(CycleTime()-_timeBottomStart);
// back up the v-cycle...
for(level=(domain->numLevels-2);level>=0;level--){
uint64_t _LevelStart = CycleTime();
#ifdef __LINEAR_INTERPOLATION
interpolation_linear(domain,level,1.0,e_id,e_id);
#else
interpolation_constant(domain,level,1.0,e_id,e_id);
#endif
smooth(domain,level,e_id,R_id,a,b);
domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart);
} // up
// now calculate the norm of the residual...
#if defined(__PRINT_NORM) || defined(__TEST_MG_CONVERGENCE)
uint64_t _timeStart = CycleTime();
level = 0;
residual(domain,level,__temp,e_id,F_id,a,b);
#if 1
#warning Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria...
mul_grids(domain,level,__temp,1.0,__temp,__lambda); /// <<< precondition by D^{-1} ???
#endif
double norm_of_residual = norm(domain,level,__temp);
uint64_t _timeNorm = CycleTime();
domain->cycles.Total[0] += (uint64_t)(_timeNorm-_timeStart);
#if defined(__PRINT_NORM)
if(domain->rank==0){
if(v==0)printf("\n");
printf("v-cycle=%2d, norm=%22.20f (%12.6e)\n",v+1,norm_of_residual,norm_of_residual);
}
#endif
#if defined(__TEST_MG_CONVERGENCE)
if(norm_of_residual<desired_mg_norm)break;
#endif
#endif
} // maxVCycles
domain->cycles.vcycles += (uint64_t)(CycleTime()-_timeStartVCycle);
domain->cycles.MGSolve += (uint64_t)(CycleTime()-_timeStartMGSolve);
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if(domain->rank==0){printf("done\n");fflush(stdout);}
}