//------------------------------------------------------------------------------------------------------------------------------
// Samuel Williams
// SWWilliams@lbl.gov
// Lawrence Berkeley National Lab
//------------------------------------------------------------------------------------------------------------------------------
#include <stdint.h>
#include "timer.h"
#include "mg.h"
//------------------------------------------------------------------------------------------------------------------------------
inline void DoBufferCopy(domain_type *domain, int level, int grid_id, int buffer){
  // copy a dim_i x dim_j x dim_k block from (read_i,read_j,read_k) in read[] to (write_i,write_j,write_k) in write[]
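  // pencil and plane are the strides (in doubles) between successive j and k indices respectively,
  // so element (i,j,k) of a buffer lives at offset i + j*pencil + k*plane.
  // A box index >= 0 redirects the read/write pointer to that subdomain's grid; a negative box index
  // means the preassigned ptr (presumably an MPI pack/unpack buffer) is used instead.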
int dim_i = domain->bufferCopies[level][buffer].dim.i;
int dim_j = domain->bufferCopies[level][buffer].dim.j;
int dim_k = domain->bufferCopies[level][buffer].dim.k;
int read_i = domain->bufferCopies[level][buffer].read.i;
int read_j = domain->bufferCopies[level][buffer].read.j;
int read_k = domain->bufferCopies[level][buffer].read.k;
int read_pencil = domain->bufferCopies[level][buffer].read.pencil;
int read_plane = domain->bufferCopies[level][buffer].read.plane;
int write_i = domain->bufferCopies[level][buffer].write.i;
int write_j = domain->bufferCopies[level][buffer].write.j;
int write_k = domain->bufferCopies[level][buffer].write.k;
int write_pencil = domain->bufferCopies[level][buffer].write.pencil;
int write_plane = domain->bufferCopies[level][buffer].write.plane;
double * __restrict__ read = domain->bufferCopies[level][buffer].read.ptr;
double * __restrict__ write = domain->bufferCopies[level][buffer].write.ptr;
if(domain->bufferCopies[level][buffer].read.box >=0) read = domain->subdomains[ domain->bufferCopies[level][buffer].read.box].levels[level].grids[grid_id];
if(domain->bufferCopies[level][buffer].write.box>=0)write = domain->subdomains[domain->bufferCopies[level][buffer].write.box].levels[level].grids[grid_id];
  int i,j,k;
  if(dim_i==1){ // special case: dim_i==1, so skip the unit-length inner i loop entirely
for(k=0;k<dim_k;k++){
for(j=0;j<dim_j;j++){
int read_ijk = ( read_i) + (j+ read_j)* read_pencil + (k+ read_k)* read_plane;
int write_ijk = (write_i) + (j+write_j)*write_pencil + (k+write_k)*write_plane;
write[write_ijk] = read[read_ijk];
}}
  }else if(dim_i==4){ // special case: dim_i==4, so unroll the inner i loop by hand
for(k=0;k<dim_k;k++){
for(j=0;j<dim_j;j++){
int read_ijk = ( read_i) + (j+ read_j)* read_pencil + (k+ read_k)* read_plane;
int write_ijk = (write_i) + (j+write_j)*write_pencil + (k+write_k)*write_plane;
write[write_ijk+0] = read[read_ijk+0];
write[write_ijk+1] = read[read_ijk+1];
write[write_ijk+2] = read[read_ijk+2];
write[write_ijk+3] = read[read_ijk+3];
}}
}else{
for(k=0;k<dim_k;k++){
for(j=0;j<dim_j;j++){
for(i=0;i<dim_i;i++){
int read_ijk = (i+ read_i) + (j+ read_j)* read_pencil + (k+ read_k)* read_plane;
int write_ijk = (i+write_i) + (j+write_j)*write_pencil + (k+write_k)*write_plane;
write[write_ijk] = read[read_ijk];
}}}
}
}
//------------------------------------------------------------------------------------------------------------------------------
// Exchange boundaries by aggregating into domain buffers
//------------------------------------------------------------------------------------------------------------------------------
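// The exchange proceeds in phases: prepost the MPI receives, pack the MPI send buffers, post the
// sends, perform the purely intra-rank ghost zone copies (hopefully hidden behind the sends), wait
// for MPI to complete, and finally unpack the receive buffers into the local ghost zones.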
void exchange_boundary(domain_type *domain, int level, int grid_id, int exchange_faces, int exchange_edges, int exchange_corners){
uint64_t _timeCommunicationStart = CycleTime();
uint64_t _timeStart,_timeEnd;
int buffer=0;
int sendBox,recvBox,n;
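  // each of the 27 neighbor directions (di,dj,dk) in {-1,0,1}^3 is linearized as n = 13+di+3*dj+9*dk
  // (n==13 is the center, i.e. no neighbor); faces[], edges[], and corners[] mark which directions are
  // face, edge, and corner neighbors, and exchange[] accumulates the directions the caller requested.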
int faces[27] = {0,0,0,0,1,0,0,0,0, 0,1,0,1,0,1,0,1,0, 0,0,0,0,1,0,0,0,0};
int edges[27] = {0,1,0,1,0,1,0,1,0, 1,0,1,0,0,0,1,0,1, 0,1,0,1,0,1,0,1,0};
int corners[27] = {1,0,1,0,0,0,1,0,1, 0,0,0,0,0,0,0,0,0, 1,0,1,0,0,0,1,0,1};
int exchange[27] = {0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0};
for(n=0;n<27;n++){
if( exchange_faces )exchange[n] |= faces[n];
if( exchange_edges )exchange[n] |= edges[n];
if( exchange_corners )exchange[n] |= corners[n];
}
#ifdef __MPI
// there are up to 27 sends and up to 27 recvs
// packed lists of message info...
double * buffers_packed[54];
int sizes_packed[54];
int ranks_packed[54];
int tags_packed[54];
MPI_Request requests_packed[54];
MPI_Status status_packed[54];
int nMessages=0;
int sizes_all[27];
// precompute the possible sizes of each buffer (n.b. not all are necessarily used)
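  // a message in direction (di,dj,dk) spans the ghost depth in every displaced dimension and the full
  // rank-local extent (subdomains_per_rank * subdomain dim) in every undisplaced dimension.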
int di,dj,dk;
for(dk=-1;dk<=1;dk++){
for(dj=-1;dj<=1;dj++){
for(di=-1;di<=1;di++){
int n = 13+di+3*dj+9*dk;
sizes_all[n] = 1;
if(di==0)sizes_all[n]*=domain->subdomains_per_rank_in.i*domain->subdomains[0].levels[level].dim.i;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts;
if(dj==0)sizes_all[n]*=domain->subdomains_per_rank_in.j*domain->subdomains[0].levels[level].dim.j;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts;
if(dk==0)sizes_all[n]*=domain->subdomains_per_rank_in.k*domain->subdomains[0].levels[level].dim.k;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts;
}}}
// enumerate a packed list of messages... starting with receives...
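  // receives are enumerated over the reflected direction 26-n but tagged with n: the neighbor sitting
  // in our direction 26-n sends toward us in its own direction n and tags that send with n, so the
  // matching send/recv pairs carry the same tag.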
for(n=0;n<27;n++)if(exchange[26-n] && (domain->rank_of_neighbor[26-n] != domain->rank) ){
buffers_packed[nMessages] = domain->recv_buffer[26-n];
sizes_packed[nMessages] = sizes_all[26-n];
ranks_packed[nMessages] = domain->rank_of_neighbor[26-n];
tags_packed[nMessages] = n;
nMessages++;
}
// enumerate a packed list of messages... continuing with sends...
for(n=0;n<27;n++)if(exchange[n] && (domain->rank_of_neighbor[n] != domain->rank) ){
buffers_packed[nMessages] = domain->send_buffer[n];
sizes_packed[nMessages] = sizes_all[n];
ranks_packed[nMessages] = domain->rank_of_neighbor[n];
tags_packed[nMessages] = n;
nMessages++;
}
// loop through packed list of MPI receives and prepost Irecv's...
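  // the first nMessages/2 entries of the packed list are the receives enumerated above (the rest are
  // the sends); this relies on every exchanged direction having both a remote send and a remote receive.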
_timeStart = CycleTime();
#ifdef __MPI_THREAD_MULTIPLE
#pragma omp parallel for schedule(dynamic,1)
#endif
for(n=0;n<nMessages/2;n++){
MPI_Irecv(buffers_packed[n],sizes_packed[n],MPI_DOUBLE,ranks_packed[n],tags_packed[n],MPI_COMM_WORLD,&requests_packed[n]);
}
_timeEnd = CycleTime();
domain->cycles.recv[level] += (_timeEnd-_timeStart);
// pack MPI send buffers...
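  // bufferCopies[level][Pack_Start..Pack_End) is (presumably) the precomputed list of copies from the
  // subdomain grids into the MPI send buffers; the isFace/isEdge/isCorner flags restrict the work to
  // the ghost regions actually being exchanged.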
_timeStart = CycleTime();
#pragma omp parallel for schedule(static,1)
for(buffer=domain->bufferCopy_Pack_Start;buffer<domain->bufferCopy_Pack_End;buffer++){
if( (domain->bufferCopies[level][buffer].isFace && exchange_faces ) ||
(domain->bufferCopies[level][buffer].isEdge && exchange_edges ) ||
(domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){
DoBufferCopy(domain,level,grid_id,buffer);
}}
_timeEnd = CycleTime();
domain->cycles.pack[level] += (_timeEnd-_timeStart);
// loop through MPI send buffers and post Isend's...
_timeStart = CycleTime();
#ifdef __MPI_THREAD_MULTIPLE
#pragma omp parallel for schedule(dynamic,1)
#endif
for(n=nMessages/2;n<nMessages;n++){
MPI_Isend(buffers_packed[n],sizes_packed[n],MPI_DOUBLE,ranks_packed[n],tags_packed[n],MPI_COMM_WORLD,&requests_packed[n]);
}
_timeEnd = CycleTime();
domain->cycles.send[level] += (_timeEnd-_timeStart);
#endif
  // exchange locally... try to hide this work behind the outstanding Isends...
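  // bufferCopies[level][Local_Start..Local_End) covers (presumably) the direct box-to-box copies
  // between subdomains owned by this rank; no MPI buffers are involved.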
_timeStart = CycleTime();
  #pragma omp parallel for schedule(static,1)
for(buffer=domain->bufferCopy_Local_Start;buffer<domain->bufferCopy_Local_End;buffer++){
if( (domain->bufferCopies[level][buffer].isFace && exchange_faces ) ||
(domain->bufferCopies[level][buffer].isEdge && exchange_edges ) ||
(domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){
DoBufferCopy(domain,level,grid_id,buffer);
}}
_timeEnd = CycleTime();
domain->cycles.grid2grid[level] += (_timeEnd-_timeStart);
// wait for MPI to finish...
#ifdef __MPI
_timeStart = CycleTime();
MPI_Waitall(nMessages,requests_packed,status_packed);
_timeEnd = CycleTime();
domain->cycles.wait[level] += (_timeEnd-_timeStart);
// unpack MPI receive buffers
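  // bufferCopies[level][Unpack_Start..Unpack_End) copies (presumably) from the MPI receive buffers
  // back into the ghost zones of this rank's subdomain grids.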
_timeStart = CycleTime();
#pragma omp parallel for schedule(static,1)
for(buffer=domain->bufferCopy_Unpack_Start;buffer<domain->bufferCopy_Unpack_End;buffer++){
if( (domain->bufferCopies[level][buffer].isFace && exchange_faces ) ||
(domain->bufferCopies[level][buffer].isEdge && exchange_edges ) ||
(domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){
DoBufferCopy(domain,level,grid_id,buffer);
}}
_timeEnd = CycleTime();
domain->cycles.unpack[level] += (_timeEnd-_timeStart);
#endif
domain->cycles.communication[level] += (uint64_t)(CycleTime()-_timeCommunicationStart);
}
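//------------------------------------------------------------------------------------------------------------------------------
// Usage sketch (not part of this file; the grid id __phi and the call sites are assumptions about how
// a miniGMG-style solver drives this routine):
//   exchange_boundary(domain,level,__phi,1,0,0);  // refresh face ghost zones only, e.g. before a smoother sweep
//   exchange_boundary(domain,level,__phi,1,1,1);  // refresh faces, edges, and corners, e.g. before restriction/interpolation
//------------------------------------------------------------------------------------------------------------------------------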