blob: af3cb762d6204dbc0f7e96d2159836dbb30371cb [file] [log] [blame]
//------------------------------------------------------------------------------------------------------------------------------
// Samuel Williams
// SWWilliams@lbl.gov
// Lawrence Berkeley National Lab
//------------------------------------------------------------------------------------------------------------------------------
#include <stdint.h>
#include "timer.h"
#include "mg.h"
//------------------------------------------------------------------------------------------------------------------------------
void residual(domain_type * domain, int level, int res_id, int phi_id, int rhs_id, double a, double b){
// exchange the boundary for x in prep for Ax...
// for 7-point stencil, only needs to be a 1-deep ghost zone & faces only
exchange_boundary(domain,level,phi_id,1,0,0);
// now do residual/restriction proper...
uint64_t _timeStart = CycleTime();
int CollaborativeThreadingBoxSize = 100000; // i.e. never
#ifdef __COLLABORATIVE_THREADING
CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING;
#endif
int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize);
int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize);
int box;
#ifdef OMP
#pragma omp parallel for private(box) if(omp_across_boxes)
#endif
for(box=0;box<domain->subdomains_per_rank;box++){
int i,j,k;
int pencil = domain->subdomains[box].levels[level].pencil;
int plane = domain->subdomains[box].levels[level].plane;
int ghosts = domain->subdomains[box].levels[level].ghosts;
int dim_k = domain->subdomains[box].levels[level].dim.k;
int dim_j = domain->subdomains[box].levels[level].dim.j;
int dim_i = domain->subdomains[box].levels[level].dim.i;
double h2inv = 1.0/(domain->h[level]*domain->h[level]);
double * __restrict__ phi = domain->subdomains[box].levels[level].grids[ phi_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
double * __restrict__ rhs = domain->subdomains[box].levels[level].grids[ rhs_id] + ghosts*(1+pencil+plane);
double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane);
double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane);
double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane);
double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane);
double * __restrict__ res = domain->subdomains[box].levels[level].grids[ res_id] + ghosts*(1+pencil+plane);
#ifdef OMP
#pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
#endif
for(k=0;k<dim_k;k++){
for(j=0;j<dim_j;j++){
for(i=0;i<dim_i;i++){
int ijk = i + j*pencil + k*plane;
double helmholtz = a*alpha[ijk]*phi[ijk]
-b*h2inv*(
beta_i[ijk+1 ]*( phi[ijk+1 ]-phi[ijk ] )
-beta_i[ijk ]*( phi[ijk ]-phi[ijk-1 ] )
+beta_j[ijk+pencil]*( phi[ijk+pencil]-phi[ijk ] )
-beta_j[ijk ]*( phi[ijk ]-phi[ijk-pencil] )
+beta_k[ijk+plane ]*( phi[ijk+plane ]-phi[ijk ] )
-beta_k[ijk ]*( phi[ijk ]-phi[ijk-plane ] )
);
res[ijk] = rhs[ijk]-helmholtz;
}}}
}
domain->cycles.residual[level] += (uint64_t)(CycleTime()-_timeStart);
}
#if 1
//------------------------------------------------------------------------------------------------------------------------------
// This version maximizes parallelism by parallelizing over the resultant coarse grid.
// Thus,
// one parallelizes over the list of 2x2 fine-grid bars,
// initializes a coarse grid pencil to zero,
// additively restricts each pencil in the 2x2 fine-grid bar to the coarse grid pencil
//------------------------------------------------------------------------------------------------------------------------------
void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b){
// exchange the boundary for x in prep for Ax...
// for 7-point stencil, only needs to be a 1-deep ghost zone & faces only
exchange_boundary(domain,level_f,phi_id,1,0,0);
// now do residual/restriction proper...
uint64_t _timeStart = CycleTime();
int CollaborativeThreadingBoxSize = 100000; // i.e. never
#ifdef __COLLABORATIVE_THREADING
CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING;
#endif
int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize);
int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize);
int box;
#ifdef OMP
#pragma omp parallel for private(box) if(omp_across_boxes)
#endif
for(box=0;box<domain->subdomains_per_rank;box++){
int kk,jj;
int pencil_c = domain->subdomains[box].levels[level_c].pencil;
int plane_c = domain->subdomains[box].levels[level_c].plane;
int ghosts_c = domain->subdomains[box].levels[level_c].ghosts;
int dim_k_c = domain->subdomains[box].levels[level_c].dim.k;
int dim_j_c = domain->subdomains[box].levels[level_c].dim.j;
int dim_i_c = domain->subdomains[box].levels[level_c].dim.i;
int pencil_f = domain->subdomains[box].levels[level_f].pencil;
int plane_f = domain->subdomains[box].levels[level_f].plane;
int ghosts_f = domain->subdomains[box].levels[level_f].ghosts;
int dim_k_f = domain->subdomains[box].levels[level_f].dim.k;
int dim_j_f = domain->subdomains[box].levels[level_f].dim.j;
int dim_i_f = domain->subdomains[box].levels[level_f].dim.i;
double h2inv = 1.0/(domain->h[level_f]*domain->h[level_f]);
double * __restrict__ phi = domain->subdomains[box].levels[level_f].grids[ phi_id] + ghosts_f*(1+pencil_f+plane_f); // i.e. [0] = first non ghost zone point
double * __restrict__ rhs = domain->subdomains[box].levels[level_f].grids[ rhs_id] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ alpha = domain->subdomains[box].levels[level_f].grids[__alpha ] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ beta_i = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ beta_j = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ res = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c);
#ifdef OMP
#pragma omp parallel for private(kk,jj) if(omp_within_a_box) collapse(2)
#endif
for(kk=0;kk<dim_k_f;kk+=2){
for(jj=0;jj<dim_j_f;jj+=2){
int i,j,k;
for(i=0;i<dim_i_c;i++){
int ijk_c = (i) + (jj>>1)*pencil_c + (kk>>1)*plane_c;
res[ijk_c] = 0.0;
}
for(k=kk;k<kk+2;k++){
for(j=jj;j<jj+2;j++){
for(i=0;i<dim_i_f;i++){
int ijk_f = (i ) + (j )*pencil_f + (k )*plane_f;
int ijk_c = (i>>1) + (j>>1)*pencil_c + (k>>1)*plane_c;
double helmholtz = a*alpha[ijk_f]*phi[ijk_f]
-b*h2inv*(
beta_i[ijk_f+1 ]*( phi[ijk_f+1 ]-phi[ijk_f ] )
-beta_i[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-1 ] )
+beta_j[ijk_f+pencil_f]*( phi[ijk_f+pencil_f]-phi[ijk_f ] )
-beta_j[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-pencil_f] )
+beta_k[ijk_f+plane_f ]*( phi[ijk_f+plane_f ]-phi[ijk_f ] )
-beta_k[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-plane_f ] )
);
res[ijk_c] += (rhs[ijk_f]-helmholtz)*0.125;
}
}}}}
}
domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart);
}
#else
//------------------------------------------------------------------------------------------------------------------------------
// This version performs a 1D parallelization over the coarse-grid k-dimension (every two fine-grid planes)
// It first zeros the coarse grid plane, then increments with restrictions from the fine grid
//------------------------------------------------------------------------------------------------------------------------------
void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b){
// exchange the boundary for x in prep for Ax...
// for 7-point stencil, only needs to be a 1-deep ghost zone & faces only
exchange_boundary(domain,level_f,phi_id,1,0,0);
uint64_t _timeStart = CycleTime();
int CollaborativeThreadingBoxSize = 100000; // i.e. never
#ifdef __COLLABORATIVE_THREADING
CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING;
#endif
int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize);
int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize);
int box;
#ifdef OMP
#pragma omp parallel for private(box) if(omp_across_boxes)
#endif
for(box=0;box<domain->subdomains_per_rank;box++){
int pencil_c = domain->subdomains[box].levels[level_c].pencil;
int plane_c = domain->subdomains[box].levels[level_c].plane;
int ghosts_c = domain->subdomains[box].levels[level_c].ghosts;
int dim_k_c = domain->subdomains[box].levels[level_c].dim.k;
int dim_j_c = domain->subdomains[box].levels[level_c].dim.j;
int dim_i_c = domain->subdomains[box].levels[level_c].dim.i;
int pencil_f = domain->subdomains[box].levels[level_f].pencil;
int plane_f = domain->subdomains[box].levels[level_f].plane;
int ghosts_f = domain->subdomains[box].levels[level_f].ghosts;
int dim_k_f = domain->subdomains[box].levels[level_f].dim.k;
int dim_j_f = domain->subdomains[box].levels[level_f].dim.j;
int dim_i_f = domain->subdomains[box].levels[level_f].dim.i;
double h2inv = 1.0/(domain->h[level_f]*domain->h[level_f]);
double * __restrict__ phi = domain->subdomains[box].levels[level_f].grids[ phi_id] + ghosts_f*(1+pencil_f+plane_f); // i.e. [0] = first non ghost zone point
double * __restrict__ rhs = domain->subdomains[box].levels[level_f].grids[ rhs_id] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ alpha = domain->subdomains[box].levels[level_f].grids[__alpha ] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ beta_i = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ beta_j = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f);
double * __restrict__ res = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c);
int kk;
#ifdef OMP
#pragma omp parallel for private(kk) if(omp_within_a_box)
#endif
for(kk=0;kk<dim_k_f;kk+=2){
int i,j,k;
// zero out the next coarse grid plane
for(j=0;j<dim_j_c;j++){
for(i=0;i<dim_i_c;i++){
int ijk_c = (i) + (j)*pencil_c + (kk>>1)*plane_c;
res[ijk_c] = 0.0;
}}
// restrict two fine grid planes into one coarse grid plane
for(k=kk;k<kk+2;k++){
for(j=0;j<dim_j_f;j++){
for(i=0;i<dim_i_f;i++){
int ijk_f = (i ) + (j )*pencil_f + (k )*plane_f;
int ijk_c = (i>>1) + (j>>1)*pencil_c + (k>>1)*plane_c;
double helmholtz = a*alpha[ijk_f]*phi[ijk_f]
-b*h2inv*(
beta_i[ijk_f+1 ]*( phi[ijk_f+1 ]-phi[ijk_f ] )
-beta_i[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-1 ] )
+beta_j[ijk_f+pencil_f]*( phi[ijk_f+pencil_f]-phi[ijk_f ] )
-beta_j[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-pencil_f] )
+beta_k[ijk_f+plane_f ]*( phi[ijk_f+plane_f ]-phi[ijk_f ] )
-beta_k[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-plane_f ] )
);
res[ijk_c] += (rhs[ijk_f]-helmholtz)*0.125;
}}}
}
}
domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart);
}
#endif
//------------------------------------------------------------------------------------------------------------------------------