blob: f87dff19647772af47bc5d68f92c9a7b5fcdcf94 [file] [log] [blame]
/*
* Copyright (c) 2011-2012, Los Alamos National Security, LLC.
* All rights Reserved.
*
* Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
* under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
* Laboratory (LANL), which is operated by Los Alamos National Security, LLC
* for the U.S. Department of Energy. The U.S. Government has rights to use,
* reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
* ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
* ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
* to produce derivative works, such modified software should be clearly marked,
* so as not to confuse it with the version available from LANL.
*
* Additionally, redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the Los Alamos National Security, LLC, Los Alamos
* National Laboratory, LANL, the U.S. Government, nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
* NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
* SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* CLAMR -- LA-CC-11-094
* This research code is being developed as part of the
* 2011 X Division Summer Workshop for the express purpose
* of a collaborative code for development of ideas in
* the implementation of AMR codes for Exascale platforms
*
* AMR implementation of the Wave code previously developed
* as a demonstration code for regular grids on Exascale platforms
* as part of the Supercomputing Challenge and Los Alamos
* National Laboratory
*
* Authors: Bob Robey XCP-2 brobey@lanl.gov
* Neal Davis davis68@lanl.gov, davis68@illinois.edu
* David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com
* Dennis Trujillo dptrujillo@lanl.gov, dptru10@gmail.com
*
*/
#ifdef HAVE_MPI
#include "mpi.h"
#endif
#include <algorithm>
#include <unistd.h>
#include <limits.h>
#include <time.h>
#ifdef _OPENMP
#include <omp.h>
#endif
//#include "hsfc.h"
#include "KDTree.h"
#include "mesh.h"
#ifdef HAVE_OPENCL
#include "ezcl/ezcl.h"
#endif
#include "timer.h"
#ifdef HAVE_MPI
#include "l7/l7.h"
#endif
#include "reduce.h"
#include "genmalloc.h"
#include "hash.h"
// ---------------------------------------------------------------------------
// File-scope configuration macros, helper typedefs and global switches.
// ---------------------------------------------------------------------------
// Debug switch for this file; it is set to 0 here.
#define DEBUG 0
//#define BOUNDS_CHECK 1
// NOTE(review): DEBUG is defined unconditionally a few lines above, so this
// fallback can never take effect in this translation unit.
#ifndef DEBUG
#define DEBUG 0
#endif
// Not referenced in the part of the file shown here -- presumably consumed by
// debug code further down; verify before removing.
#define DEBUG_RESTORE_VALS 1
// Element type and prototype for scan(); the definition is not in this part
// of the file.
typedef int scanInt;
void scan ( scanInt *input , scanInt *output , scanInt length);
// REZONE_NO_OPTIMIZATION is defined (as 1) only when building without OpenMP.
#ifdef _OPENMP
#undef REZONE_NO_OPTIMIZATION
#else
#define REZONE_NO_OPTIMIZATION 1
#endif
#define TIMING_LEVEL 2
// Function-like macro: each argument may be evaluated twice, so avoid
// arguments with side effects.
#define MIN(a,b) ((a) < (b) ? (a) : (b))
// NOTE(review): this expands to 2 << a, i.e. 2^(a+1), not 2^a as the name
// might suggest -- confirm that every use expects that value.
#define IPOW2(a) (2 << (a))
// Tolerances selected by the precision build flag. If none of the three
// precision macros is defined, neither tolerance is defined here.
#if defined(MINIMUM_PRECISION)
#define CONSERVATION_EPS .1
#define STATE_EPS 15.0
#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
#define CONSERVATION_EPS .02
#define STATE_EPS .025
#elif defined(FULL_PRECISION)
#define CONSERVATION_EPS .02
#define STATE_EPS .025
#endif
typedef unsigned int uint;
// ulong is only typedef'd here for Apple compilers; other platforms
// presumably get it from a system header.
#ifdef __APPLE_CC__
typedef unsigned long ulong;
#endif
#define TWO 2
#define HALF 0.5
// Stencil variant selection; the old stencil is commented out.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation.
#define __NEW_STENCIL__
//#define __OLD_STENCIL__
//#define STENCIL_WARNING 1
// do_stencil_warning mirrors the STENCIL_WARNING build flag (currently
// commented out above, so it is 0 unless defined externally).
#ifdef STENCIL_WARNING
int do_stencil_warning=1;
#else
int do_stencil_warning=0;
#endif
// OpenCL builds pull in mesh_kernel.inc (presumably the kernel source text).
#ifdef HAVE_OPENCL
#include "mesh_kernel.inc"
#endif
// localStencil is defined in another translation unit.
extern bool localStencil;
// Global run-time switches defined here; they are not assigned in the part
// of the file shown.
int calc_neighbor_type;
bool dynamic_load_balance_on;
bool neighbor_remap;
#ifdef _OPENMP
static bool iversion_flag = false;
#endif
// Human-readable names for the mesh timers. The array is sized by
// MESH_TIMER_SIZE (declared outside this file section) and 24 names are
// listed here.
// NOTE(review): presumably indexed by the mesh timer enum, in which case the
// order below must match that enum -- verify against the header.
static const char *mesh_timer_descriptor[MESH_TIMER_SIZE] = {
"mesh_timer_count_BCs",
"mesh_timer_calc_neighbors",
"mesh_timer_hash_setup",
"mesh_timer_hash_query",
"mesh_timer_find_boundary",
"mesh_timer_push_setup",
"mesh_timer_push_boundary",
"mesh_timer_local_list",
"mesh_timer_layer1",
"mesh_timer_layer2",
"mesh_timer_layer_list",
"mesh_timer_copy_mesh_data",
"mesh_timer_fill_mesh_ghost",
"mesh_timer_fill_neigh_ghost",
"mesh_timer_set_corner_neigh",
"mesh_timer_neigh_adjust",
"mesh_timer_setup_comm",
"mesh_timer_kdtree_setup",
"mesh_timer_kdtree_query",
"mesh_timer_refine_smooth",
"mesh_timer_rezone_all",
"mesh_timer_partition",
"mesh_timer_calc_spatial_coordinates",
"mesh_timer_load_balance"
};
// File-scope OpenCL kernel handles, declared only in OpenCL builds. They are
// not initialized in the part of the file shown here.
#ifdef HAVE_OPENCL
// Hash setup and neighbor calculation
cl_kernel kernel_hash_adjust_sizes;
cl_kernel kernel_hash_setup;
cl_kernel kernel_hash_setup_local;
cl_kernel kernel_neighbor_init;
cl_kernel kernel_calc_neighbors;
cl_kernel kernel_calc_neighbors_local;
// Border cells, ghost layers and ghost fill
cl_kernel kernel_calc_border_cells;
cl_kernel kernel_calc_border_cells2;
cl_kernel kernel_finish_scan;
cl_kernel kernel_get_border_data;
cl_kernel kernel_calc_layer1;
cl_kernel kernel_calc_layer1_sethash;
cl_kernel kernel_calc_layer2;
cl_kernel kernel_get_border_data2;
cl_kernel kernel_calc_layer2_sethash;
cl_kernel kernel_copy_mesh_data;
cl_kernel kernel_fill_mesh_ghost;
cl_kernel kernel_fill_neighbor_ghost;
cl_kernel kernel_set_corner_neighbor;
cl_kernel kernel_adjust_neighbors_local;
// Reductions, hash sizing and coordinates
cl_kernel kernel_reduction_scan2;
cl_kernel kernel_reduction_count;
cl_kernel kernel_reduction_count2;
cl_kernel kernel_hash_size;
cl_kernel kernel_finish_hash_size;
cl_kernel kernel_calc_spatial_coordinates;
cl_kernel kernel_count_BCs;
// Load balance
cl_kernel kernel_do_load_balance_lower;
cl_kernel kernel_do_load_balance_middle;
cl_kernel kernel_do_load_balance_upper;
// The double-precision variant is omitted in MINIMUM_PRECISION builds.
#ifndef MINIMUM_PRECISION
cl_kernel kernel_do_load_balance_double;
#endif
cl_kernel kernel_do_load_balance_float;
// Refinement smoothing and rezone
cl_kernel kernel_refine_smooth;
cl_kernel kernel_coarsen_smooth;
cl_kernel kernel_coarsen_check_block;
cl_kernel kernel_rezone_all;
cl_kernel kernel_rezone_neighbors;
// The double-precision variant is omitted in MINIMUM_PRECISION builds.
#ifndef MINIMUM_PRECISION
cl_kernel kernel_rezone_one_double;
#endif
cl_kernel kernel_rezone_one_float;
cl_kernel kernel_copy_mpot_ghost_data;
cl_kernel kernel_set_boundary_refinement;
#endif
// Hash settings defined in another translation unit (presumably the hash
// library behind "hash.h" -- verify).
extern size_t hash_header_size;
extern int choose_hash_method;
/*
 * Writes the mesh to a plot file named "gridNN.gph" in the current directory.
 *
 * The file holds a viewport line, one "rect" line per cell, a polyline
 * ("line_init" followed by "line" entries) through the cell centers in
 * storage order, and a "text" line labelling each cell with its index.
 *
 * ncycle -- cycle number used in the file name; negative values are
 *           treated as 0.
 *
 * If the file cannot be opened an error is printed and nothing is written.
 */
void Mesh::write_grid(int ncycle)
{
   char filename[20];

   if (ncycle < 0) ncycle = 0;
   // Bounded formatting: "grid" + at most 10 digits + ".gph" + NUL fits in 20 bytes.
   snprintf(filename, sizeof(filename), "grid%02d.gph", ncycle);

   FILE *fp = fopen(filename, "w");
   if (fp == NULL) {
      printf("Error -- could not open %s for writing\n", filename);
      return;
   }

   fprintf(fp,"viewport %lf %lf %lf %lf\n",xmin,ymin,xmax,ymax);

   // Cell outlines
   for (uint ic = 0; ic < ncells; ic++) {
      fprintf(fp,"rect %lf %lf %lf %lf\n",x[ic],y[ic],x[ic]+dx[ic],y[ic]+dy[ic]);
   }

   // Polyline through the cell centers; skipped for an empty mesh so that
   // cell 0 is never read when it does not exist.
   if (ncells > 0) {
      fprintf(fp,"line_init %lf %lf\n",x[0]+0.5*dx[0],y[0]+0.5*dy[0]);
      for (uint ic = 1; ic < ncells; ic++){
         fprintf(fp,"line %lf %lf\n",x[ic]+0.5*dx[ic],y[ic]+0.5*dy[ic]);
      }
   }

   // Cell index labels placed at the cell centers
   for (uint ic = 0; ic < ncells; ic++){
      fprintf(fp,"text %lf %lf %u\n",x[ic]+0.5*dx[ic],y[ic]+0.5*dy[ic],ic);
   }

   fclose(fp);
}
/*
 * Builds a mesh from a text description read from fin.
 *
 * Expected layout, one item per line:
 *    levmax <int>
 *    cells  <count>
 *    numpe  <int>            (stored through the numpe argument)
 *    ndim   <int>
 *    xaxis  <min> <delta>
 *    yaxis  <min> <delta>
 *    zaxis  <min> <delta>    (only when ndim == THREE_DIMENSIONAL)
 *    <one line that is read and ignored>
 *    <index> <i> <j> <level>   -- one line per cell
 *
 * The process exits with -1 if a header line is missing or the cell count
 * cannot be parsed. Cell rows beyond the declared count are counted but not
 * stored, and a count mismatch is reported after the mesh has been printed.
 */
Mesh::Mesh(FILE *fin, int *numpe)
{
   char string[80];
   // Axis values are parsed as double and then assigned, so the parse is
   // correct whatever floating-point type the members have in this build.
   double axis_min, axis_delta;
   int nread;

   ibase = 1;

   time_t trand;
   time(&trand);
   srand48((long)trand);

   if(fgets(string, 80, fin) == NULL) exit(-1);
   sscanf(string,"levmax %d",&levmx);

   if(fgets(string, 80, fin) == NULL) exit(-1);
   // ncells sizes every allocation below, so refuse a missing or negative count.
   long ncells_in = 0;
   if (sscanf(string,"cells %ld",&ncells_in) != 1 || ncells_in < 0) {
      printf("Error -- could not read a valid cell count\n");
      exit(-1);
   }
   ncells = (size_t)ncells_in;

   if(fgets(string, 80, fin) == NULL) exit(-1);
   sscanf(string,"numpe %d",numpe);

   if(fgets(string, 80, fin) == NULL) exit(-1);
   sscanf(string,"ndim %d",&ndim);

   if(fgets(string, 80, fin) == NULL) exit(-1);
   nread = sscanf(string,"xaxis %lf %lf",&axis_min,&axis_delta);
   if (nread >= 1) xmin   = axis_min;
   if (nread >= 2) deltax = axis_delta;

   if(fgets(string, 80, fin) == NULL) exit(-1);
   nread = sscanf(string,"yaxis %lf %lf",&axis_min,&axis_delta);
   if (nread >= 1) ymin   = axis_min;
   if (nread >= 2) deltay = axis_delta;

   if (ndim == THREE_DIMENSIONAL){
      if(fgets(string, 80, fin) == NULL) exit(-1);
      nread = sscanf(string,"zaxis %lf %lf",&axis_min,&axis_delta);
      if (nread >= 1) zmin   = axis_min;
      if (nread >= 2) deltaz = axis_delta;
   }

   // One more line is consumed before the per-cell rows start.
   if(fgets(string, 80, fin) == NULL) exit(-1);

   index.resize(ncells);
   allocate(ncells);

   // Count every remaining line but store only those that fit in the arrays.
   uint ic=0;
   while(fgets(string, 80, fin)!=NULL){
      if (ic < ncells) {
         sscanf(string, "%d %d %d %d", &(index[ic]), &(i[ic]), &(j[ic]), &(level[ic]));
      }
      ic++;
   }

   ibase = 0;
   calc_spatial_coordinates(ibase);
   KDTree_Initialize(&tree);

   print();

   if (ic != ncells) {
      printf("Error -- cells read does not match number specified\n");
   }
   return;
}
void Mesh::print(void)
{
assert(&nlft[0] != NULL);
assert(&x[0] != NULL);
assert(&index[0] != NULL);
//printf("size is %lu %lu %lu %lu %lu\n",index.size(), i.size(), level.size(), nlft.size(), x.size());
printf("index orig index i j lev nlft nrht nbot ntop xlow xhigh ylow yhigh\n");
for (uint ic=0; ic<ncells; ic++)
{ printf("%6d %6d %4d %4d %4d %4d %4d %4d %4d ", ic, index[ic], i[ic], j[ic], level[ic], nlft[ic], nrht[ic], nbot[ic], ntop[ic]);
printf("%8.2lf %8.2lf %8.2lf %8.2lf\n", x[ic], x[ic]+dx[ic], y[ic], y[ic]+dy[ic]); }
}
void Mesh::print_local()
{ //printf("size is %lu %lu %lu %lu %lu\n",index.size(), i.size(), level.size(), nlft.size(), x.size());
if (mesh_memory.get_memory_size(nlft) >= ncells_ghost){
fprintf(fp,"%d: index global i j lev nlft nrht nbot ntop \n",mype);
for (uint ic=0; ic<ncells; ic++) {
fprintf(fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mype,ic, ic+noffset,i[ic], j[ic], level[ic], nlft[ic], nrht[ic], nbot[ic], ntop[ic]);
}
for (uint ic=ncells; ic<ncells_ghost; ic++) {
fprintf(fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mype,ic, ic+noffset,i[ic], j[ic], level[ic], nlft[ic], nrht[ic], nbot[ic], ntop[ic]);
}
} else {
fprintf(fp,"%d: index i j lev\n",mype);
for (uint ic=0; ic<ncells_ghost; ic++) {
fprintf(fp,"%d: %6d %4d %4d %4d \n", mype,ic, i[ic], j[ic], level[ic]);
}
}
}
#ifdef HAVE_OPENCL
void Mesh::print_dev_local(void)
{
cl_command_queue command_queue = ezcl_get_command_queue();
vector<int>i_tmp(ncells_ghost);
vector<int>j_tmp(ncells_ghost);
vector<int>level_tmp(ncells_ghost);
vector<int>nlft_tmp(ncells_ghost);
vector<int>nrht_tmp(ncells_ghost);
vector<int>nbot_tmp(ncells_ghost);
vector<int>ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
//fprintf(fp,"\n%d: Printing mesh for dev_local\n\n",mype);
fprintf(fp,"%d: index global i j lev nlft nrht nbot ntop \n",mype);
for (uint ic=0; ic<MAX(ncells_ghost,ncells); ic++) {
fprintf(fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mype,ic, ic+noffset,i_tmp[ic], j_tmp[ic], level_tmp[ic], nlft_tmp[ic], nrht_tmp[ic], nbot_tmp[ic], ntop_tmp[ic]);
}
//fprintf(fp,"\n%d: Finished printing mesh for dev_local\n\n",mype);
}
void Mesh::compare_dev_local_to_local(void)
{
cl_command_queue command_queue = ezcl_get_command_queue();
vector<int>i_tmp(ncells_ghost);
vector<int>j_tmp(ncells_ghost);
vector<int>level_tmp(ncells_ghost);
vector<int>nlft_tmp(ncells_ghost);
vector<int>nrht_tmp(ncells_ghost);
vector<int>nbot_tmp(ncells_ghost);
vector<int>ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
fprintf(fp,"\n%d: Comparing mesh for dev_local to local\n\n",mype);
//fprintf(fp,"%d: index global i j lev nlft nrht nbot ntop \n",mype);
for (uint ic=0; ic<ncells_ghost; ic++) {
if (i_tmp[ic] != i[ic] ) fprintf(fp,"%d: Error: cell %d dev_i %d i %d\n",mype,ic,i_tmp[ic], i[ic]);
if (j_tmp[ic] != j[ic] ) fprintf(fp,"%d: Error: cell %d dev_j %d j %d\n",mype,ic,j_tmp[ic], j[ic]);
if (level_tmp[ic] != level[ic]) fprintf(fp,"%d: Error: cell %d dev_level %d level %d\n",mype,ic,level_tmp[ic],level[ic]);
//fprintf(fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mype,ic, ic+noffset,i_tmp[ic], j_tmp[ic], level_tmp[ic], nlft_tmp[ic], nrht_tmp[ic], nbot_tmp[ic], ntop_tmp[ic]);
}
fprintf(fp,"\n%d: Finished comparing mesh for dev_local to local\n\n",mype);
}
void Mesh::compare_neighbors_gpu_global_to_cpu_global()
{
cl_command_queue command_queue = ezcl_get_command_queue();
vector<int>nlft_check(ncells);
vector<int>nrht_check(ncells);
vector<int>nbot_check(ncells);
vector<int>ntop_check(ncells);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells*sizeof(cl_int), &nlft_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells*sizeof(cl_int), &nrht_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells*sizeof(cl_int), &nbot_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells*sizeof(cl_int), &ntop_check[0], NULL);
//printf("\n%d: Comparing neighbors for gpu_global to cpu_global\n\n",mype);
for (uint ic=0; ic<ncells; ic++) {
if (nlft[ic] != nlft_check[ic]) printf("DEBUG -- nlft: ic %d nlft %d nlft_check %d\n",ic, nlft[ic], nlft_check[ic]);
if (nrht[ic] != nrht_check[ic]) printf("DEBUG -- nrht: ic %d nrht %d nrht_check %d\n",ic, nrht[ic], nrht_check[ic]);
if (nbot[ic] != nbot_check[ic]) printf("DEBUG -- nbot: ic %d nbot %d nbot_check %d\n",ic, nbot[ic], nbot_check[ic]);
if (ntop[ic] != ntop_check[ic]) printf("DEBUG -- ntop: ic %d ntop %d ntop_check %d\n",ic, ntop[ic], ntop_check[ic]);
}
//printf("\n%d: Finished comparing mesh for dev_local to local\n\n",mype);
}
#endif
/*
 * Debug check that this rank's neighbor arrays agree with those of a global
 * mesh (MPI builds only; without HAVE_MPI the body does nothing).
 *
 * Method: each rank tags its cells [0, ncells) with mype*1000 + ic. When
 * numpe > 1, L7_Update is applied to the tag array (presumably filling the
 * ghost entries -- confirm against the L7 documentation). The tags are then
 * gathered from all ranks into Test_global. For each of the four directions
 * the tag reached through one neighbor hop and through two hops on the local
 * mesh is gathered and compared with the tag reached through the same hops
 * on the global mesh.
 *
 * One-hop mismatches are printed by rank 0 only; two-hop mismatches are
 * printed by every rank.
 *
 * ncells_ghost  -- size of the local tag array. This parameter hides the
 *                  member of the same name inside this function.
 * ncells_global -- number of cells in mesh_global
 * mesh_global   -- mesh whose nlft/nrht/nbot/ntop are the reference values
 * nsizes        -- per-rank counts passed to MPI_Allgatherv
 * ndispl        -- per-rank displacements passed to MPI_Allgatherv
 */
void Mesh::compare_neighbors_cpu_local_to_cpu_global(uint ncells_ghost, uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl)
{
#ifdef HAVE_MPI
int *nlft_global = mesh_global->nlft;
int *nrht_global = mesh_global->nrht;
int *nbot_global = mesh_global->nbot;
int *ntop_global = mesh_global->ntop;
// Tag every cell in [0, ncells) with a value built from the rank and the local index.
vector<int> Test(ncells_ghost);
for(uint ic=0; ic<ncells; ic++){
Test[ic] = mype*1000 +ic;
}
if (numpe > 1) L7_Update(&Test[0], L7_INT, cell_handle);
// Gather the tags from all ranks, laid out by nsizes/ndispl.
vector<int> Test_global(ncells_global);
MPI_Allgatherv(&Test[0], nsizes[mype], MPI_INT, &Test_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
// Scratch arrays reused by every check below.
vector<int> Test_check(ncells);
vector<int> Test_check_global(ncells_global);
// ==================== check left value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[nlft[ic]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[nlft_global[ic]] != Test_check_global[ic]) {
if (mype == 0) printf("%d: Error with nlft for cell %d -- nlft %d global %d check %d\n",mype,ic,nlft_global[ic],Test_global[nlft_global[ic]],Test_check_global[ic]);
}
}
// ==================== check left left value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[nlft[nlft[ic]]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[nlft_global[nlft_global[ic]]] != Test_check_global[ic]) {
// NOTE(review): the second printf indexes the local nlft with a global
// cell index -- confirm this stays within the local array.
printf("%d: Error with nlft nlft for cell %5d -- nlftg %5d nlftg nlftg %5d global %5d\n",
mype,ic,nlft_global[ic],nlft_global[nlft_global[ic]],Test_global[nlft_global[nlft_global[ic]]]);
printf("%d: check %5d -- nlftl %5d nlftl nlftl %5d check %5d\n",
mype,ic,nlft[ic],nlft[nlft[ic]],Test_check_global[ic]);
}
}
// ==================== check right value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[nrht[ic]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[nrht_global[ic]] != Test_check_global[ic]) {
if (mype == 0) printf("%d: Error with nrht for cell %d -- %d %d\n",mype,ic,Test_global[nrht_global[ic]],Test_check_global[ic]);
}
}
// ==================== check right right value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[nrht[nrht[ic]]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[nrht_global[nrht_global[ic]]] != Test_check_global[ic]) {
printf("%d: Error with nrht nrht for cell %5d -- nrhtg %5d nrhtg nrhtg %5d global %5d\n",
mype,ic,nrht_global[ic],nrht_global[nrht_global[ic]],Test_global[nrht_global[nrht_global[ic]]]);
printf("%d: check %5d -- nrhtl %5d nrhtl nrhtl %5d check %5d\n",
mype,ic,nrht[ic],nrht[nrht[ic]],Test_check_global[ic]);
}
}
// ==================== check bottom value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[nbot[ic]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[nbot_global[ic]] != Test_check_global[ic]) {
if (mype == 0) printf("%d: Error with nbot for cell %d -- %d %d\n",mype,ic,Test_global[nbot_global[ic]],Test_check_global[ic]);
}
}
// ==================== check bottom bottom value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[nbot[nbot[ic]]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[nbot_global[nbot_global[ic]]] != Test_check_global[ic]) {
printf("%d: Error with nbot nbot for cell %5d -- nbotg %5d nbotg nbotg %5d global %5d\n",
mype,ic,nbot_global[ic],nbot_global[nbot_global[ic]],Test_global[nbot_global[nbot_global[ic]]]);
printf("%d: check %5d -- nbotl %5d nbotl nbotl %5d check %5d\n",
mype,ic,nbot[ic],nbot[nbot[ic]],Test_check_global[ic]);
}
}
// ==================== check top value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[ntop[ic]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[ntop_global[ic]] != Test_check_global[ic]) {
if (mype == 0) printf("%d: Error with ntop for cell %d -- %d %d\n",mype,ic,Test_global[ntop_global[ic]],Test_check_global[ic]);
}
}
// ==================== check top top value ====================
for (uint ic=0; ic<ncells; ic++){
Test_check[ic] = Test[ntop[ntop[ic]]];
}
MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic=0; ic<ncells_global; ic++){
if (Test_global[ntop_global[ntop_global[ic]]] != Test_check_global[ic]) {
printf("%d: Error with ntop ntop for cell %5d -- ntopg %5d ntopg ntopg %5d global %5d\n",
mype,ic,ntop_global[ic],ntop_global[ntop_global[ic]],Test_global[ntop_global[ntop_global[ic]]]);
printf("%d: check %5d -- ntopl %5d ntopl ntopl %5d check %5d\n",
mype,ic,ntop[ic],ntop[ntop[ic]],Test_check_global[ic]);
}
}
#else
// Just to get rid of compiler warnings
// The condition is always false, so this printf never runs; it only
// references the parameters.
if (1 == 2) printf("DEBUG -- ncells_global %d ncells_ghost %d mesh_global %p nsizes[0] %d ndispl[0] %d\n",
ncells_global,ncells_ghost,mesh_global,nsizes[0],ndispl[0]);
#endif
}
#ifdef HAVE_OPENCL
/*
 * Debug check of the neighbor arrays in an OpenCL + MPI build. Without
 * HAVE_MPI the function does nothing.
 *
 * Part 1 (CPU local vs. CPU global): each rank tags its cells [0, ncells)
 * with mype*1000 + ic, applies L7_Update to the tag array when numpe > 1 and
 * gathers the tags from all ranks. For every direction the tag reached
 * through one and two neighbor hops on the local mesh is gathered and
 * compared with the tag reached through the same hops on mesh_global. The
 * one-hop comparison for the left direction is disabled, as it was in the
 * original code. One-hop mismatches are printed by rank 0 only, two-hop
 * mismatches by every rank.
 *
 * Part 2 (GPU vs. CPU local): the device neighbor arrays are read back for
 * [0, ncells_ghost) and compared entry by entry with the host arrays.
 *
 * Each check runs exactly once, so every mismatch is reported once.
 *
 * mesh_global -- mesh providing the reference neighbor arrays and cell count
 * nsizes      -- per-rank counts passed to MPI_Allgatherv
 * ndispl      -- per-rank displacements passed to MPI_Allgatherv
 */
void Mesh::compare_neighbors_all_to_gpu_local(Mesh *mesh_global, int *nsizes, int *ndispl)
{
#ifdef HAVE_MPI
   cl_command_queue command_queue = ezcl_get_command_queue();

   size_t &ncells_global = mesh_global->ncells;
   int *nlft_global = mesh_global->nlft;
   int *nrht_global = mesh_global->nrht;
   int *nbot_global = mesh_global->nbot;
   int *ntop_global = mesh_global->ntop;

   // ---------------- Checking CPU parallel to CPU global ----------------
   vector<int> Test(ncells_ghost);
   for(uint ic=0; ic<ncells; ic++){
      Test[ic] = mype*1000 +ic;
   }
   if (numpe > 1) L7_Update(&Test[0], L7_INT, cell_handle);

   vector<int> Test_global(ncells_global);
   MPI_Allgatherv(&Test[0], nsizes[mype], MPI_INT, &Test_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);

   // Scratch arrays reused by every check below.
   vector<int> Test_check(ncells);
   vector<int> Test_check_global(ncells_global);

   // ==================== check left value ====================
   // The comparison for this direction is disabled; the gather is kept so
   // the sequence of collective calls is unchanged.
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[nlft[ic]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);

   // ==================== check left left value ====================
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[nlft[nlft[ic]]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
   for (uint ic=0; ic<ncells_global; ic++){
      if (Test_global[nlft_global[nlft_global[ic]]] != Test_check_global[ic]) {
         printf("%d: Error with nlft nlft for cell %5d -- nlftg %5d nlftg nlftg %5d global %5d\n",
            mype,ic,nlft_global[ic],nlft_global[nlft_global[ic]],Test_global[nlft_global[nlft_global[ic]]]);
         printf("%d: check %5d -- nlftl %5d nlftl nlftl %5d check %5d\n",
            mype,ic,nlft[ic],nlft[nlft[ic]],Test_check_global[ic]);
      }
   }

   // ==================== check right value ====================
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[nrht[ic]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
   for (uint ic=0; ic<ncells_global; ic++){
      if (Test_global[nrht_global[ic]] != Test_check_global[ic]) {
         if (mype == 0) printf("%d: Error with nrht for cell %d -- %d %d\n",mype,ic,Test_global[nrht_global[ic]],Test_check_global[ic]);
      }
   }

   // ==================== check right right value ====================
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[nrht[nrht[ic]]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
   for (uint ic=0; ic<ncells_global; ic++){
      if (Test_global[nrht_global[nrht_global[ic]]] != Test_check_global[ic]) {
         printf("%d: Error with nrht nrht for cell %5d -- nrhtg %5d nrhtg nrhtg %5d global %5d\n",
            mype,ic,nrht_global[ic],nrht_global[nrht_global[ic]],Test_global[nrht_global[nrht_global[ic]]]);
         printf("%d: check %5d -- nrhtl %5d nrhtl nrhtl %5d check %5d\n",
            mype,ic,nrht[ic],nrht[nrht[ic]],Test_check_global[ic]);
      }
   }

   // ==================== check bottom value ====================
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[nbot[ic]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
   for (uint ic=0; ic<ncells_global; ic++){
      if (Test_global[nbot_global[ic]] != Test_check_global[ic]) {
         if (mype == 0) printf("%d: Error with nbot for cell %d -- %d %d\n",mype,ic,Test_global[nbot_global[ic]],Test_check_global[ic]);
      }
   }

   // ==================== check bottom bottom value ====================
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[nbot[nbot[ic]]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
   for (uint ic=0; ic<ncells_global; ic++){
      if (Test_global[nbot_global[nbot_global[ic]]] != Test_check_global[ic]) {
         printf("%d: Error with nbot nbot for cell %5d -- nbotg %5d nbotg nbotg %5d global %5d\n",
            mype,ic,nbot_global[ic],nbot_global[nbot_global[ic]],Test_global[nbot_global[nbot_global[ic]]]);
         printf("%d: check %5d -- nbotl %5d nbotl nbotl %5d check %5d\n",
            mype,ic,nbot[ic],nbot[nbot[ic]],Test_check_global[ic]);
      }
   }

   // ==================== check top value ====================
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[ntop[ic]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
   for (uint ic=0; ic<ncells_global; ic++){
      if (Test_global[ntop_global[ic]] != Test_check_global[ic]) {
         if (mype == 0) printf("%d: Error with ntop for cell %d -- %d %d\n",mype,ic,Test_global[ntop_global[ic]],Test_check_global[ic]);
      }
   }

   // ==================== check top top value ====================
   for (uint ic=0; ic<ncells; ic++){
      Test_check[ic] = Test[ntop[ntop[ic]]];
   }
   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
   for (uint ic=0; ic<ncells_global; ic++){
      if (Test_global[ntop_global[ntop_global[ic]]] != Test_check_global[ic]) {
         printf("%d: Error with ntop ntop for cell %5d -- ntopg %5d ntopg ntopg %5d global %5d\n",
            mype,ic,ntop_global[ic],ntop_global[ntop_global[ic]],Test_global[ntop_global[ntop_global[ic]]]);
         printf("%d: check %5d -- ntopl %5d ntopl ntopl %5d check %5d\n",
            mype,ic,ntop[ic],ntop[ntop[ic]],Test_check_global[ic]);
      }
   }

   // ---------------- Checking GPU results against CPU local ----------------
   vector<int> nlft_check(ncells_ghost);
   vector<int> nrht_check(ncells_ghost);
   vector<int> nbot_check(ncells_ghost);
   vector<int> ntop_check(ncells_ghost);

   // Non-blocking reads followed by one blocking read before the data is used.
   ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_check[0], NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_check[0], NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_check[0], NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE,  0, ncells_ghost*sizeof(cl_int), &ntop_check[0], NULL);

   for (uint ic=0; ic<ncells_ghost; ic++){
      if (nlft[ic] != nlft_check[ic]) printf("%d: Error with gpu calculated nlft for cell %d nlft %d check %d\n",mype,ic,nlft[ic],nlft_check[ic]);
      if (nrht[ic] != nrht_check[ic]) printf("%d: Error with gpu calculated nrht for cell %d nrht %d check %d\n",mype,ic,nrht[ic],nrht_check[ic]);
      if (nbot[ic] != nbot_check[ic]) printf("%d: Error with gpu calculated nbot for cell %d nbot %d check %d\n",mype,ic,nbot[ic],nbot_check[ic]);
      if (ntop[ic] != ntop_check[ic]) printf("%d: Error with gpu calculated ntop for cell %d ntop %d check %d\n",mype,ic,ntop[ic],ntop_check[ic]);
   }
#else
   // Nothing to check without MPI; reference the parameters so the
   // compiler does not warn about them being unused.
   (void)mesh_global;
   (void)nsizes;
   (void)ndispl;
#endif
}
void Mesh::compare_indices_gpu_global_to_cpu_global(void)
{
cl_command_queue command_queue = ezcl_get_command_queue();
vector<int> i_check(ncells);
vector<int> j_check(ncells);
vector<int> level_check(ncells);
vector<int> celltype_check(ncells);
/// Set read buffers for data.
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), &j_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells*sizeof(cl_int), &level_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE, 0, ncells*sizeof(cl_int), &celltype_check[0], NULL);
for (uint ic = 0; ic < ncells; ic++){
if (i[ic] != i_check[ic] ) printf("DEBUG -- i: ic %d i %d i_check %d\n",ic, i[ic], i_check[ic]);
if (j[ic] != j_check[ic] ) printf("DEBUG -- j: ic %d j %d j_check %d\n",ic, j[ic], j_check[ic]);
if (level[ic] != level_check[ic] ) printf("DEBUG -- level: ic %d level %d level_check %d\n",ic, level[ic], level_check[ic]);
if (celltype[ic] != celltype_check[ic] ) printf("DEBUG -- celltype: ic %d celltype %d celltype_check %d\n",ic, celltype[ic], celltype_check[ic]);
}
}
#endif
// Debug check: rebuild the global index arrays from the per-rank host arrays
// and compare them with the arrays stored in mesh_global.
//
//   ncells_global   number of entries compared (size of the gathered arrays)
//   mesh_global     mesh whose celltype/i/j/level arrays are the reference
//   nsizes, ndispl  per-rank counts and displacements given to MPI_Allgatherv
//   cycle           only echoed in the diagnostic output
//
// Every differing entry produces one "DEBUG rezone 3" line; nothing is
// modified and nothing is returned.
//
// NOTE(review): without HAVE_MPI the gathered arrays are never filled, so the
// reference is compared against zero-initialized vectors -- confirm that this
// routine is only meant to be used in MPI builds.
void Mesh::compare_indices_cpu_local_to_cpu_global(uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl, int cycle)
{
   // Reference arrays owned by the global mesh.
   int *ref_celltype = mesh_global->celltype;
   int *ref_i        = mesh_global->i;
   int *ref_j        = mesh_global->j;
   int *ref_level    = mesh_global->level;

   // Receive buffers for the gathered local arrays.
   vector<int> gathered_i(ncells_global);
   vector<int> gathered_j(ncells_global);
   vector<int> gathered_level(ncells_global);
   vector<int> gathered_celltype(ncells_global);

   // A disabled variant of this check went the other way: it scattered the
   // global arrays to the ranks with MPI_Scatterv and compared them against
   // the local arrays, writing mismatches to a file handle. It was commented
   // out in the original source and is not reproduced here.

#ifdef HAVE_MPI
   const int nsend = nsizes[mype];
   MPI_Allgatherv(&celltype[0], nsend, MPI_INT, &gathered_celltype[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
   MPI_Allgatherv(&i[0],        nsend, MPI_INT, &gathered_i[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
   MPI_Allgatherv(&j[0],        nsend, MPI_INT, &gathered_j[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
   MPI_Allgatherv(&level[0],    nsend, MPI_INT, &gathered_level[0],    nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
#else
   // Never executed; only references the parameters to silence
   // unused-parameter warnings.
   if (1 == 2) {
      printf("DEBUG -- nsizes[0] %d ndispl[0] %d\n", nsizes[0], ndispl[0]);
   }
#endif

   for (uint cell = 0; cell < ncells_global; cell++) {
      if (ref_celltype[cell] != gathered_celltype[cell]) {
         printf("DEBUG rezone 3 at cycle %d celltype_global & celltype_check_global %d %d %d \n", cycle, cell, ref_celltype[cell], gathered_celltype[cell]);
      }
      if (ref_i[cell] != gathered_i[cell]) {
         printf("DEBUG rezone 3 at cycle %d i_global & i_check_global %d %d %d \n", cycle, cell, ref_i[cell], gathered_i[cell]);
      }
      if (ref_j[cell] != gathered_j[cell]) {
         printf("DEBUG rezone 3 at cycle %d j_global & j_check_global %d %d %d \n", cycle, cell, ref_j[cell], gathered_j[cell]);
      }
      if (ref_level[cell] != gathered_level[cell]) {
         printf("DEBUG rezone 3 at cycle %d level_global & level_check_global %d %d %d \n", cycle, cell, ref_level[cell], gathered_level[cell]);
      }
   }
}
#ifdef HAVE_OPENCL
// Debug check that cross-compares the cell index arrays (level, celltype, i, j)
// between four copies: local host, local device, global host and global device.
//
//   mesh_global     mesh holding the global host arrays and global device buffers
//   ncells_global   number of entries in the global arrays
//   nsizes, ndispl  per-rank counts and displacements given to MPI_Allgatherv
//   ncycle          only echoed in the "rezone 3" and "rezone 4" messages
//
// The comparison runs only when HAVE_MPI is defined; otherwise the body is a
// never-executed printf that just references the parameters. Mismatches are
// printed, nothing is modified. The *_check and *_check_global vectors are
// reused from phase to phase, so the order of the phases matters.
void Mesh::compare_indices_all_to_gpu_local(Mesh *mesh_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle)
{
#ifdef HAVE_MPI
cl_command_queue command_queue = ezcl_get_command_queue();
// Global host arrays and global device buffers taken from mesh_global.
int *level_global = mesh_global->level;
int *celltype_global = mesh_global->celltype;
int *i_global = mesh_global->i;
int *j_global = mesh_global->j;
cl_mem &dev_celltype_global = mesh_global->dev_celltype;
cl_mem &dev_i_global = mesh_global->dev_i;
cl_mem &dev_j_global = mesh_global->dev_j;
cl_mem &dev_level_global = mesh_global->dev_level;
// Phase 1 ("rezone 1"): compare the local device index arrays with the local host arrays
vector<int> level_check(ncells);
vector<int> celltype_check(ncells);
vector<int> i_check(ncells);
vector<int> j_check(ncells);
/// Set read buffers for data.
// Only the last read is enqueued with CL_TRUE; presumably it is what makes all
// four buffers valid before the loop (assumes an in-order queue -- TODO confirm).
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells*sizeof(cl_int), &level_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_FALSE, 0, ncells*sizeof(cl_int), &celltype_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_check[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_TRUE, 0, ncells*sizeof(cl_int), &j_check[0], NULL);
for (uint ic = 0; ic < ncells; ic++){
if (level[ic] != level_check[ic] ) printf("%d: DEBUG rezone 1 cell %d level %d level_check %d\n",mype, ic, level[ic], level_check[ic]);
if (celltype[ic] != celltype_check[ic] ) printf("%d: DEBUG rezone 1 cell %d celltype %d celltype_check %d\n",mype, ic, celltype[ic], celltype_check[ic]);
if (i[ic] != i_check[ic] ) printf("%d: DEBUG rezone 1 cell %d i %d i_check %d\n",mype, ic, i[ic], i_check[ic]);
if (j[ic] != j_check[ic] ) printf("%d: DEBUG rezone 1 cell %d j %d j_check %d\n",mype, ic, j[ic], j_check[ic]);
}
// Phase 2 ("rezone 2"): gather the values just read from the local device
// buffers across ranks and compare them with the global host arrays
vector<int>celltype_check_global(ncells_global);
vector<int>i_check_global(ncells_global);
vector<int>j_check_global(ncells_global);
vector<int>level_check_global(ncells_global);
MPI_Allgatherv(&celltype_check[0], nsizes[mype], MPI_INT, &celltype_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
MPI_Allgatherv(&i_check[0], nsizes[mype], MPI_INT, &i_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
MPI_Allgatherv(&j_check[0], nsizes[mype], MPI_INT, &j_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
MPI_Allgatherv(&level_check[0], nsizes[mype], MPI_INT, &level_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic = 0; ic < ncells_global; ic++){
if (level_global[ic] != level_check_global[ic] ) printf("%d: DEBUG rezone 2 cell %d level_global %d level_check_global %d\n",mype, ic, level_global[ic], level_check_global[ic]);
if (celltype_global[ic] != celltype_check_global[ic] ) printf("%d: DEBUG rezone 2 cell %d celltype_global %d celltype_check_global %d\n",mype, ic, celltype_global[ic], celltype_check_global[ic]);
if (i_global[ic] != i_check_global[ic] ) printf("%d: DEBUG rezone 2 cell %d i_global %d i_check_global %d\n",mype, ic, i_global[ic], i_check_global[ic]);
if (j_global[ic] != j_check_global[ic] ) printf("%d: DEBUG rezone 2 cell %d j_global %d j_check_global %d\n",mype, ic, j_global[ic], j_check_global[ic]);
}
// Phase 3 ("rezone 3"): gather the local host arrays across ranks (overwriting
// the phase-2 gather buffers) and compare them with the global host arrays.
// These messages carry the cycle number but no rank prefix.
MPI_Allgatherv(&celltype[0], nsizes[mype], MPI_INT, &celltype_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
MPI_Allgatherv(&i[0], nsizes[mype], MPI_INT, &i_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
MPI_Allgatherv(&j[0], nsizes[mype], MPI_INT, &j_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
MPI_Allgatherv(&level[0], nsizes[mype], MPI_INT, &level_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
for (uint ic = 0; ic < ncells_global; ic++){
if (celltype_global[ic] != celltype_check_global[ic]) printf("DEBUG rezone 3 at cycle %d celltype_global & celltype_check_global %d %d %d \n",ncycle,ic,celltype_global[ic],celltype_check_global[ic]);
if (i_global[ic] != i_check_global[ic]) printf("DEBUG rezone 3 at cycle %d i_global & i_check_global %d %d %d \n",ncycle,ic,i_global[ic],i_check_global[ic]);
if (j_global[ic] != j_check_global[ic]) printf("DEBUG rezone 3 at cycle %d j_global & j_check_global %d %d %d \n",ncycle,ic,j_global[ic],j_check_global[ic]);
if (level_global[ic] != level_check_global[ic]) printf("DEBUG rezone 3 at cycle %d level_global & level_check_global %d %d %d \n",ncycle,ic,level_global[ic],level_check_global[ic]);
}
// Phase 4 ("rezone 4"): read the global device buffers (again overwriting the
// gather buffers) and compare them with the global host arrays
ezcl_enqueue_read_buffer(command_queue, dev_celltype_global, CL_FALSE, 0, ncells_global*sizeof(cl_int), &celltype_check_global[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_i_global, CL_FALSE, 0, ncells_global*sizeof(cl_int), &i_check_global[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j_global, CL_FALSE, 0, ncells_global*sizeof(cl_int), &j_check_global[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level_global, CL_TRUE, 0, ncells_global*sizeof(cl_int), &level_check_global[0], NULL);
for (uint ic = 0; ic < ncells_global; ic++){
if (celltype_global[ic] != celltype_check_global[ic]) printf("DEBUG rezone 4 at cycle %d celltype_global & celltype_check_global %d %d %d \n",ncycle,ic,celltype_global[ic],celltype_check_global[ic]);
if (i_global[ic] != i_check_global[ic]) printf("DEBUG rezone 4 at cycle %d i_global & i_check_global %d %d %d \n",ncycle,ic,i_global[ic],i_check_global[ic]);
if (j_global[ic] != j_check_global[ic]) printf("DEBUG rezone 4 at cycle %d j_global & j_check_global %d %d %d \n",ncycle,ic,j_global[ic],j_check_global[ic]);
if (level_global[ic] != level_check_global[ic]) printf("DEBUG rezone 4 at cycle %d level_global & level_check_global %d %d %d \n",ncycle,ic,level_global[ic],level_check_global[ic]);
}
#else
// Just to get rid of compiler warnings
// (the condition is never true; the printf only references the parameters)
if (1 == 2) printf("DEBUG -- mesh_global %p ncells_global %d nsizes[0] %d ndispl[0] %d ncycle %d\n",
mesh_global,ncells_global,nsizes[0],ndispl[0],ncycle);
#endif
}
// Debug check (double-precision state): verify that the device copies of the
// cell coordinates and of H agree with the host copies.
//
//   dev_x, dev_dx, dev_y, dev_dy  device buffers, each read back as ncells
//                                 spatial_t values and compared with the host
//                                 arrays x, dx, y, dy
//   dev_H                         device buffer read back as ncells doubles
//   H                             host state array compared with dev_H
//
// Coordinates must match exactly; H must agree to within CONSERVATION_EPS.
// On the first mismatch an "Error --" line is printed and the process is
// terminated with EXIT_FAILURE so that the calling shell or test harness sees
// the failure (the previous exit status of 0 reported success).
void Mesh::compare_coordinates_gpu_global_to_cpu_global_double(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, double *H)
{
   cl_command_queue command_queue = ezcl_get_command_queue();

   vector<spatial_t> x_check(ncells);
   vector<spatial_t> dx_check(ncells);
   vector<spatial_t> y_check(ncells);
   vector<spatial_t> dy_check(ncells);
   vector<double>    H_check(ncells);

   // Only the last read is enqueued with CL_TRUE; presumably it is what makes
   // all five buffers valid before they are inspected (assumes an in-order
   // queue -- TODO confirm).
   ezcl_enqueue_read_buffer(command_queue, dev_x,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &x_check[0],  NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_dx, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dx_check[0], NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_y,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &y_check[0],  NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_dy, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dy_check[0], NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_H,  CL_TRUE,  0, ncells*sizeof(cl_double),    &H_check[0],  NULL);

   for (uint ic = 0; ic < ncells; ic++){
      if (x[ic] != x_check[ic] || dx[ic] != dx_check[ic] || y[ic] != y_check[ic] || dy[ic] != dy_check[ic] ) {
         // ic is unsigned, so it is printed with %u.
         printf("Error -- mismatch in spatial coordinates for cell %u is gpu %lf %lf %lf %lf cpu %lf %lf %lf %lf\n",ic,x_check[ic],dx_check[ic],y_check[ic],dy_check[ic],x[ic],dx[ic],y[ic],dy[ic]);
         exit(EXIT_FAILURE);
      }
   }

   for (uint ic = 0; ic < ncells; ic++){
      if (fabs(H[ic] - H_check[ic]) > CONSERVATION_EPS) {
         printf("Error -- mismatch in H for cell %u is gpu %lf cpu %lf\n",ic,H_check[ic],H[ic]);
         exit(EXIT_FAILURE);
      }
   }
}
// Debug check (single-precision state): verify that the device copies of the
// cell coordinates and of H agree with the host copies.
//
//   dev_x, dev_dx, dev_y, dev_dy  device buffers, each read back as ncells
//                                 spatial_t values and compared with the host
//                                 arrays x, dx, y, dy
//   dev_H                         device buffer read back as ncells floats
//   H                             host state array compared with dev_H
//
// Coordinates must match exactly; H must agree to within CONSERVATION_EPS.
// On the first mismatch an "Error --" line is printed and the process is
// terminated with EXIT_FAILURE so that the calling shell or test harness sees
// the failure (the previous exit status of 0 reported success).
void Mesh::compare_coordinates_gpu_global_to_cpu_global_float(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, float *H)
{
   cl_command_queue command_queue = ezcl_get_command_queue();

   vector<spatial_t> x_check(ncells);
   vector<spatial_t> dx_check(ncells);
   vector<spatial_t> y_check(ncells);
   vector<spatial_t> dy_check(ncells);
   vector<float>     H_check(ncells);

   // Only the last read is enqueued with CL_TRUE; presumably it is what makes
   // all five buffers valid before they are inspected (assumes an in-order
   // queue -- TODO confirm).
   ezcl_enqueue_read_buffer(command_queue, dev_x,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &x_check[0],  NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_dx, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dx_check[0], NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_y,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &y_check[0],  NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_dy, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dy_check[0], NULL);
   ezcl_enqueue_read_buffer(command_queue, dev_H,  CL_TRUE,  0, ncells*sizeof(cl_float),     &H_check[0],  NULL);

   for (uint ic = 0; ic < ncells; ic++){
      if (x[ic] != x_check[ic] || dx[ic] != dx_check[ic] || y[ic] != y_check[ic] || dy[ic] != dy_check[ic] ) {
         // ic is unsigned, so it is printed with %u.
         printf("Error -- mismatch in spatial coordinates for cell %u is gpu %lf %lf %lf %lf cpu %lf %lf %lf %lf\n",ic,x_check[ic],dx_check[ic],y_check[ic],dy_check[ic],x[ic],dx[ic],y[ic],dy[ic]);
         exit(EXIT_FAILURE);
      }
   }

   for (uint ic = 0; ic < ncells; ic++){
      if (fabs(H[ic] - H_check[ic]) > CONSERVATION_EPS) {
         printf("Error -- mismatch in H for cell %u is gpu %lf cpu %lf\n",ic,H_check[ic],H[ic]);
         exit(EXIT_FAILURE);
      }
   }
}
#endif
void Mesh::compare_coordinates_cpu_local_to_cpu_global_double(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, double *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, double *H_global, int cycle)
{
vector<spatial_t> x_check_global(ncells_global);
vector<spatial_t> dx_check_global(ncells_global);
vector<spatial_t> y_check_global(ncells_global);
vector<spatial_t> dy_check_global(ncells_global);
vector<double> H_check_global(ncells_global);
#ifdef HAVE_MPI
MPI_Allgatherv(&x[0], nsizes[mype], MPI_SPATIAL_T, &x_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&dx[0], nsizes[mype], MPI_SPATIAL_T, &dx_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&y[0], nsizes[mype], MPI_SPATIAL_T, &y_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&dy[0], nsizes[mype], MPI_SPATIAL_T, &dy_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&H[0], nsizes[mype], MPI_DOUBLE, &H_check_global[0], &nsizes[0], &ndispl[0], MPI_DOUBLE, MPI_COMM_WORLD);
#else
// Just to get rid of compiler warnings
if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d x %p dx %p y %p dy %p H %p\n",
nsizes[0],ndispl[0],x,dx,y,dy,H);
#endif
for (uint ic = 0; ic < ncells_global; ic++){
if (fabs(x_global[ic] -x_check_global[ic] ) > STATE_EPS) printf("DEBUG graphics at cycle %d x_global & x_check_global %d %lf %lf \n",cycle,ic,x_global[ic], x_check_global[ic]);
if (fabs(dx_global[ic]-dx_check_global[ic]) > STATE_EPS) printf("DEBUG graphics at cycle %d dx_global & dx_check_global %d %lf %lf \n",cycle,ic,dx_global[ic],dx_check_global[ic]);
if (fabs(y_global[ic] -y_check_global[ic] ) > STATE_EPS) printf("DEBUG graphics at cycle %d y_global & y_check_global %d %lf %lf \n",cycle,ic,y_global[ic], y_check_global[ic]);
if (fabs(dy_global[ic]-dy_check_global[ic]) > STATE_EPS) printf("DEBUG graphics at cycle %d dy_global & dy_check_global %d %lf %lf \n",cycle,ic,dy_global[ic],dy_check_global[ic]);
if (fabs(H_global[ic] -H_check_global[ic] ) > STATE_EPS) printf("DEBUG graphics at cycle %d H_global & H_check_global %d %lf %lf \n",cycle,ic,H_global[ic], H_check_global[ic]);
}
}
void Mesh::compare_coordinates_cpu_local_to_cpu_global_float(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, float *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, float *H_global, int cycle)
{
vector<spatial_t> x_check_global(ncells_global);
vector<spatial_t> dx_check_global(ncells_global);
vector<spatial_t> y_check_global(ncells_global);
vector<spatial_t> dy_check_global(ncells_global);
vector<float> H_check_global(ncells_global);
#ifdef HAVE_MPI
MPI_Allgatherv(&x[0], nsizes[mype], MPI_SPATIAL_T, &x_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&dx[0], nsizes[mype], MPI_SPATIAL_T, &dx_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&y[0], nsizes[mype], MPI_SPATIAL_T, &y_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&dy[0], nsizes[mype], MPI_SPATIAL_T, &dy_check_global[0], &nsizes[0], &ndispl[0], MPI_SPATIAL_T, MPI_COMM_WORLD);
MPI_Allgatherv(&H[0], nsizes[mype], MPI_FLOAT, &H_check_global[0], &nsizes[0], &ndispl[0], MPI_FLOAT, MPI_COMM_WORLD);
#else
// Just to get rid of compiler warnings
if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d x %p dx %p y %p dy %p H %p\n",
nsizes[0],ndispl[0],x,dx,y,dy,H);
#endif
for (uint ic = 0; ic < ncells_global; ic++){
if (fabs(x_global[ic] -x_check_global[ic] ) > STATE_EPS) printf("DEBUG graphics at cycle %d x_global & x_check_global %d %lf %lf \n",cycle,ic,x_global[ic], x_check_global[ic]);
if (fabs(dx_global[ic]-dx_check_global[ic]) > STATE_EPS) printf("DEBUG graphics at cycle %d dx_global & dx_check_global %d %lf %lf \n",cycle,ic,dx_global[ic],dx_check_global[ic]);
if (fabs(y_global[ic] -y_check_global[ic] ) > STATE_EPS) printf("DEBUG graphics at cycle %d y_global & y_check_global %d %lf %lf \n",cycle,ic,y_global[ic], y_check_global[ic]);
if (fabs(dy_global[ic]-dy_check_global[ic]) > STATE_EPS) printf("DEBUG graphics at cycle %d dy_global & dy_check_global %d %lf %lf \n",cycle,ic,dy_global[ic],dy_check_global[ic]);
if (fabs(H_global[ic] -H_check_global[ic] ) > STATE_EPS) printf("DEBUG graphics at cycle %d H_global & H_check_global %d %lf %lf \n",cycle,ic,H_global[ic], H_check_global[ic]);
}
}
#ifdef HAVE_OPENCL
void Mesh::compare_mpot_gpu_global_to_cpu_global(int *mpot, cl_mem dev_mpot)
{
cl_command_queue command_queue = ezcl_get_command_queue();
vector<int>mpot_check(ncells);
ezcl_enqueue_read_buffer(command_queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &mpot_check[0], NULL);
for (uint ic=0; ic<ncells; ic++) {
if (mpot[ic] != mpot_check[ic]) printf("DEBUG -- mpot: ic %d mpot %d mpot_check %d\n",ic, mpot[ic], mpot_check[ic]);
}
}
#endif
// Debug check: gather the per-rank mpot arrays on the host and compare the
// result with mpot_global.
//
//   ncells_global   number of entries compared
//   nsizes, ndispl  per-rank counts and displacements for MPI_Allgatherv
//   mpot            this rank's array
//   mpot_global     reference array, ncells_global entries
//   cycle           only echoed in the diagnostic output
//
// Mismatches are reported by rank 0 only; nothing is modified.
// NOTE(review): without HAVE_MPI the gathered vector is never filled, so
// mpot_global is compared against a zero-initialized vector.
void Mesh::compare_mpot_cpu_local_to_cpu_global(uint ncells_global, int *nsizes, int *ndispl, int *mpot, int *mpot_global, int cycle)
{
   vector<int> mpot_save_global(ncells_global);

#ifdef HAVE_MPI
   // The send count has to equal the receive count that every rank has been
   // told to expect from this rank, i.e. nsizes[mype]. This also matches the
   // sibling compare_* routines and avoids passing the unsigned ncells where
   // MPI expects an int.
   MPI_Allgatherv(&mpot[0], nsizes[mype], MPI_INT, &mpot_save_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
#else
   // Just to get rid of compiler warnings (never executed)
   if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d mpot %p\n",
      nsizes[0],ndispl[0],mpot);
#endif

   for (uint ic = 0; ic < ncells_global; ic++){
      if (mpot_global[ic] != mpot_save_global[ic]) {
         // ic is unsigned, so it is printed with %u.
         if (mype == 0) printf("%d: DEBUG refine_potential 3 at cycle %d cell %u mpot_global & mpot_save_global %d %d \n",mype,cycle,ic,mpot_global[ic],mpot_save_global[ic]);
      }
   }
}
#ifdef HAVE_OPENCL
// Debug check that cross-compares mpot between four copies: local host,
// local device, global host and global device.
//
//   mpot, mpot_global          host arrays (local and global)
//   dev_mpot, dev_mpot_global  device buffers (local and global)
//   ncells_global              number of entries in the global arrays
//   nsizes, ndispl             per-rank counts and displacements for MPI_Allgatherv
//   ncycle                     only echoed in the diagnostic output
//
// The comparison runs only when HAVE_MPI is defined; otherwise the body is a
// never-executed printf that just references the parameters. The gather
// buffer is reused by steps 2-4, so their order matters.
void Mesh::compare_mpot_all_to_gpu_local(int *mpot, int *mpot_global, cl_mem dev_mpot, cl_mem dev_mpot_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle)
{
#ifdef HAVE_MPI
   cl_command_queue queue = ezcl_get_command_queue();
   const int nsend = nsizes[mype];

   // Step 1: local device buffer against the local host array (every rank reports).
   vector<int> gpu_mpot(ncells);
   ezcl_enqueue_read_buffer(queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &gpu_mpot[0], NULL);
   for (uint cell = 0; cell < ncells; cell++) {
      if (mpot[cell] == gpu_mpot[cell]) continue;
      printf("%d: DEBUG refine_potential 1 at cycle %d cell %d mpot & mpot_save %d %d \n", mype, ncycle, cell, mpot[cell], gpu_mpot[cell]);
   }

   // Step 2: device values gathered over all ranks against the global host
   // array (from here on only rank 0 reports).
   vector<int> gathered(ncells_global);
   MPI_Allgatherv(&gpu_mpot[0], nsend, MPI_INT, &gathered[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
   for (uint cell = 0; cell < ncells_global; cell++) {
      if (mype == 0 && mpot_global[cell] != gathered[cell]) {
         printf("%d: DEBUG refine_potential 2 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n", mype, ncycle, cell, mpot_global[cell], gathered[cell]);
      }
   }

   // Step 3: local host values gathered over all ranks against the global host array.
   MPI_Allgatherv(mpot, nsend, MPI_INT, &gathered[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
   for (uint cell = 0; cell < ncells_global; cell++) {
      if (mype == 0 && mpot_global[cell] != gathered[cell]) {
         printf("%d: DEBUG refine_potential 3 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n", mype, ncycle, cell, mpot_global[cell], gathered[cell]);
      }
   }

   // Step 4: global device buffer against the global host array.
   ezcl_enqueue_read_buffer(queue, dev_mpot_global, CL_TRUE, 0, ncells_global*sizeof(cl_int), &gathered[0], NULL);
   for (uint cell = 0; cell < ncells_global; cell++) {
      if (mype == 0 && mpot_global[cell] != gathered[cell]) {
         printf("%d: DEBUG refine_potential 4 at cycle %d cell %u mpot_global & mpot_save_global %d %d \n", mype, ncycle, cell, mpot_global[cell], gathered[cell]);
      }
   }
#else
   // Never executed; only references the parameters to silence
   // unused-parameter warnings.
   if (1 == 2) {
      printf("DEBUG -- mpot %p mpot_global %p dev_mpot %p dev_mpot_global %p ncells_global %d nsizes[0] %d ndispl[0] %d ncycle %d\n",
             mpot, mpot_global, dev_mpot, dev_mpot_global, ncells_global, nsizes[0], ndispl[0], ncycle);
   }
#endif
}
// Debug check: recompute on the host the running per-tile cell-count offsets
// implied by mpot and compare them with dev_ioffset read back from the device.
//
//   old_ncells  number of cells covered by mpot
//   mpot        per-cell flag: negative, zero or positive (see below)
//
// For every tile of TILE_SIZE cells the host counts how many cells the tile
// contributes:
//   mpot < 0   real cell: 1 if is_lower_left(i,j), else 0;
//              other cell: 1 if is_upper_right(i,j) or is_lower_left(i,j)
//   mpot == 0  1
//   mpot > 0   4 for a real cell, 2 otherwise
// The offset expected for a tile is the sum of the counts of all earlier
// tiles. A mismatch prints one "DEBUG ig" line; nothing is modified.
void Mesh::compare_ioffset_gpu_global_to_cpu_global(uint old_ncells, int *mpot)
{
   // With ncells == 0 the work-group size below is zero and the block count
   // computation would divide by zero; bail out before touching the device.
   if (ncells == 0) return;

   cl_command_queue command_queue = ezcl_get_command_queue();

   size_t local_work_size  = MIN(ncells, TILE_SIZE);
   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
   // Number of work groups, i.e. number of offsets read back from the device.
   size_t block_size = global_work_size/local_work_size;

   vector<int> ioffset_check(block_size);
   ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset_check[0], NULL);

   int mtotal = 0;   // running offset: cells contributed by all earlier tiles
   for (uint ig=0; ig<(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){
      int mcount = 0;   // cells contributed by this tile
      for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
         if (ic >= old_ncells) break;

         if (mpot[ic] < 0) {
            if (celltype[ic] == REAL_CELL) {
               // remove all but cell that will remain to get count right when split
               // across processors
               if (is_lower_left(i[ic],j[ic]) ) mcount++;
            } else {
               // either upper right or lower left will remain for boundary cells
               if (is_upper_right(i[ic],j[ic]) || is_lower_left(i[ic],j[ic]) ) mcount++;
            }
         }
         if (mpot[ic] >= 0) {
            if (celltype[ic] == REAL_CELL){
               mcount += mpot[ic] ? 4 : 1;
            } else {
               mcount += mpot[ic] ? 2 : 1;
            }
         }
      }
      // The value printed is the running total, so it is labelled mtotal
      // (it was previously mislabelled mcount); ig is unsigned, hence %u.
      if (mtotal != ioffset_check[ig]) printf("DEBUG ig %u ioffset %d mtotal %d\n",ig,ioffset_check[ig],mtotal);
      mtotal += mcount;
   }
}
void Mesh::compare_ioffset_all_to_gpu_local(uint old_ncells, uint old_ncells_global, int block_size, int block_size_global, int *mpot, int *mpot_global, cl_mem dev_ioffset, cl_mem dev_ioffset_global, int *ioffset, int *ioffset_global, int *celltype_global, int *i_global, int *j_global)
{
cl_command_queue command_queue = ezcl_get_command_queue();
// This compares ioffset for each block in the calculation
ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset[0], NULL);
int mtotal = 0;
for (uint ig=0; ig<(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){
int mcount = 0;
for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
if (ic >= old_ncells) break;
if (mpot[ic] < 0) {
if (celltype[ic] == REAL_CELL) {
// remove all but cell that will remain to get count right when split
// across processors
if (is_lower_left(i[ic],j[ic]) ) mcount++;
} else {
// either upper right or lower left will remain for boundary cells
if (is_upper_right(i[ic],j[ic]) || is_lower_left(i[ic],j[ic]) ) mcount++;
}
}
if (mpot[ic] >= 0) {
if (celltype[ic] == REAL_CELL){
mcount += mpot[ic] ? 4 : 1;
} else {
mcount += mpot[ic] ? 2 : 1;
}
}
}
if (mtotal != ioffset[ig]) printf("%d: DEBUG ig %d ioffset %d mtotal %d\n",mype,ig,ioffset[ig],mtotal);
mtotal += mcount;
}
// For global This compares ioffset for each block in the calculation
ezcl_enqueue_read_buffer(command_queue, dev_ioffset_global, CL_TRUE, 0, block_size_global*sizeof(cl_int), &ioffset_global[0], NULL);
mtotal = 0;
int count = 0;
for (uint ig=0; ig<(old_ncells_global+TILE_SIZE-1)/TILE_SIZE; ig++){
int mcount = 0;
for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
if (ic >= old_ncells_global) break;
if (mpot_global[ic] < 0) {
if (celltype_global[ic] == REAL_CELL) {
// remove all but cell that will remain to get count right when split
// across processors
if (is_lower_left(i_global[ic],j_global[ic]) ) mcount++;
} else {
// either upper right or lower left will remain for boundary cells
if (is_upper_right(i_global[ic],j_global[ic]) || is_lower_left(i_global[ic],j_global[ic]) ) mcount++;
}
}
if (mpot_global[ic] >= 0) {
if (celltype_global[ic] == REAL_CELL) {
mcount += mpot_global[ic] ? 4 : 1;
} else {
mcount += mpot_global[ic] ? 2 : 1;
}
}
}
if (mtotal != ioffset_global[ig]) {
printf("DEBUG global ig %d ioffset %d mtotal %d\n",ig,ioffset_global[ig],mtotal);
count++;
}
if (count > 10) exit(0);
mtotal += mcount;
}
}
#endif
// Constructor: records the mesh parameters, sets up the parallel bookkeeping,
// computes the domain extents and builds the per-refinement-level tables.
//
//   nx, ny        number of coarse cells in x and y (boundary cells excluded)
//   levmx_in      maximum refinement level; the level tables get levmx_in+1 entries
//   ndim_in       dimensionality, stored in ndim
//   deltax_in, deltay_in  coarse cell size in x and y
//   boundary      stored in have_boundary; non-zero adds one layer of cells
//                 on every side when computing the extents
//   parallel_in   stored in parallel; enables the MPI rank/size query and the
//                 mesh_memory.pinit call (HAVE_MPI builds only)
//   do_gpu_calc   non-zero also creates device copies of the level tables
//                 (HAVE_OPENCL builds only)
//
// No cells are allocated here: ncells is set to 0 and the celltype and
// neighbor pointers are set to NULL.
Mesh::Mesh(int nx, int ny, int levmx_in, int ndim_in, double deltax_in, double deltay_in, int boundary, int parallel_in, int do_gpu_calc)
{
lowerBound_Global = NULL;
upperBound_Global = NULL;
// Reset all timers and counters.
for (int i = 0; i < MESH_TIMER_SIZE; i++){
cpu_timers[i] = 0.0;
gpu_timers[i] = 0L;
}
for (int i = 0; i < MESH_COUNTER_SIZE; i++){
cpu_counters[i] = 0;
gpu_counters[i] = 0;
}
ndim = ndim_in;
levmx = levmx_in;
#ifdef HAVE_OPENCL
// Preprocessor defines string for the 2D case (presumably handed to the
// OpenCL kernel build -- the use of `defines` is not visible here).
if (ndim == TWO_DIMENSIONAL) defines = "-DTWO_DIMENSIONAL -DCARTESIAN";
#endif
offtile_ratio_local = 0;
offtile_local_count = 1;
// Serial defaults; overwritten below when MPI is initialized and parallel is set.
mype = 0;
numpe = 1;
ncells = 0;
ncells_ghost = 0;
parallel = parallel_in;
noffset = 0;
mem_factor = 1.0;
//mem_factor = 1.5;
#ifdef HAVE_MPI
int mpi_init;
MPI_Initialized(&mpi_init);
if (mpi_init && parallel){
MPI_Comm_rank(MPI_COMM_WORLD,&mype);
MPI_Comm_size(MPI_COMM_WORLD,&numpe);
}
// TODO add fini
// Second argument is 2L*1024*1024*1024 (presumably a size in bytes, i.e. 2 GiB
// -- confirm against mesh_memory.pinit).
if (parallel) mesh_memory.pinit(MPI_COMM_WORLD, 2L * 1024 * 1024 * 1024);
#endif
cell_handle = 0;
if (numpe == 1) mem_factor = 1.0;
deltax = deltax_in;
deltay = deltay_in;
have_boundary = boundary;
//int istart = 1;
//int jstart = 1;
//int iend = nx;
//int jend = ny;
// Coarse-level cell counts and index limits; nxx/nyy grow by 2 when a
// boundary layer is present, while imin/jmin/imax/jmax get the same values
// in both cases.
int nxx = nx;
int nyy = ny;
imin = 0;
jmin = 0;
imax = nx+1;
jmax = ny+1;
if (have_boundary) {
//istart = 0;
//jstart = 0;
//iend = nx + 1;
//jend = ny + 1;
nxx = nx + 2;
nyy = ny + 2;
imin = 0;
jmin = 0;
imax = nx + 1;
jmax = ny + 1;
}
// Physical extents: the domain is centered on the origin.
xmin = -deltax * 0.5 * (real_t)nxx;
ymin = -deltay * 0.5 * (real_t)nyy;
xmax = deltax * 0.5 * (real_t)nxx;
ymax = deltay * 0.5 * (real_t)nyy;
// Per-level tables, one entry for each level 0..levmx.
size_t lvlMxSize = levmx + 1;
levtable.resize(lvlMxSize);
lev_ibegin.resize(lvlMxSize);
lev_jbegin.resize(lvlMxSize);
lev_iend.resize( lvlMxSize);
lev_jend.resize( lvlMxSize);
lev_deltax.resize(lvlMxSize);
lev_deltay.resize(lvlMxSize);
// Level 0: index range imin+1 .. imax-1 and the coarse cell size.
lev_ibegin[0] = imin + 1;
lev_iend[0] = imax - 1;
lev_jbegin[0] = jmin + 1;
lev_jend[0] = jmax - 1;
lev_deltax[0] = deltax;
lev_deltay[0] = deltay;
// Each finer level doubles the begin index, maps the end index to 2*end+1
// and halves the cell size.
for (int lev = 1; lev <= levmx; lev++) {
lev_ibegin[lev] = lev_ibegin[lev-1]*2;
lev_iend[lev] = lev_iend [lev-1]*2 + 1;
lev_jbegin[lev] = lev_jbegin[lev-1]*2;
lev_jend[lev] = lev_jend [lev-1]*2 + 1;
lev_deltax[lev] = lev_deltax[lev-1]*0.5;
lev_deltay[lev] = lev_deltay[lev-1]*0.5;
}
// levtable[lev] = IPOW2(lev) (presumably 2^lev -- IPOW2 is defined elsewhere).
for (uint lev=0; lev<lvlMxSize; lev++){
levtable[lev] = IPOW2(lev);
}
if (do_gpu_calc) {
#ifdef HAVE_OPENCL
// The copy host ptr flag will have the data copied to the GPU as part of the allocation
dev_levtable = ezcl_malloc(&levtable[0], const_cast<char *>("dev_levtable"), &lvlMxSize, sizeof(cl_int), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
dev_levdx = ezcl_malloc(&lev_deltax[0], const_cast<char *>("dev_levdx"), &lvlMxSize, sizeof(cl_real_t), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
dev_levdy = ezcl_malloc(&lev_deltay[0], const_cast<char *>("dev_levdy"), &lvlMxSize, sizeof(cl_real_t), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
dev_levibeg = ezcl_malloc(&lev_ibegin[0], const_cast<char *>("dev_levibeg"), &lvlMxSize, sizeof(cl_int), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
dev_leviend = ezcl_malloc(&lev_iend[0], const_cast<char *>("dev_leviend"), &lvlMxSize, sizeof(cl_int), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
dev_levjbeg = ezcl_malloc(&lev_jbegin[0], const_cast<char *>("dev_levjbeg"), &lvlMxSize, sizeof(cl_int), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
dev_levjend = ezcl_malloc(&lev_jend[0], const_cast<char *>("dev_levjend"), &lvlMxSize, sizeof(cl_int), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
#endif
}
ibase = 0;
// Record the (ii,jj) index pairs that cover each of the four corner coarse
// cells (0,0), (0,jmax), (imax,0) and (imax,jmax) at the finest level:
// IPOW2(levmx) indices per side for each corner.
int ncells_corners = 4;
int i_corner[] = { 0, 0,imax,imax};
int j_corner[] = { 0,jmax, 0,jmax};
for(int ic=0; ic<ncells_corners; ic++){
for (int jj = j_corner[ic]*IPOW2(levmx); jj < (j_corner[ic]+1)*IPOW2(levmx); jj++) {
for (int ii = i_corner[ic]*IPOW2(levmx); ii < (i_corner[ic]+1)*IPOW2(levmx); ii++) {
corners_i.push_back(ii);
corners_j.push_back(jj);
}
}
}
do_rezone = true;
gpu_do_rezone = true;
// Cell-type and neighbor arrays start out unallocated.
celltype = NULL;
nlft = NULL;
nrht = NULL;
nbot = NULL;
ntop = NULL;
}
void Mesh::init(int nx, int ny, real_t circ_radius, partition_method initial_order, int do_gpu_calc)
{
if (do_gpu_calc) {
#ifdef HAVE_OPENCL
cl_context context = ezcl_get_context();
hash_lib_init();
if (mype == 0) printf("Starting compile of kernels in mesh\n");
char *bothsources = (char *)malloc(strlen(mesh_kern_source)+strlen(get_hash_kernel_source_string())+1);
strcpy(bothsources, get_hash_kernel_source_string());
strcat(bothsources, mesh_kern_source);
strcat(bothsources, "\0");
const char *defines = NULL;
cl_program program = ezcl_create_program_wsource(context, defines, bothsources);
free(bothsources);
kernel_reduction_scan2 = ezcl_create_kernel_wprogram(program, "finish_reduction_scan2_cl");
kernel_reduction_count = ezcl_create_kernel_wprogram(program, "finish_reduction_count_cl");
kernel_reduction_count2 = ezcl_create_kernel_wprogram(program, "finish_reduction_count2_cl");
kernel_hash_adjust_sizes = ezcl_create_kernel_wprogram(program, "hash_adjust_sizes_cl");
kernel_hash_setup = ezcl_create_kernel_wprogram(program, "hash_setup_cl");
kernel_hash_setup_local = ezcl_create_kernel_wprogram(program, "hash_setup_local_cl");
kernel_neighbor_init = ezcl_create_kernel_wprogram(program, "neighbor_init_cl");
kernel_calc_neighbors = ezcl_create_kernel_wprogram(program, "calc_neighbors_cl");
kernel_calc_neighbors_local = ezcl_create_kernel_wprogram(program, "calc_neighbors_local_cl");
kernel_calc_border_cells = ezcl_create_kernel_wprogram(program, "calc_border_cells_cl");
kernel_calc_border_cells2 = ezcl_create_kernel_wprogram(program, "calc_border_cells2_cl");
kernel_finish_scan = ezcl_create_kernel_wprogram(program, "finish_scan_cl");
kernel_get_border_data = ezcl_create_kernel_wprogram(program, "get_border_data_cl");
kernel_calc_layer1 = ezcl_create_kernel_wprogram(program, "calc_layer1_cl");
kernel_calc_layer1_sethash = ezcl_create_kernel_wprogram(program, "calc_layer1_sethash_cl");
kernel_calc_layer2 = ezcl_create_kernel_wprogram(program, "calc_layer2_cl");
kernel_get_border_data2 = ezcl_create_kernel_wprogram(program, "get_border_data2_cl");
kernel_calc_layer2_sethash = ezcl_create_kernel_wprogram(program, "calc_layer2_sethash_cl");
kernel_copy_mesh_data = ezcl_create_kernel_wprogram(program, "copy_mesh_data_cl");
kernel_fill_mesh_ghost = ezcl_create_kernel_wprogram(program, "fill_mesh_ghost_cl");
kernel_fill_neighbor_ghost = ezcl_create_kernel_wprogram(program, "fill_neighbor_ghost_cl");
kernel_set_corner_neighbor = ezcl_create_kernel_wprogram(program, "set_corner_neighbor_cl");
kernel_adjust_neighbors_local = ezcl_create_kernel_wprogram(program, "adjust_neighbors_local_cl");
kernel_hash_size = ezcl_create_kernel_wprogram(program, "calc_hash_size_cl");
kernel_finish_hash_size = ezcl_create_kernel_wprogram(program, "finish_reduction_minmax4_cl");
kernel_calc_spatial_coordinates = ezcl_create_kernel_wprogram(program, "calc_spatial_coordinates_cl");
kernel_do_load_balance_lower = ezcl_create_kernel_wprogram(program, "do_load_balance_lower_cl");
kernel_do_load_balance_middle = ezcl_create_kernel_wprogram(program, "do_load_balance_middle_cl");
kernel_do_load_balance_upper = ezcl_create_kernel_wprogram(program, "do_load_balance_upper_cl");
#ifndef MINIMUM_PRECISION
kernel_do_load_balance_double = ezcl_create_kernel_wprogram(program, "do_load_balance_double_cl");
#endif
kernel_do_load_balance_float = ezcl_create_kernel_wprogram(program, "do_load_balance_float_cl");
kernel_refine_smooth = ezcl_create_kernel_wprogram(program, "refine_smooth_cl");
kernel_coarsen_smooth = ezcl_create_kernel_wprogram(program, "coarsen_smooth_cl");
kernel_coarsen_check_block = ezcl_create_kernel_wprogram(program, "coarsen_check_block_cl");
kernel_rezone_all = ezcl_create_kernel_wprogram(program, "rezone_all_cl");
kernel_rezone_neighbors = ezcl_create_kernel_wprogram(program, "rezone_neighbors_cl");
#ifndef MINIMUM_PRECISION
kernel_rezone_one_double = ezcl_create_kernel_wprogram(program, "rezone_one_double_cl");
#endif
kernel_rezone_one_float = ezcl_create_kernel_wprogram(program, "rezone_one_float_cl");
kernel_copy_mpot_ghost_data = ezcl_create_kernel_wprogram(program, "copy_mpot_ghost_data_cl");
kernel_set_boundary_refinement = ezcl_create_kernel_wprogram(program, "set_boundary_refinement");
init_kernel_2stage_sum();
init_kernel_2stage_sum_int();
if (! have_boundary){
kernel_count_BCs = ezcl_create_kernel_wprogram(program, "count_BCs_cl");
}
ezcl_program_release(program);
if (mype == 0) printf("Finishing compile of kernels in mesh\n");
#endif
}
//KDTree_Initialize(&tree);
if (ncells > 0) { // this is a restart.
nsizes.resize (numpe);
ndispl.resize (numpe);
if (parallel && numpe > 1) {
#ifdef HAVE_MPI
int ncells_int = ncells;
MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
ndispl[0]=0;
for (int ip=1; ip<numpe; ip++){
ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
}
noffset=ndispl[mype];
ncells_global = ndispl[numpe-1] + nsizes[numpe-1];
#endif
} else {
noffset = 0;
ncells_global = ncells;
proc.resize (ncells);
calc_distribution(numpe);
}
calc_celltype(ncells);
} else {
int istart = 1,
jstart = 1,
iend = nx,
jend = ny,
nxx = nx,
nyy = ny;
if (have_boundary) {
istart = 0;
jstart = 0;
iend = nx + 1;
jend = ny + 1;
nxx = nx + 2;
nyy = ny + 2;
}
if (ndim == TWO_DIMENSIONAL) ncells = nxx * nyy - have_boundary * 4;
else ncells = nxx * nyy;
noffset = 0;
if (parallel) {
ncells_global = ncells;
nsizes.resize(numpe);
ndispl.resize(numpe);
for (int ip=0; ip<numpe; ip++){
nsizes[ip] = ncells_global/numpe;
if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
}
ndispl[0]=0;
for (int ip=1; ip<numpe; ip++){
ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
}
ncells= nsizes[mype];
noffset=ndispl[mype];
}
allocate(ncells);
index.resize(ncells);
int ic = 0;
for (int jj = jstart; jj <= jend; jj++) {
for (int ii = istart; ii <= iend; ii++) {
if (have_boundary && ii == 0 && jj == 0 ) continue;
if (have_boundary && ii == 0 && jj == jend) continue;
if (have_boundary && ii == iend && jj == 0 ) continue;
if (have_boundary && ii == iend && jj == jend) continue;
if (ic >= (int)noffset && ic < (int)(ncells+noffset)){
int iclocal = ic-noffset;
index[iclocal] = ic;
i[iclocal] = ii;
j[iclocal] = jj;
level[iclocal] = 0;
}
ic++;
}
}
//if (numpe > 1 && (initial_order != HILBERT_SORT && initial_order != HILBERT_PARTITION) ) mem_factor = 2.0;
partition_cells(numpe, index, initial_order);
calc_celltype(ncells);
calc_spatial_coordinates(0);
// Start lev loop here
for (int ilevel=1; ilevel<=levmx; ilevel++) {
//int old_ncells = ncells;
ncells_ghost = ncells;
calc_neighbors_local();
kdtree_setup();
int nez;
vector<int> ind(ncells);
#ifdef FULL_PRECISION
KDTree_QueryCircleIntersect_Double(&tree, &nez, &(ind[0]), circ_radius, ncells, &x[0], &dx[0], &y[0], &dy[0]);
#else
KDTree_QueryCircleIntersect_Float(&tree, &nez, &(ind[0]), circ_radius, ncells, &x[0], &dx[0], &y[0], &dy[0]);
#endif
vector<int> mpot(ncells_ghost,0);
for (int ic=0; ic<nez; ++ic){
if (level[ind[ic]] < levmx) mpot[ind[ic]] = 1;
}
KDTree_Destroy(&tree);
// Refine the cells.
int icount = 0;
int jcount = 0;
int new_ncells = refine_smooth(mpot, icount, jcount);
MallocPlus dummy;
rezone_all(icount, jcount, mpot, 0, dummy);
ncells = new_ncells;
calc_spatial_coordinates(0);
#ifdef HAVE_MPI
if (parallel && numpe > 1) {
int ncells_int = ncells;
MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
ndispl[0]=0;
for (int ip=1; ip<numpe; ip++){
ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
}
noffset=ndispl[mype];
ncells_global = ndispl[numpe-1] + nsizes[numpe-1];
}
#endif
} // End lev loop here
index.clear();
ncells_ghost = ncells;
}
int ncells_corners = 4;
int i_corner[] = { 0, 0,imax,imax};
int j_corner[] = { 0,jmax, 0,jmax};
for(int ic=0; ic<ncells_corners; ic++){
for (int jj = j_corner[ic]*IPOW2(levmx); jj < (j_corner[ic]+1)*IPOW2(levmx); jj++) {
for (int ii = i_corner[ic]*IPOW2(levmx); ii < (i_corner[ic]+1)*IPOW2(levmx); ii++) {
corners_i.push_back(ii);
corners_j.push_back(jj);
}
}
}
}
// Smooths the per-cell refinement flags in mpot and reports the resulting
// cell count.
//
// mpot   -- per-cell flag, updated in place. Values > 0 are treated as
//           "will be refined" (the cell's level is taken as level+1 when
//           compared with its neighbors); values < 0 are candidates for
//           coarsening, except values <= -1000000 which the coarsening
//           passes skip.
// icount -- on exit, the refinement count from the final rezone_count() call.
// jcount -- on exit, the coarsening count from the final rezone_count() call.
//
// Returns ncells plus the value returned by the final rezone_count() call.
// Also sets do_rezone to whether any refinement or coarsening is pending
// (summed over all ranks when built with MPI and running parallel).
//
// When built with OpenMP the body runs inside one parallel region; steps that
// touch shared state or call MPI/L7 are wrapped in master blocks followed by
// explicit barriers.
size_t Mesh::refine_smooth(vector<int> &mpot, int &icount, int &jcount)
{
vector<int> mpot_old;
int newcount;
int newcount_global;
struct timeval tstart_lev2;
// Initial tally of the refinement (icount) and coarsening (jcount) requests.
rezone_count(mpot, icount, jcount);
#ifdef _OPENMP
#pragma omp parallel
{ //START Parallel Region
#endif
#ifdef _OPENMP
#pragma omp master
{//MASTER START
#endif
// Seed the iteration with the local refinement count and, when parallel,
// its sum over all ranks.
newcount = icount;
newcount_global = newcount;
if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
#ifdef HAVE_MPI
if (parallel) {
MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
#endif
#ifdef _OPENMP
}//END MASTER
#pragma omp barrier
#endif
// Refinement smoothing is only needed when something was flagged somewhere
// and there is more than one refinement level.
if(newcount_global > 0 && levmx > 1) {
size_t my_ncells=ncells;
if (parallel) my_ncells=ncells_ghost;
#ifdef _OPENMP
#pragma omp master
{//MASTER START
#endif
cpu_counters[MESH_COUNTER_REFINE_SMOOTH]++;
mpot_old.resize(my_ncells);
#ifdef _OPENMP
}//END MASTER
#pragma omp barrier
#endif
int levcount = 1;
// Repeat until a pass flags no new cells on any rank, or levmx-1 passes
// have been made.
while (newcount_global > 0 && levcount < levmx){
levcount++;
#ifdef _OPENMP
#pragma omp master
{//MASTER START
#endif
// The previous flags become the read-only input (mpot_old) of this pass;
// with more than one rank they are first exchanged through L7_Update.
mpot.swap(mpot_old);
newcount=0;
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Update(&mpot_old[0], L7_INT, cell_handle);
}
#endif
#ifdef _OPENMP
}//END MASTER
#pragma omp barrier
#endif
// NOTE(review): the bounds are fetched but unused because the loop below
// runs over 0..ncells (the bounded form is commented out); the argument
// order here is also the reverse of the get_bounds call in
// calc_spatial_coordinates -- confirm before re-enabling the bounded loop.
int upperBound, lowerBound;
get_bounds(upperBound, lowerBound);
int mynewcount = newcount; //All threads get a mynewcount
#ifdef _OPENMP
#pragma omp for reduction(+:newcount)
#endif
for(uint ic = 0; ic < ncells; ic++) {
// for(uint ic = lowerBound; ic < upperBound; ic++){
int lev = level[ic];
mpot[ic] = mpot_old[ic];
// Already flagged for refinement: nothing more to decide for this cell.
if(mpot_old[ic] > 0) continue;
// For each of the four neighbors in turn: flag this cell when the
// neighbor's level (plus one if the neighbor is itself flagged) exceeds
// this cell's level by more than one. When the neighbor is currently
// finer than this cell, a second cell reached through the neighbor is
// tested the same way (presumably the other fine cell on the shared
// face -- confirm against the neighbor convention).
int nl = nlft[ic];
if (nl >= 0 && nl < (int)ncells_ghost) {
int ll = level[nl];
if(mpot_old[nl] > 0) ll++;
if(ll - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
ll = level[nl];
if (ll > lev) {
int nlt = ntop[nl];
if (nlt >= 0 && nlt < (int)ncells_ghost) {
int llt = level[nlt];
if(mpot_old[nlt] > 0) llt++;
if(llt - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
}
}
}
int nr = nrht[ic];
if (nr >= 0 && nr < (int)ncells_ghost) {
int lr = level[nr];
if(mpot_old[nr] > 0) lr++;
if(lr - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
lr = level[nr];
if (lr > lev) {
int nrt = ntop[nr];
if (nrt >= 0 && nrt < (int)ncells_ghost) {
int lrt = level[nrt];
if(mpot_old[nrt] > 0) lrt++;
if(lrt - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
}
}
}
int nt = ntop[ic];
if (nt >= 0 && nt < (int)ncells_ghost) {
int lt = level[nt];
if(mpot_old[nt] > 0) lt++;
if(lt - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
lt = level[nt];
if (lt > lev) {
int ntr = nrht[nt];
if (ntr >= 0 && ntr < (int)ncells_ghost) {
int ltr = level[ntr];
if(mpot_old[ntr] > 0) ltr++;
if(ltr - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
}
}
}
int nb = nbot[ic];
if (nb >= 0 && nb < (int)ncells_ghost) {
int lb = level[nb];
if(mpot_old[nb] > 0) lb++;
if(lb - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
lb = level[nb];
if (lb > lev) {
int nbr = nrht[nb];
if (nbr >= 0 && nbr < (int)ncells_ghost) {
int lbr = level[nbr];
if(mpot_old[nbr] > 0) lbr++;
if(lbr - lev > 1) {
mpot[ic]=1;
mynewcount++;
continue;
}
}
}
}
}
// Each thread adds its private tally to the shared newcount (which the
// master reset to 0 at the top of this pass).
#ifdef _OPENMP
#pragma omp atomic
#endif
newcount += mynewcount;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
// Accumulate this pass's new flags and refresh the global count that
// drives the while condition.
icount += newcount;
newcount_global = newcount;
#ifdef HAVE_MPI
if (parallel) {
MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
#endif
#ifdef _OPENMP
}//END MASTER
#pragma omp barrier
#endif
} // while (newcount_global > 0 && levcount < levmx);
}
#ifdef _OPENMP
#pragma omp master
{
#endif
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Update(&mpot[0], L7_INT, cell_handle);
}
#endif
// Move the current flags into mpot_old and leave mpot as a zero-filled
// vector of ncells_ghost entries for the first coarsening pass to fill.
mpot_old.clear();
mpot_old.resize(ncells_ghost);
mpot_old.swap(mpot);
#ifdef _OPENMP
}//END MASTER
#pragma omp barrier
#endif
// Coarsening pass 1: for every coarsening candidate, pick two neighbors
// according to which corner position the cell reports (is_upper_right,
// is_upper_left, ...) and cancel the coarsening (flag = 0) when either
// neighbor, counting its pending refinement, is at a finer level.
#ifdef _OPENMP
#pragma omp for
#endif
for(uint ic=0; ic<ncells; ic++) {
mpot[ic] = mpot_old[ic];
if (mpot_old[ic] >= 0) continue;
if (mpot_old[ic] <= -1000000) continue;
if ( is_upper_right(i[ic],j[ic]) ) {
int nr = nrht[ic];
int lr = level[nr];
if (mpot_old[nr] > 0) lr++;
int nt = ntop[ic];
int lt = level[nt];
if (mpot_old[nt] > 0) lt++;
if (lr > level[ic] || lt > level[ic]) mpot[ic] = 0;
} else if ( is_upper_left(i[ic],j[ic] ) ) {
int nl = nlft[ic];
int ll = level[nl];
if (mpot_old[nl] > 0) ll++;
int nt = ntop[ic];
int lt = level[nt];
if (mpot_old[nt] > 0) lt++;
if (ll > level[ic] || lt > level[ic]) mpot[ic] = 0;
} else if ( is_lower_right(i[ic],j[ic] ) ) {
int nr = nrht[ic];
int lr = level[nr];
if (mpot_old[nr] > 0) lr++;
int nb = nbot[ic];
int lb = level[nb];
if (mpot_old[nb] > 0) lb++;
if (lr > level[ic] || lb > level[ic]) mpot[ic] = 0;
} else if ( is_lower_left(i[ic],j[ic] ) ) {
int nl = nlft[ic];
int ll = level[nl];
if (mpot_old[nl] > 0) ll++;
int nb = nbot[ic];
int lb = level[nb];
if (mpot_old[nb] > 0) lb++;
if (ll > level[ic] || lb > level[ic]) mpot[ic] = 0;
}
}
#ifdef _OPENMP
#pragma omp master
{
#endif
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Update(&mpot[0], L7_INT, cell_handle);
}
#endif
// The result of pass 1 becomes the input (mpot_old) of pass 2.
mpot_old.swap(mpot);
#ifdef _OPENMP
}//END MASTER
#pragma omp barrier
#endif
// Coarsening pass 2: n1, n2 and n3 are three further cells selected from the
// candidate's corner position (presumably the rest of its 2x2 block --
// confirm). Coarsening is cancelled when n3 is negative, or unless all three
// are flagged exactly -1 and sit at the candidate's level.
#ifdef _OPENMP
#pragma omp for
#endif
for(uint ic=0; ic<ncells; ic++) {
int n1=0, n2=0, n3=0;
mpot[ic] = mpot_old[ic];
if (mpot_old[ic] >= 0) continue;
if (mpot_old[ic] <= -1000000) continue;
if ( is_upper_right(i[ic],j[ic]) ) {
n1 = nbot[ic];
n2 = nlft[ic];
n3 = nlft[n1];
} else if ( is_upper_left(i[ic],j[ic] ) ) {
n1 = nbot[ic];
n2 = nrht[ic];
n3 = nrht[n1];
} else if ( is_lower_right(i[ic],j[ic] ) ) {
n1 = ntop[ic];
n2 = nlft[ic];
n3 = nlft[n1];
} else if ( is_lower_left(i[ic],j[ic] ) ) {
n1 = ntop[ic];
n2 = nrht[ic];
n3 = nrht[n1];
}
if (n3 < 0) {
mpot[ic] = 0;
} else {
int lev1 = level[n1];
int lev2 = level[n2];
int lev3 = level[n3];
if (mpot_old[n1] > 0) lev1++;
if (mpot_old[n2] > 0) lev2++;
if (mpot_old[n3] > 0) lev3++;
if (mpot_old[n1] != -1 || lev1 != level[ic] ||
mpot_old[n2] != -1 || lev2 != level[ic] ||
mpot_old[n3] != -1 || lev3 != level[ic]) {
mpot[ic] = 0;
}
}
}
#ifdef _OPENMP
#pragma omp master
{
#endif
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Update(&mpot[0], L7_INT, cell_handle);
}
#endif
#ifdef _OPENMP
}//END MASTER
#pragma omp barrier
#endif
// Boundary cells (negative celltype) take the flag of the neighbor on the
// opposite side of the named boundary, e.g. a LEFT_BOUNDARY cell copies the
// flag of its right neighbor.
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells; ic++) {
if (celltype[ic] < 0) {
switch (celltype[ic]) {
case LEFT_BOUNDARY:
mpot[ic] = mpot[nrht[ic]];
break;
case RIGHT_BOUNDARY:
mpot[ic] = mpot[nlft[ic]];
break;
case BOTTOM_BOUNDARY:
mpot[ic] = mpot[ntop[ic]];
break;
case TOP_BOUNDARY:
mpot[ic] = mpot[nbot[ic]];
break;
}
}
}
#ifdef _OPENMP
#pragma omp barrier
}//END Parallel Region
#endif
// Recount with the smoothed flags; icount and jcount are overwritten here.
newcount = ncells + rezone_count(mpot, icount, jcount);
#ifdef HAVE_MPI
int icount_global = icount;
int jcount_global = jcount;
if (parallel) {
int count[2], global_count[2];
count[0] = icount;
count[1] = jcount;
MPI_Allreduce(&count, &global_count, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
icount_global = global_count[0];
jcount_global = global_count[1];
}
// A rezone is needed if any rank has refinement or coarsening pending.
do_rezone = (icount_global != 0 || jcount_global != 0) ? true : false;
#else
do_rezone = (icount != 0 || jcount != 0) ? true : false;
#endif
if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_REFINE_SMOOTH] += cpu_timer_stop(tstart_lev2);
return(newcount);
}
#ifdef HAVE_OPENCL
// Smooths the refinement flags held in the device buffer dev_mpot by running
// the refine_smooth, coarsen_smooth, coarsen_check_block and
// set_boundary_refinement kernels.
//
// dev_mpot -- device buffer of per-cell flags. It is swapped with scratch
//             buffers while smoothing; when nothing is pending globally it is
//             released and set to NULL.
// icount   -- in: local refinement count; out: count read back from the device.
// jcount   -- in: local coarsening count; out: count read back from the device.
//
// Returns ncells+icount-jcount. Also sets gpu_do_rezone and, when a rezone is
// pending, allocates dev_ioffset (which is neither declared nor released in
// this function).
int Mesh::gpu_refine_smooth(cl_mem &dev_mpot, int &icount, int &jcount)
{
struct timeval tstart_lev2;
if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
cl_command_queue command_queue = ezcl_get_command_queue();
// Round the global size up to a whole number of 128-item work-groups;
// block_size is the number of work-groups and sizes the reduction scratch.
size_t local_work_size = 128;
size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
size_t block_size = global_work_size/local_work_size;
int icount_global = icount;
int jcount_global = jcount;
#ifdef HAVE_MPI
if (parallel) {
int count[2], count_global[2];
count[0] = icount;
count[1] = jcount;
MPI_Allreduce(&count, &count_global, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
icount_global = count_global[0];
jcount_global = count_global[1];
}
#endif
int levcount = 1;
//int which_smooth=0;
// Refinement smoothing: only when something is flagged on some rank and
// levmx > 1 (levcount is still 1 here).
if(icount_global > 0 && levcount < levmx) {
size_t result_size = 1;
cl_mem dev_result = ezcl_malloc(NULL, const_cast<char *>("dev_result"), &result_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_mpot_old = ezcl_malloc(NULL, const_cast<char *>("dev_mpot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
int newcount = icount;
int newcount_global = icount_global;
// Repeat until a pass adds nothing on any rank, or levmx-1 passes were made.
while (newcount_global > 0 && levcount < levmx) {
levcount++;
gpu_counters[MESH_COUNTER_REFINE_SMOOTH]++;
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
}
#endif
if (icount_global) {
// Current flags become the kernel input (dev_mpot_old); the kernel writes
// the smoothed flags to dev_mpot and per-work-group counts to
// dev_redscratch.
ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
ezcl_set_kernel_arg(kernel_refine_smooth, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_refine_smooth, 1, sizeof(cl_int), (void *)&ncells_ghost);
ezcl_set_kernel_arg(kernel_refine_smooth, 2, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_refine_smooth, 3, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_refine_smooth, 4, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_refine_smooth, 5, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_refine_smooth, 6, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_refine_smooth, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_refine_smooth, 8, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_refine_smooth, 9, sizeof(cl_mem), (void *)&dev_mpot_old);
ezcl_set_kernel_arg(kernel_refine_smooth,10, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_set_kernel_arg(kernel_refine_smooth,11, sizeof(cl_mem), (void *)&dev_redscratch);
ezcl_set_kernel_arg(kernel_refine_smooth,12, sizeof(cl_mem), (void *)&dev_result);
ezcl_set_kernel_arg(kernel_refine_smooth,13, local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_refine_smooth, 1, NULL, &global_work_size, &local_work_size, NULL);
// Finish the reduction on the device, then do a blocking read of the
// single-int result.
gpu_rezone_count(block_size, local_work_size, dev_redscratch, dev_result);
int result;
ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &result, NULL);
//printf("result = %d after %d refine smooths\n",result,which_smooth);
//which_smooth++;
icount = result;
}
// NOTE(review): on the first pass newcount holds the incoming icount, so
// this is the change in icount; on later passes newcount holds the previous
// pass's difference rather than the previous icount -- confirm intended.
newcount = icount-newcount;
newcount_global = newcount;
#ifdef HAVE_MPI
if (parallel) {
MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
#endif
icount_global += newcount_global;
//printf("DEBUG -- icount %d icount_global %d newcount %d newcount_global %d\n",icount,icount_global,newcount,newcount_global);
}
ezcl_device_memory_delete(dev_mpot_old);
ezcl_device_memory_delete(dev_redscratch);
ezcl_device_memory_delete(dev_result);
}
// Coarsening smoothing. The block is entered on the global count so that
// every rank takes part in the L7 updates and the Allreduce; the kernels
// themselves run only where the local jcount is non-zero.
if (jcount_global) {
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
}
#endif
cl_mem dev_mpot_old = ezcl_malloc(NULL, const_cast<char *>("dev_mpot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
if (jcount) {
ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 1, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 2, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 3, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 4, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 5, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 6, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 8, sizeof(cl_mem), (void *)&dev_mpot_old);
ezcl_set_kernel_arg(kernel_coarsen_smooth, 9, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_coarsen_smooth, 1, NULL, &global_work_size, &local_work_size, NULL);
}
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
}
#endif
if (jcount) {
size_t result_size = 1;
cl_mem dev_result = ezcl_malloc(NULL, const_cast<char *>("dev_result"), &result_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 1, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 2, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 3, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 4, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 5, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 6, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 8, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_coarsen_check_block, 9, sizeof(cl_mem), (void *)&dev_mpot_old);
ezcl_set_kernel_arg(kernel_coarsen_check_block,10, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_set_kernel_arg(kernel_coarsen_check_block,11, sizeof(cl_mem), (void *)&dev_redscratch);
ezcl_set_kernel_arg(kernel_coarsen_check_block,12, sizeof(cl_mem), (void *)&dev_result);
ezcl_set_kernel_arg(kernel_coarsen_check_block,13, local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_coarsen_check_block, 1, NULL, &global_work_size, &local_work_size, NULL);
gpu_rezone_count(block_size, local_work_size, dev_redscratch, dev_result);
int result;
ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &result, NULL);
//printf("result = %d after coarsen smooth\n",result);
jcount = result;
ezcl_device_memory_delete(dev_redscratch);
ezcl_device_memory_delete(dev_result);
}
jcount_global = jcount;
#ifdef HAVE_MPI
if (parallel) {
MPI_Allreduce(&jcount, &jcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
#endif
ezcl_device_memory_delete(dev_mpot_old);
}
// Final step: when anything is pending globally, run the boundary kernel,
// which yields an (icount, jcount) pair through a cl_int2 reduction and fills
// dev_ioffset; otherwise dev_mpot is released.
if (icount_global || jcount_global) {
#ifdef HAVE_MPI
if (numpe > 1) {
L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
}
#endif
size_t result_size = 1;
cl_mem dev_result = ezcl_malloc(NULL, const_cast<char *>("dev_result"), &result_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 1, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 2, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 3, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 4, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 5, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 6, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 7, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 8, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 9, sizeof(cl_mem), (void *)&dev_redscratch);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 10, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 11, sizeof(cl_mem), (void *)&dev_result);
ezcl_set_kernel_arg(kernel_set_boundary_refinement, 12, local_work_size*sizeof(cl_int2), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_set_boundary_refinement, 1, NULL, &global_work_size, &local_work_size, NULL);
gpu_rezone_count2(block_size, local_work_size, dev_redscratch, dev_result);
// Blocking read of the pair: [0] becomes icount, [1] becomes jcount.
int my_result[2];
ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, 1*sizeof(cl_int2), &my_result, NULL);
//printf("Result is %lu icount %d jcount %d\n", ncells+my_result[0]-my_result[1],my_result[0],my_result[1]);
icount = my_result[0];
jcount = my_result[1];
icount_global = icount;
jcount_global = jcount;
#ifdef HAVE_MPI
if (parallel) {
int count[2], count_global[2];
count[0] = icount;
count[1] = jcount;
MPI_Allreduce(&count, &count_global, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
icount_global = count_global[0];
jcount_global = count_global[1];
}
#endif
gpu_rezone_scan(block_size, local_work_size, dev_ioffset, dev_result);
//ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &my_result, NULL);
//printf("After scan, Result is %d\n", my_result[0]);
ezcl_device_memory_delete(dev_result);
ezcl_device_memory_delete(dev_redscratch);
} else {
// Nothing to rezone anywhere: the flag buffer is released here and the
// caller's handle is cleared.
ezcl_device_memory_delete(dev_mpot);
dev_mpot = NULL;
}
gpu_do_rezone = (icount_global != 0 || jcount_global != 0) ? true : false;
// cpu_timer_stop's result is scaled by 1.0e9 before being added to the
// integer gpu timer.
if (TIMING_LEVEL >= 2) gpu_timers[MESH_TIMER_REFINE_SMOOTH] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
return ncells+icount-jcount;
}
#endif
// Releases the resources held by the mesh: the host arrays registered with
// mesh_memory, and (when built with OpenCL) the device buffers and kernels.
void Mesh::terminate(void)
{
// Host-side per-cell arrays.
mesh_memory.memory_delete(i);
mesh_memory.memory_delete(j);
mesh_memory.memory_delete(level);
mesh_memory.memory_delete(celltype);
// The neighbor arrays are only released here when neighbor_remap is set.
if (neighbor_remap) {
mesh_memory.memory_delete(nlft);
mesh_memory.memory_delete(nrht);
mesh_memory.memory_delete(nbot);
mesh_memory.memory_delete(ntop);
}
#ifdef HAVE_OPENCL
hash_lib_terminate();
// Device-side level tables and per-cell buffers.
ezcl_device_memory_delete(dev_levtable);
ezcl_device_memory_delete(dev_levdx);
ezcl_device_memory_delete(dev_levdy);
ezcl_device_memory_delete(dev_levibeg);
ezcl_device_memory_delete(dev_leviend);
ezcl_device_memory_delete(dev_levjbeg);
ezcl_device_memory_delete(dev_levjend);
ezcl_device_memory_delete(dev_level);
ezcl_device_memory_delete(dev_i);
ezcl_device_memory_delete(dev_j);
ezcl_device_memory_delete(dev_celltype);
// Device neighbor buffers: released only when neighbor_remap is set and
// dev_nlft is non-NULL (the other three are not checked individually).
if (neighbor_remap && dev_nlft != NULL){
ezcl_device_memory_delete(dev_nlft);
ezcl_device_memory_delete(dev_nrht);
ezcl_device_memory_delete(dev_nbot);
ezcl_device_memory_delete(dev_ntop);
}
// Kernel objects.
ezcl_kernel_release(kernel_reduction_scan2);
ezcl_kernel_release(kernel_reduction_count);
ezcl_kernel_release(kernel_reduction_count2);
ezcl_kernel_release(kernel_hash_adjust_sizes);
ezcl_kernel_release(kernel_hash_setup);
ezcl_kernel_release(kernel_hash_setup_local);
ezcl_kernel_release(kernel_neighbor_init);
ezcl_kernel_release(kernel_calc_neighbors);
ezcl_kernel_release(kernel_calc_neighbors_local);
ezcl_kernel_release(kernel_calc_border_cells);
ezcl_kernel_release(kernel_calc_border_cells2);
ezcl_kernel_release(kernel_finish_scan);
ezcl_kernel_release(kernel_get_border_data);
ezcl_kernel_release(kernel_calc_layer1);
ezcl_kernel_release(kernel_calc_layer1_sethash);
ezcl_kernel_release(kernel_calc_layer2);
ezcl_kernel_release(kernel_get_border_data2);
ezcl_kernel_release(kernel_calc_layer2_sethash);
//ezcl_kernel_release(kernel_calc_neighbors_local2);
ezcl_kernel_release(kernel_copy_mesh_data);
ezcl_kernel_release(kernel_fill_mesh_ghost);
ezcl_kernel_release(kernel_fill_neighbor_ghost);
ezcl_kernel_release(kernel_set_corner_neighbor);
ezcl_kernel_release(kernel_adjust_neighbors_local);
//ezcl_kernel_release(kernel_copy_ghost_data);
//ezcl_kernel_release(kernel_adjust_neighbors);
ezcl_kernel_release(kernel_hash_size);
ezcl_kernel_release(kernel_finish_hash_size);
ezcl_kernel_release(kernel_calc_spatial_coordinates);
ezcl_kernel_release(kernel_do_load_balance_lower);
ezcl_kernel_release(kernel_do_load_balance_middle);
ezcl_kernel_release(kernel_do_load_balance_upper);
// The double-precision kernels are released under the same
// MINIMUM_PRECISION guard used where they are created.
#ifndef MINIMUM_PRECISION
ezcl_kernel_release(kernel_do_load_balance_double);
#endif
ezcl_kernel_release(kernel_do_load_balance_float);
ezcl_kernel_release(kernel_refine_smooth);
ezcl_kernel_release(kernel_coarsen_smooth);
ezcl_kernel_release(kernel_coarsen_check_block);
ezcl_kernel_release(kernel_rezone_all);
ezcl_kernel_release(kernel_rezone_neighbors);
#ifndef MINIMUM_PRECISION
ezcl_kernel_release(kernel_rezone_one_double);
#endif
ezcl_kernel_release(kernel_rezone_one_float);
ezcl_kernel_release(kernel_copy_mpot_ghost_data);
ezcl_kernel_release(kernel_set_boundary_refinement);
terminate_kernel_2stage_sum();
terminate_kernel_2stage_sum_int();
// kernel_count_BCs is created only when have_boundary is false, so it is
// released under the same condition.
if (! have_boundary){
ezcl_kernel_release(kernel_count_BCs);
}
#endif
#if defined(HAVE_J7) && defined(HAVE_MPI)
if (parallel) mesh_memory.pfini();
#endif
}
// Tallies how the flags in mpot change the local cell count.
//
// mpot   -- per-cell flag: > 0 means the cell is refined, < 0 means it is
//           coarsened, 0 means it is left alone.
// icount -- out: cells added by refinement (3 per real cell, 1 per other cell).
// jcount -- out: cells removed by coarsening, as a non-positive number.
//
// Returns icount + jcount.
int Mesh::rezone_count(vector<int> mpot, int &icount, int &jcount)
{
   int cells_added   = 0;
   int cells_removed = 0;

#ifdef _OPENMP
#pragma omp parallel for reduction (+:cells_removed,cells_added)
#endif
   for (uint cell = 0; cell < ncells; ++cell) {
      const int flag = mpot[cell];

      if (flag < 0) {
         // Count every coarsened cell except the one that stays behind, so
         // the total is still right when a block is split across processors.
         bool stays;
         if (celltype[cell] == REAL_CELL) {
            stays = is_lower_left(i[cell], j[cell]);
         } else {
            // For boundary cells either the upper right or the lower left
            // cell remains.
            stays = is_upper_right(i[cell], j[cell]) || is_lower_left(i[cell], j[cell]);
         }
         if (! stays) cells_removed--;
      }

      if (flag > 0) {
         if (celltype[cell] == REAL_CELL) {
            cells_added += 3;
         } else {
            cells_added++;
         }
      }
   }

   icount = cells_added;
   jcount = cells_removed;

   return(icount+jcount);
}
#ifdef HAVE_OPENCL
// Enqueues the final stage of the two-value count reduction: a single
// work-group of local_work_size items runs kernel_reduction_count2 over the
// block_size partial results in dev_redscratch, writing into dev_result.
// The call only enqueues the kernel; it does not wait for completion.
void Mesh::gpu_rezone_count2(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result)
{
   cl_command_queue command_queue = ezcl_get_command_queue();

   // The kernel's first parameter is a 32-bit int. Pass a real cl_int rather
   // than the first sizeof(cl_int) bytes of a size_t, which only holds the
   // right value on little-endian hosts.
   cl_int isize = (cl_int)block_size;

     /*
     __kernel void finish_reduction_count2_cl(
                       const int   isize,      // 0
              __global       int  *redscratch, // 1
              __global       int  *result,     // 2
              __local        int  *tile)       // 3
     */
   ezcl_set_kernel_arg(kernel_reduction_count2, 0, sizeof(cl_int), (void *)&isize);
   ezcl_set_kernel_arg(kernel_reduction_count2, 1, sizeof(cl_mem), (void *)&dev_redscratch);
   ezcl_set_kernel_arg(kernel_reduction_count2, 2, sizeof(cl_mem), (void *)&dev_result);
   ezcl_set_kernel_arg(kernel_reduction_count2, 3, local_work_size*sizeof(cl_int2), NULL);

   // Global size equals local size: exactly one work-group.
   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_count2, 1, NULL, &local_work_size, &local_work_size, NULL);
}
// Enqueues the final stage of the single-value count reduction: a single
// work-group of local_work_size items runs kernel_reduction_count over the
// block_size partial results in dev_redscratch, writing into dev_result.
// The call only enqueues the kernel; it does not wait for completion.
void Mesh::gpu_rezone_count(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result)
{
   cl_command_queue command_queue = ezcl_get_command_queue();

   // The kernel's first parameter is a 32-bit int. Pass a real cl_int rather
   // than the first sizeof(cl_int) bytes of a size_t, which only holds the
   // right value on little-endian hosts.
   cl_int isize = (cl_int)block_size;

     /*
     __kernel void finish_reduction_count_cl(
                       const int   isize,      // 0
              __global       int  *redscratch, // 1
              __global       int  *result,     // 2
              __local        int  *tile)       // 3
     */
   ezcl_set_kernel_arg(kernel_reduction_count, 0, sizeof(cl_int), (void *)&isize);
   ezcl_set_kernel_arg(kernel_reduction_count, 1, sizeof(cl_mem), (void *)&dev_redscratch);
   ezcl_set_kernel_arg(kernel_reduction_count, 2, sizeof(cl_mem), (void *)&dev_result);
   ezcl_set_kernel_arg(kernel_reduction_count, 3, local_work_size*sizeof(cl_int), NULL);

   // Global size equals local size: exactly one work-group.
   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_count, 1, NULL, &local_work_size, &local_work_size, NULL);
}
// Enqueues kernel_reduction_scan2 as a single work-group of local_work_size
// items over the block_size entries of dev_ioffset, with dev_result as the
// third kernel argument.
// The call only enqueues the kernel; it does not wait for completion.
void Mesh::gpu_rezone_scan(size_t block_size, size_t local_work_size, cl_mem dev_ioffset, cl_mem &dev_result)
{
   cl_command_queue command_queue = ezcl_get_command_queue();

   // The kernel's first parameter is a 32-bit int. Pass a real cl_int rather
   // than the first sizeof(cl_int) bytes of a size_t, which only holds the
   // right value on little-endian hosts.
   cl_int isize = (cl_int)block_size;

     /*
     __kernel void finish_reduction_scan_cl(
                       const int   isize,    // 0
              __global       int  *ioffset,  // 1
              __global       int  *result,   // 2
              __local        int  *tile)     // 3
     */
   ezcl_set_kernel_arg(kernel_reduction_scan2, 0, sizeof(cl_int), (void *)&isize);
   ezcl_set_kernel_arg(kernel_reduction_scan2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
   ezcl_set_kernel_arg(kernel_reduction_scan2, 2, sizeof(cl_mem), (void *)&dev_result);
   ezcl_set_kernel_arg(kernel_reduction_scan2, 3, local_work_size*sizeof(cl_uint2), NULL);

   // Global size equals local size: exactly one work-group.
   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_scan2, 1, NULL, &local_work_size, &local_work_size, NULL);
}
#endif
void Mesh::kdtree_setup()
{
KDTree_Initialize(&tree);
TBounds box;
for (uint ic=0; ic<ncells; ic++) {
box.min.x = x[ic];
box.max.x = x[ic]+dx[ic];
box.min.y = y[ic];
box.max.y = y[ic]+dy[ic];
KDTree_AddElement(&tree, &box);
}
}
// Fills x, dx, y and dy for every local cell from its logical indices and
// refinement level.
//
// ibase -- index base subtracted from i and j when have_boundary is set.
//          Without a boundary the per-level offsets lev_ibegin / lev_jbegin
//          are subtracted instead and ibase is not used.
//
// The elapsed time is added to cpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES].
void Mesh::calc_spatial_coordinates(int ibase)
{
   struct timeval tstart_cpu;
   cpu_timer_start(&tstart_cpu);

   x.resize(ncells);
   dx.resize(ncells);
   y.resize(ncells);
   dy.resize(ncells);

#ifdef _OPENMP
#pragma omp parallel
   {
#endif
      // Each thread works on the index range reported by get_bounds.
      int lowerBounds, upperBounds;
      set_bounds(ncells);
      get_bounds(lowerBounds, upperBounds);

      if (have_boundary) {
         for (uint ic = lowerBounds; ic < upperBounds; ic++) {
            int lev = level[ic];
            x[ic]  = xmin + (lev_deltax[lev] * (i[ic] - ibase));
            dx[ic] = lev_deltax[lev];
            y[ic]  = ymin + (lev_deltay[lev] * (j[ic] - ibase));
            dy[ic] = lev_deltay[lev];
         }
      } else {
         for (uint ic = lowerBounds; ic < upperBounds; ic++) {
            int lev = level[ic];
            x[ic]  = xmin + (lev_deltax[lev] * (i[ic] - lev_ibegin[lev]));
            dx[ic] = lev_deltax[lev];
            y[ic]  = ymin + (lev_deltay[lev] * (j[ic] - lev_jbegin[lev]));
            dy[ic] = lev_deltay[lev];
         }
      }
#ifdef _OPENMP
#pragma omp barrier
   } // end parallel region
#endif

   // Accumulate the timer once, after the parallel region has closed. Doing
   // it inside the region made every thread update the shared timer without
   // synchronization and counted the elapsed time once per thread.
   cpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES] += cpu_timer_stop(tstart_cpu);
}
#ifdef HAVE_OPENCL
// Runs kernel_calc_spatial_coordinates to fill the device buffers dev_x,
// dev_dx, dev_y and dev_dy, and waits for the kernel to finish.
//
// Only coded for base 0 and have boundary. The kernel is given xmin, ymin,
// the per-level spacings (dev_levdx, dev_levdy), the four output buffers,
// and the per-cell level, i and j buffers.
//
// The elapsed host time, scaled by 1.0e9, is added to
// gpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES].
void Mesh::gpu_calc_spatial_coordinates(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy)
{
   struct timeval timer_begin;
   cpu_timer_start(&timer_begin);

   cl_command_queue queue = ezcl_get_command_queue();
   cl_event kernel_done;

   // Work-group size is capped at TILE_SIZE; the global size is ncells
   // rounded up to a whole number of work-groups.
   size_t group_size = MIN(ncells, TILE_SIZE);
   size_t total_size = ((ncells + group_size - 1) /group_size) * group_size;

   // Scalars
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  0, sizeof(cl_int),    (void *)&ncells);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  1, sizeof(cl_real_t), (void *)&xmin);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  2, sizeof(cl_real_t), (void *)&ymin);
   // Per-level spacing tables
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  3, sizeof(cl_mem),    (void *)&dev_levdx);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  4, sizeof(cl_mem),    (void *)&dev_levdy);
   // Output coordinate buffers
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  5, sizeof(cl_mem),    (void *)&dev_x);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  6, sizeof(cl_mem),    (void *)&dev_dx);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  7, sizeof(cl_mem),    (void *)&dev_y);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  8, sizeof(cl_mem),    (void *)&dev_dy);
   // Per-cell level and logical indices
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  9, sizeof(cl_mem),    (void *)&dev_level);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates, 10, sizeof(cl_mem),    (void *)&dev_i);
   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates, 11, sizeof(cl_mem),    (void *)&dev_j);

   ezcl_enqueue_ndrange_kernel(queue, kernel_calc_spatial_coordinates, 1, NULL, &total_size, &group_size, &kernel_done);

   // Block until the kernel has completed so the timer covers its execution.
   ezcl_wait_for_events(1, &kernel_done);
   ezcl_event_release(kernel_done);

   gpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES] += (long)(cpu_timer_stop(timer_begin) * 1.0e9);
}
#endif
void Mesh::calc_minmax(void)
{
xmin=+1.0e30, ymin=+1.0e30, zmin=+1.0e30;
for (uint ic=0; ic<ncells; ic++){
if (x[ic] < xmin) xmin = x[ic];
}
for (uint ic=0; ic<ncells; ic++){
if (y[ic] < ymin) ymin = y[ic];
}
if (ndim > TWO_DIMENSIONAL) {
for (uint ic=0; ic<ncells; ic++){
if (z[ic] < zmin) zmin = z[ic];
}
}
xmax=-1.0e30, ymax=-1.0e30, zmax=-1.0e30;
real_t xhigh, yhigh, zhigh;
for (uint ic=0; ic<ncells; ic++){
xhigh = x[ic]+dx[ic];
if (xhigh > xmax) xmax = xhigh;
}
for (uint ic=0; ic<ncells; ic++){
yhigh = y[ic]+dy[ic];
if (yhigh > ymax) ymax = yhigh;
}
if (ndim > TWO_DIMENSIONAL) {
for (uint ic=0; ic<ncells; ic++){
zhigh = z[ic]+dz[ic];
if (zhigh > zmax) zmax = zhigh;
}
}
#ifdef HAVE_MPI
if (parallel) {
real_t xmin_global,xmax_global,ymin_global,ymax_global;
MPI_Allreduce(&xmin, &xmin_global, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&xmax, &xmax_global, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(&ymin, &ymin_global, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&ymax, &ymax_global, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
xmin = xmin_global;
xmax = xmax_global;
ymin = ymin_global;
ymax = ymax_global;
}
#endif
}
void Mesh::calc_centerminmax(void)
{
xcentermin=+1.0e30, ycentermin=+1.0e30, zcentermin=+1.0e30;
xcentermax=-1.0e30, ycentermax=-1.0e30, zcentermax=-1.0e30;
real_t xmid, ymid, zmid;
for (uint ic=0; ic<ncells; ic++){
xmid = x[ic]+0.5*dx[ic];
if (xmid < xcentermin) xcentermin = xmid;
if (xmid > xcentermax) xcentermax = xmid;
}
for (uint ic=0; ic<ncells; ic++){
ymid = y[ic]+0.5*dy[ic];
if (ymid < ycentermin) ycentermin = ymid;
if (ymid > ycentermax) ycentermax = ymid;
}
if (ndim > TWO_DIMENSIONAL) {
for (uint ic=0; ic<ncells; ic++){
zmid = z[ic]+0.5*dz[ic];
if (zmid < zcentermin) zcentermin = zmid;
if (zmid > zcentermax) zcentermax = zmid;
}
}
#ifdef HAVE_MPI
if (parallel) {
real_t xcentermin_global,xcentermax_global,ycentermin_global,ycentermax_global;
MPI_Allreduce(&xcentermin, &xcentermin_global, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&xcentermax, &xcentermax_global, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(&ycentermin, &ycentermin_global, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&ycentermax, &ycentermax_global, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
xcentermin = xcentermin_global;
xcentermax = xcentermax_global;
ycentermin = ycentermin_global;
ycentermax = ycentermax_global;
}
#endif
}
/*
 * Rebuild the cell arrays after the refinement/coarsening flags have been set.
 *
 * Parameters
 *   icount, jcount -- the local cell count changes by icount - abs(jcount).
 *   mpot           -- per-cell flag: 0 keeps the cell, > 0 refines it, < 0 marks
 *                     it for coarsening.  Negative entries are rewritten below to
 *                     -2 / -3 when have_state is set.  The vector is passed by
 *                     value, so those rewrites only affect this call's copy.
 *   have_state     -- when nonzero, the arrays registered in state_memory are
 *                     remapped onto the new cell list as well.
 *   state_memory   -- registry of state arrays; entries with element size 8 are
 *                     handled as double, element size 4 as float.
 *
 * Behaviour
 *   - do_rezone false: index is reset to the identity map, the timer is
 *     updated, and nothing else changes.
 *   - do_rezone true: i, j and level are replaced by arrays of new_ncells
 *     entries; index[ic] receives the position in the new arrays of the first
 *     cell produced from old cell ic; the cell types are recomputed; the state
 *     arrays are remapped; the neighbor arrays are either partially carried
 *     over (neighbor_remap) or deleted; and, with HAVE_MPI and `parallel`,
 *     nsizes/ndispl/noffset/ncells_global are refreshed.
 *
 * Threading
 *   The body uses orphaned "omp for", "omp master" and "omp barrier"
 *   directives, so it presumably runs inside an enclosing parallel region with
 *   every thread calling it -- TODO confirm against the callers.  The
 *   function-level `static` locals are what the master-only sections write and
 *   the other threads then read after the following barrier.
 *
 * NOTE(review): because mpot is a by-value parameter, in a threaded call each
 * thread would hold its own copy and the -2/-3 marking done in the "omp for"
 * loop would only be visible for the iterations that thread executed --
 * verify this is what the callers expect.
 * NOTE(review): ncells itself is not updated here (see the commented-out
 * "ncells = nc" near the end); presumably the caller does that -- confirm.
 */
void Mesh::rezone_all(int icount, int jcount, vector<int> mpot, int have_state, MallocPlus &state_memory)
{
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
// Fast path: no rezone requested, so the old->new map is the identity.
if (! do_rezone) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
index.clear();
index.resize(ncells);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells; ic++){
index[ic]=ic;
}
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_REZONE_ALL] += cpu_timer_stop(tstart_cpu);
} else {
// sign for jcount is different in GPU and CPU code -- abs is a quick fix
int add_ncells = icount - abs(jcount);
#ifdef _OPENMP
#pragma omp master
#endif
cpu_counters[MESH_COUNTER_REZONE]++;
// Statics: written inside master-only sections and read by all threads
// after the barrier that follows each of those sections.
static vector<int> celltype_save;
static int new_ncells;
static int *i_old, *j_old, *level_old;
static int ifirst;
static int ilast;
static int jfirst;
static int jlast;
static int level_first;
static int level_last;
static vector<int> new_ic;
#ifdef _OPENMP
#pragma omp master
{
#endif
celltype_save.resize(ncells);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
// Keep the pre-rezone cell types: celltype is recomputed before the state
// arrays are remapped, and that remap needs the old types.
if (have_state) {
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic = 0; ic < (int)ncells; ic++){
celltype_save[ic] = celltype[ic];
}
}
#ifdef _OPENMP
#pragma omp master
{
#endif
new_ncells = ncells + add_ncells;
#ifdef _OPENMP
}
#pragma omp barrier
#endif
// int ref_entry_count = 0;
// Pick, among the cells flagged for coarsening, the one that will carry the
// merged cell: -2 for a lower-left cell, -3 for an upper-right non-real cell.
if (have_state){
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells; ic++) {
// if (mpot[ic] > 0) ref_entry_count++;
if (mpot[ic] < 0) {
// Normal cell coarsening
if (is_lower_left(i[ic],j[ic]) ) mpot[ic] = -2;
// Boundary cell case
if (celltype[ic] != REAL_CELL && is_upper_right(i[ic],j[ic]) ) mpot[ic] = -3;
}
}
}
// Initialize new variables
// int *i_old, *j_old, *level_old;
int flags = RESTART_DATA;
#ifdef HAVE_J7
if (parallel) flags = LOAD_BALANCE_MEMORY;
#endif
#ifdef _OPENMP
#pragma omp master
{
#endif
// Allocate storage for the new mesh and swap it in: from here on i/j/level
// are written as the new arrays while i_old/j_old/level_old are read as the
// previous mesh.
i_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "i_old", flags);
j_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "j_old", flags);
level_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "level_old", flags);
mesh_memory.memory_swap(&i, &i_old);
mesh_memory.memory_swap(&j, &j_old);
mesh_memory.memory_swap(&level, &level_old);
index.clear();
index.resize(new_ncells);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
static vector<int> order; // Vector of refined mesh traversal order; set to -1 to indicate errors.
//
//vector<int> invorder(4, -1); // Vector mapping location from base index.
//int ref_entry = 0;
#ifdef _OPENMP
#pragma omp master
{
#endif
// Insert new cells into the mesh at the point of refinement.
order.resize(4, -1); // Vector of refined mesh traversal order; set to -1 to indicate errors.
ifirst = 0;
ilast = 0;
jfirst = 0;
jlast = 0;
level_first = 0;
level_last = 0;
if (parallel) {
#ifdef HAVE_MPI
// Exchange edge cells with the neighboring ranks: this rank's last cell
// goes to rank mype+1 and its first cell to rank mype-1; the matching
// receives fill ifirst/jfirst/level_first (from mype-1) and
// ilast/jlast/level_last (from mype+1).  The end ranks use MPI_PROC_NULL.
MPI_Request req[12];
MPI_Status status[12];
static int prev = MPI_PROC_NULL;
static int next = MPI_PROC_NULL;
if (mype != 0) prev = mype-1;
if (mype < numpe - 1) next = mype+1;
MPI_Isend(&i_old[ncells-1], 1,MPI_INT,next,1,MPI_COMM_WORLD,req+0);
MPI_Irecv(&ifirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+1);
MPI_Isend(&i_old[0], 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+2);
MPI_Irecv(&ilast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+3);
MPI_Isend(&j_old[ncells-1], 1,MPI_INT,next,1,MPI_COMM_WORLD,req+4);
MPI_Irecv(&jfirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+5);
MPI_Isend(&j_old[0], 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+6);
MPI_Irecv(&jlast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+7);
MPI_Isend(&level_old[ncells-1], 1,MPI_INT,next,1,MPI_COMM_WORLD,req+8);
MPI_Irecv(&level_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+9);
MPI_Isend(&level_old[0], 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+10);
MPI_Irecv(&level_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+11);
MPI_Waitall(12, req, status);
#endif
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
#ifdef REZONE_NO_OPTIMIZATION
// Serial variant: a single pass over the old cells with a running output
// position nc.
vector<int> invorder(4, -1); // Vector mapping location from base index.
for (int ic = 0, nc = 0; ic < (int)ncells; ic++)
{
if (mpot[ic] == 0 || mpot[ic] == -1000000)
{ // No change is needed; copy the old cell straight to the new mesh at this location.
index[ic] = nc;
i[nc] = i_old[ic];
j[nc] = j_old[ic];
level[nc] = level_old[ic];
nc++;
} // Complete no change needed.
else if (mpot[ic] < 0)
{ // Coarsening is needed; remove this cell and the other three and replace them with one.
index[ic] = nc;
if (mpot[ic] <= -2) {
//printf(" %d: DEBUG -- coarsening cell %d nc %d\n",mype,ic,nc);
i[nc] = i_old[ic]/2;
j[nc] = j_old[ic]/2;
level[nc] = level_old[ic] - 1;
nc++;
}
} // Coarsening complete.
else if (mpot[ic] > 0)
{ // Refinement is needed; insert four cells where once was one.
index[ic] = nc;
if (celltype[ic] == REAL_CELL)
{
set_refinement_order(&order[0], ic, ifirst, ilast, jfirst, jlast,
level_first, level_last, i_old, j_old, level_old);
// Create the cells in the correct order and orientation.
for (int ii = 0; ii < 4; ii++)
{ level[nc] = level_old[ic] + 1;
switch (order[ii])
{ case SW:
// lower left
invorder[SW] = ii;
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2;
nc++;
break;
case SE:
// lower right
invorder[SE] = ii;
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2;
nc++;
break;
case NW:
// upper left
invorder[NW] = ii;
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2 + 1;
nc++;
break;
case NE:
// upper right
invorder[NE] = ii;
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2 + 1;
nc++;
break; } } // Complete cell refinement.
} // Complete real cell refinement.
// Boundary cells are split into two children instead of four.
else if (celltype[ic] == LEFT_BOUNDARY) {
// lower
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
// upper
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
}
else if (celltype[ic] == RIGHT_BOUNDARY) {
// lower
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
// upper
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
}
else if (celltype[ic] == BOTTOM_BOUNDARY) {
// left
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
// right
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
}
else if (celltype[ic] == TOP_BOUNDARY) {
// right
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
// left
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
}
} // Complete refinement needed.
} // Complete addition of new cells to the mesh.
mesh_memory.memory_delete(i_old);
mesh_memory.memory_delete(j_old);
mesh_memory.memory_delete(level_old);
calc_celltype(new_ncells);
// Remap every registered state array onto the new cell list.  A snapshot of
// the registry is iterated while state_memory itself is modified.
if (have_state){
flags = RESTART_DATA;
MallocPlus state_memory_old = state_memory;
malloc_plus_memory_entry *memory_item;
for (memory_item = state_memory_old.memory_entry_by_name_begin();
memory_item != state_memory_old.memory_entry_by_name_end();
memory_item = state_memory_old.memory_entry_by_name_next() ) {
//printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
if (memory_item->mem_elsize == 8) {
double *state_temp_double = (double *)state_memory.memory_malloc(new_ncells, sizeof(double),
"state_temp_double", flags);
double *mem_ptr_double = (double *)memory_item->mem_ptr;
//ref_entry = 0;
for (int ic=0, nc=0; ic<(int)ncells; ic++) {
if (mpot[ic] == 0) {
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
} else if (mpot[ic] < 0){
// Coarsening: the merged cell gets the average of the four cells
// reached through the (old) neighbor arrays.
if (mpot[ic] == -2) {
int nr = nrht[ic];
int nt = ntop[ic];
int nrt = nrht[nt];
state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nr] +
mem_ptr_double[nt] + mem_ptr_double[nrt])*0.25;
nc++;
}
if (mpot[ic] == -3) {
int nl = nlft[ic];
int nb = nbot[ic];
int nlb = nlft[nb];
state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nl] +
mem_ptr_double[nb] + mem_ptr_double[nlb])*0.25;
nc++;
}
} else if (mpot[ic] > 0){
// Refinement: every child inherits the parent's value (two children
// for a non-real cell, four for a real cell).
// lower left
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
// lower right
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
if (celltype_save[ic] == REAL_CELL){
// upper left
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
// upper right
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
}
}
}
state_memory.memory_replace(mem_ptr_double, state_temp_double);
} else if (memory_item->mem_elsize == 4) {
float *state_temp_float = (float *)state_memory.memory_malloc(new_ncells, sizeof(float),
"state_temp_float", flags);
float *mem_ptr_float = (float *)memory_item->mem_ptr;
for (int ic=0, nc=0; ic<(int)ncells; ic++) {
if (mpot[ic] == 0) {
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
} else if (mpot[ic] < 0){
if (mpot[ic] == -2) {
int nr = nrht[ic];
int nt = ntop[ic];
int nrt = nrht[nt];
state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nr] +
mem_ptr_float[nt] + mem_ptr_float[nrt])*0.25;
nc++;
}
if (mpot[ic] == -3) {
int nl = nlft[ic];
int nb = nbot[ic];
int nlb = nlft[nb];
state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nl] +
mem_ptr_float[nb] + mem_ptr_float[nlb])*0.25;
nc++;
}
} else if (mpot[ic] > 0){
// lower left
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
// lower right
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
if (celltype_save[ic] == REAL_CELL){
// upper left
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
// upper right
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
}
}
}
state_memory.memory_replace(mem_ptr_float, state_temp_float);
}
}
}
#else
// Data parallel optimizations for thread parallel -- slows down serial
// code by about 25%
// Two-pass scheme: add_count[ic] is the number of new cells produced by old
// cell ic, scan() turns those counts into start offsets new_ic[ic]
// (presumably an exclusive prefix sum -- confirm in scan()), and the cell
// loop then writes each old cell's output at its own offset.
static vector<int> add_count;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
add_count.resize(ncells);
new_ic.resize(ncells+1);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
// NOTE(review): cells flagged -3 get add_count 0 here, yet the loops below
// still write an entry at new_ic[ic] for them (mpot <= -2 / mpot == -3), and
// the REZONE_NO_OPTIMIZATION path advances its output position for them.
// The -1000000 value accepted by that path is not handled here either.
// Confirm both are intended.
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic = 0; ic < (int)ncells; ic++){
if (mpot[ic] == 0) {
add_count[ic] = 1;
} else if (mpot[ic] < 0) {
if (mpot[ic] == -2){
add_count[ic] = 1;
} else {
add_count[ic] = 0;
}
} else if (mpot[ic] > 0) {
if (celltype[ic] != REAL_CELL) {
add_count[ic] = 2;
} else {
add_count[ic] = 4;
}
}
}
#ifdef _OPENMP
#pragma omp barrier
#endif
scan (&add_count[0], &new_ic[0], ncells);
#ifdef _OPENMP
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic = 0; ic < (int)ncells; ic++) {
vector<int> invorder(4, -1); // Vector mapping location from base index.
int nc = new_ic[ic];
if (mpot[ic] == 0)
{ // No change is needed; copy the old cell straight to the new mesh at this location.
index[ic] = nc;
i[nc] = i_old[ic];
j[nc] = j_old[ic];
level[nc] = level_old[ic];
} // Complete no change needed.
else if (mpot[ic] < 0)
{ // Coarsening is needed; remove this cell and the other three and replace them with one.
index[ic] = nc;
if (mpot[ic] <= -2) {
//printf(" %d: DEBUG -- coarsening cell %d nc %d\n",mype,ic,nc);
i[nc] = i_old[ic]/2;
j[nc] = j_old[ic]/2;
level[nc] = level_old[ic] - 1;
}
} // Coarsening complete.
else if (mpot[ic] > 0)
{ // Refinement is needed; insert four cells where once was one.
index[ic] = nc;
if (celltype[ic] == REAL_CELL)
{
// Per-iteration array; it hides the function-level static `order`.
int order[4];
set_refinement_order(&order[0], ic, ifirst, ilast, jfirst, jlast,
level_first, level_last, i_old, j_old, level_old);
// Create the cells in the correct order and orientation.
for (int ii = 0; ii < 4; ii++) {
level[nc] = level_old[ic] + 1;
switch (order[ii]) {
case SW:
// lower left
invorder[SW] = ii;
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2;
nc++;
break;
case SE:
// lower right
invorder[SE] = ii;
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2;
nc++;
break;
case NW:
// upper left
invorder[NW] = ii;
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2 + 1;
nc++;
break;
case NE:
// upper right
invorder[NE] = ii;
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2 + 1;
nc++;
break;
}
} // Complete cell refinement.
} // Complete real cell refinement.
// Boundary cells are split into two children instead of four.
else if (celltype[ic] == LEFT_BOUNDARY) {
// lower
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
// upper
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
}
else if (celltype[ic] == RIGHT_BOUNDARY) {
// lower
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
// upper
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
}
else if (celltype[ic] == BOTTOM_BOUNDARY) {
// left
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
// right
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2 + 1;
level[nc] = level_old[ic] + 1;
nc++;
}
else if (celltype[ic] == TOP_BOUNDARY) {
// right
i[nc] = i_old[ic]*2 + 1;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
// left
i[nc] = i_old[ic]*2;
j[nc] = j_old[ic]*2;
level[nc] = level_old[ic] + 1;
nc++;
}
} // Complete refinement needed.
} // Complete addition of new cells to the mesh.
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
mesh_memory.memory_delete(i_old);
mesh_memory.memory_delete(j_old);
mesh_memory.memory_delete(level_old);
#ifdef _OPENMP
} // end master region
#endif
calc_celltype_threaded(new_ncells);
// Remap every registered state array onto the new cell list.  The master
// thread owns the registry iteration (begin/end/next) and the
// allocation/replacement of each array; all threads share the copy loop.
if (have_state){
static MallocPlus state_memory_old;
static malloc_plus_memory_entry *memory_begin;
static malloc_plus_memory_entry *memory_end;
static malloc_plus_memory_entry *memory_next;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
state_memory_old = state_memory;
memory_begin = state_memory_old.memory_entry_by_name_begin();
memory_end = state_memory_old.memory_entry_by_name_end();
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
for (malloc_plus_memory_entry *memory_item = memory_begin;
memory_item != memory_end;
memory_item = memory_next ) {
//ref_entry = 0;
//printf("DEBUG -- memory_item->mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
if (memory_item->mem_elsize == 8) {
static double *state_temp_double, *mem_ptr_double;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
state_temp_double = (double *)state_memory.memory_malloc(new_ncells, sizeof(double),
"state_temp_double", flags);
mem_ptr_double = (double *)memory_item->mem_ptr;
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
//ref_entry = 0;
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic=0; ic<(int)ncells; ic++) {
int nc = new_ic[ic];
if (mpot[ic] == 0) {
state_temp_double[nc] = mem_ptr_double[ic];
} else if (mpot[ic] < 0){
// Coarsening: the merged cell gets the average of the four cells
// reached through the (old) neighbor arrays.
if (mpot[ic] == -2) {
int nr = nrht[ic];
int nt = ntop[ic];
int nrt = nrht[nt];
state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nr] +
mem_ptr_double[nt] + mem_ptr_double[nrt])*0.25;
}
if (mpot[ic] == -3) {
int nl = nlft[ic];
int nb = nbot[ic];
int nlb = nlft[nb];
state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nl] +
mem_ptr_double[nb] + mem_ptr_double[nlb])*0.25;
}
} else if (mpot[ic] > 0){
// Refinement: every child inherits the parent's value (two children
// for a non-real cell, four for a real cell).
// lower left
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
// lower right
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
if (celltype_save[ic] == REAL_CELL){
// upper left
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
// upper right
state_temp_double[nc] = mem_ptr_double[ic];
nc++;
}
}
} // end cell loop
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
state_memory.memory_replace(mem_ptr_double, state_temp_double);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
} else if (memory_item->mem_elsize == 4) {
static float *state_temp_float, *mem_ptr_float;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
state_temp_float = (float *)state_memory.memory_malloc(new_ncells, sizeof(float),
"state_temp_float", flags);
mem_ptr_float = (float *)memory_item->mem_ptr;
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic=0; ic<(int)ncells; ic++) {
int nc = new_ic[ic];
if (mpot[ic] == 0) {
state_temp_float[nc] = mem_ptr_float[ic];
} else if (mpot[ic] < 0){
if (mpot[ic] == -2) {
int nr = nrht[ic];
int nt = ntop[ic];
int nrt = nrht[nt];
state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nr] +
mem_ptr_float[nt] + mem_ptr_float[nrt])*0.25;
}
if (mpot[ic] == -3) {
int nl = nlft[ic];
int nb = nbot[ic];
int nlb = nlft[nb];
state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nl] +
mem_ptr_float[nb] + mem_ptr_float[nlb])*0.25;
}
} else if (mpot[ic] > 0){
// lower left
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
// lower right
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
if (celltype_save[ic] == REAL_CELL){
// upper left
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
// upper right
state_temp_float[nc] = mem_ptr_float[ic];
nc++;
}
}
} // end cell loop
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
state_memory.memory_replace(mem_ptr_float, state_temp_float);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
} // mem elem size 4 bytes
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
memory_next = state_memory_old.memory_entry_by_name_next();
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
} // memory item iteration
} // if have state
// End of data parallel optimizations
#endif
// Neighbor arrays: with neighbor_remap, build new arrays in which links
// between two unchanged cells are carried over through index[] and every
// other entry is -1; otherwise the neighbor arrays are simply deleted.
if (neighbor_remap) {
// This `flags` hides the outer one; the value assigned to it after the
// allocations below is not read again in this block.
int flags = 0;
static int *nlft_old, *nrht_old, *nbot_old, *ntop_old;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
nlft_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nlft_old", flags);
nrht_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nrht_old", flags);
nbot_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nbot_old", flags);
ntop_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "ntop_old", flags);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
flags = RESTART_DATA;
// Fill the fresh arrays with -1 before they are swapped in as the new
// neighbor arrays.
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic = 0; ic < new_ncells; ic++){
nlft_old[ic] = -1;
nrht_old[ic] = -1;
nbot_old[ic] = -1;
ntop_old[ic] = -1;
}
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
// After the swap nlft/nrht/nbot/ntop are the new (-1 filled) arrays and the
// *_old pointers are read as the previous neighbor data.
mesh_memory.memory_swap(&nlft, &nlft_old);
mesh_memory.memory_swap(&nrht, &nrht_old);
mesh_memory.memory_swap(&nbot, &nbot_old);
mesh_memory.memory_swap(&ntop, &ntop_old);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic = 0; ic < (int)ncells; ic++){
int nc = index[ic];
if (mpot[ic] == 0){
// Unchanged cell: keep a link only if the old neighbor is a valid local
// cell that is itself unchanged.
if (nlft_old[ic] < (int)ncells && nlft_old[ic] >= 0){
nlft[nc] = (mpot[nlft_old[ic]] == 0) ? index[nlft_old[ic]] : -1;
}
if (nrht_old[ic] < (int)ncells && nrht_old[ic] >= 0){
nrht[nc] = (mpot[nrht_old[ic]] == 0) ? index[nrht_old[ic]] : -1;
}
if (nbot_old[ic] < (int)ncells && nbot_old[ic] >= 0){
nbot[nc] = (mpot[nbot_old[ic]] == 0) ? index[nbot_old[ic]] : -1;
}
if (ntop_old[ic] < (int)ncells && ntop_old[ic] >= 0){
ntop[nc] = (mpot[ntop_old[ic]] == 0) ? index[ntop_old[ic]] : -1;
}
} else if (mpot[ic] <= -2) {
nlft[nc] = -1;
nrht[nc] = -1;
nbot[nc] = -1;
ntop[nc] = -1;
} else if (mpot[ic] > 0){
nlft[nc] = -1;
nlft[nc+1] = -1;
nrht[nc] = -1;
nrht[nc+1] = -1;
nbot[nc] = -1;
nbot[nc+1] = -1;
ntop[nc] = -1;
ntop[nc+1] = -1;
// NOTE(review): celltype is indexed with the NEW position nc here (it was
// recomputed for the new mesh above) -- confirm that is the intent.
if (celltype[nc] == REAL_CELL){
nlft[nc+2] = -1;
nlft[nc+3] = -1;
nrht[nc+2] = -1;
nrht[nc+3] = -1;
nbot[nc+2] = -1;
nbot[nc+3] = -1;
ntop[nc+2] = -1;
ntop[nc+3] = -1;
}
}
// For a refined cell, the second child (nc+1) is pointed at itself on its
// boundary side when it is a boundary cell.
// NOTE(review): only the second child is treated this way -- confirm.
if (mpot[ic] > 0){
nc++;
switch(celltype[nc]){
case LEFT_BOUNDARY:
nlft[nc] = nc;
break;
case RIGHT_BOUNDARY:
nrht[nc] = nc;
break;
case BOTTOM_BOUNDARY:
nbot[nc] = nc;
break;
case TOP_BOUNDARY:
ntop[nc] = nc;
break;
}
}
}
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
nlft_old = (int *)mesh_memory.memory_delete(nlft_old);
nrht_old = (int *)mesh_memory.memory_delete(nrht_old);
nbot_old = (int *)mesh_memory.memory_delete(nbot_old);
ntop_old = (int *)mesh_memory.memory_delete(ntop_old);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
} else {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
nlft = (int *)mesh_memory.memory_delete(nlft);
nrht = (int *)mesh_memory.memory_delete(nrht);
nbot = (int *)mesh_memory.memory_delete(nbot);
ntop = (int *)mesh_memory.memory_delete(ntop);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
}
// Final bookkeeping on the master thread: refresh the per-rank sizes and
// offsets when running under MPI, then accumulate the elapsed time.
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
//ncells = nc;
#ifdef HAVE_MPI
if (parallel) {
MPI_Allgather(&new_ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
ndispl[0]=0;
for (int ip=1; ip<numpe; ip++){
ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
}
noffset=ndispl[mype];
ncells_global = ndispl[numpe-1]+nsizes[numpe-1];
}
#endif
cpu_timers[MESH_TIMER_REZONE_ALL] += cpu_timer_stop(tstart_cpu);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
} // if do_rezone
}
#ifdef HAVE_OPENCL
void Mesh::gpu_rezone_all(int icount, int jcount, cl_mem &dev_mpot, MallocPlus &gpu_state_memory)
{
if (! gpu_do_rezone) return;
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
gpu_counters[MESH_COUNTER_REZONE]++;
cl_command_queue command_queue = ezcl_get_command_queue();
assert(dev_mpot);
assert(dev_level);
assert(dev_i);
assert(dev_j);
assert(dev_celltype);
assert(dev_ioffset);
assert(dev_levdx);
assert(dev_levdy);
int add_ncells = icount - jcount;
// int global_icount = icount;
// int global_jcount = jcount;
size_t old_ncells = ncells;
size_t new_ncells = ncells + add_ncells;
#ifdef HAVE_MPI
//int global_add_ncells = add_ncells;
// if (parallel) {
// int count[2], global_count[2];
// count[0] = icount;
// count[1] = jcount;
// MPI_Allreduce(&count, &global_count, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
// global_icount = global_count[0];
// global_jcount = global_count[1];
// //global_add_ncells = global_icount + global_jcount;
// }
#endif
int ifirst = 0;
int ilast = 0;
int jfirst = 0;
int jlast = 0;
int level_first = 0;
int level_last = 0;
#ifdef HAVE_MPI
if (numpe > 1) {
int i_tmp_first, i_tmp_last;
int j_tmp_first, j_tmp_last;
int level_tmp_first, level_tmp_last;
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, 1*sizeof(cl_int), &i_tmp_first, NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, 1*sizeof(cl_int), &j_tmp_first, NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, 1*sizeof(cl_int), &level_tmp_first, NULL);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &i_tmp_last, NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &j_tmp_last, NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &level_tmp_last, NULL);
MPI_Request req[12];
MPI_Status status[12];
static int prev = MPI_PROC_NULL;
static int next = MPI_PROC_NULL;
if (mype != 0) prev = mype-1;
if (mype < numpe - 1) next = mype+1;
MPI_Isend(&i_tmp_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+0);
MPI_Irecv(&ifirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+1);
MPI_Isend(&i_tmp_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+2);
MPI_Irecv(&ilast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+3);
MPI_Isend(&j_tmp_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+4);
MPI_Irecv(&jfirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+5);
MPI_Isend(&j_tmp_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+6);
MPI_Irecv(&jlast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+7);
MPI_Isend(&level_tmp_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+8);
MPI_Irecv(&level_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+9);
MPI_Isend(&level_tmp_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+10);
MPI_Irecv(&level_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+11);
MPI_Waitall(12, req, status);
}
#endif
/*
if (new_ncells != old_ncells){
ncells = new_ncells;
}
*/
size_t mem_request = (int)((float)new_ncells*mem_factor);
cl_mem dev_celltype_new = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_level_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_i_new = ezcl_malloc(NULL, const_cast<char *>("dev_i_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_j_new = ezcl_malloc(NULL, const_cast<char *>("dev_j_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_ijadd;
vector<int>ijadd(6);
if (numpe > 1) {
ijadd[0] = ifirst;
ijadd[1] = ilast;
ijadd[2] = jfirst;
ijadd[3] = jlast;
ijadd[4] = level_first;
ijadd[5] = level_last;
}
size_t six = 6;
dev_ijadd = ezcl_malloc(NULL, const_cast<char *>("dev_ijadd"), &six, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_enqueue_write_buffer(command_queue, dev_ijadd, CL_TRUE, 0, 6*sizeof(cl_int), (void*)&ijadd[0], NULL);
cl_mem dev_indexoffset = ezcl_malloc(NULL, const_cast<char *>("dev_indexoffset"), &old_ncells, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
int stencil = 0;
if (localStencil) stencil = 1;
size_t local_work_size = 128;
size_t global_work_size = ((old_ncells+local_work_size - 1) /local_work_size) * local_work_size;
ezcl_set_kernel_arg(kernel_rezone_all, 0, sizeof(cl_int), (void *)&old_ncells);
ezcl_set_kernel_arg(kernel_rezone_all, 1, sizeof(cl_int), (void *)&stencil);
ezcl_set_kernel_arg(kernel_rezone_all, 2, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_rezone_all, 3, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_set_kernel_arg(kernel_rezone_all, 4, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_rezone_all, 5, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_rezone_all, 6, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_rezone_all, 7, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_rezone_all, 8, sizeof(cl_mem), (void *)&dev_level_new);
ezcl_set_kernel_arg(kernel_rezone_all, 9, sizeof(cl_mem), (void *)&dev_i_new);
ezcl_set_kernel_arg(kernel_rezone_all, 10, sizeof(cl_mem), (void *)&dev_j_new);
ezcl_set_kernel_arg(kernel_rezone_all, 11, sizeof(cl_mem), (void *)&dev_celltype_new);
ezcl_set_kernel_arg(kernel_rezone_all, 12, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_rezone_all, 13, sizeof(cl_mem), (void *)&dev_indexoffset);
ezcl_set_kernel_arg(kernel_rezone_all, 14, sizeof(cl_mem), (void *)&dev_levdx);
ezcl_set_kernel_arg(kernel_rezone_all, 15, sizeof(cl_mem), (void *)&dev_levdy);
ezcl_set_kernel_arg(kernel_rezone_all, 16, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_rezone_all, 17, sizeof(cl_mem), (void *)&dev_ijadd);
ezcl_set_kernel_arg(kernel_rezone_all, 18, local_work_size * sizeof(cl_uint), NULL);
//ezcl_set_kernel_arg(kernel_rezone_all, 19, local_work_size * sizeof(cl_real4_t), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_all, 1, NULL, &global_work_size, &local_work_size, NULL);
MallocPlus gpu_state_memory_old = gpu_state_memory;
malloc_plus_memory_entry *memory_item;
for (memory_item = gpu_state_memory_old.memory_entry_by_name_begin();
memory_item != gpu_state_memory_old.memory_entry_by_name_end();
memory_item = gpu_state_memory_old.memory_entry_by_name_next() ) {
//printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
cl_mem dev_state_mem_ptr = (cl_mem)memory_item->mem_ptr;
if (memory_item->mem_elsize == 8){
#ifndef MINIMUM_PRECISION
cl_mem dev_state_var_new = (cl_mem)gpu_state_memory.memory_malloc(max(old_ncells,new_ncells), sizeof(cl_double), const_cast<char *>("dev_state_var_new"), DEVICE_REGULAR_MEMORY);
ezcl_set_kernel_arg(kernel_rezone_one_double, 0, sizeof(cl_int), (void *)&old_ncells);
ezcl_set_kernel_arg(kernel_rezone_one_double, 1, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_rezone_one_double, 2, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_rezone_one_double, 3, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_rezone_one_double, 4, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_rezone_one_double, 5, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_rezone_one_double, 6, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_rezone_one_double, 7, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_rezone_one_double, 8, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_set_kernel_arg(kernel_rezone_one_double, 9, sizeof(cl_mem), (void *)&dev_indexoffset);
ezcl_set_kernel_arg(kernel_rezone_one_double,10, sizeof(cl_mem), (void *)&dev_state_mem_ptr);
ezcl_set_kernel_arg(kernel_rezone_one_double,11, sizeof(cl_mem), (void *)&dev_state_var_new);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_one_double, 1, NULL, &global_work_size, &local_work_size, NULL);
gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
#else
printf("ERROR -- can't have double type for state variable\n");
exit(1);
#endif
} else if (memory_item->mem_elsize == 4){
cl_mem dev_state_var_new = (cl_mem)gpu_state_memory.memory_malloc(max(old_ncells,new_ncells), sizeof(cl_float), const_cast<char *>("dev_state_var_new"), DEVICE_REGULAR_MEMORY);
ezcl_set_kernel_arg(kernel_rezone_one_float, 0, sizeof(cl_int), (void *)&old_ncells);
ezcl_set_kernel_arg(kernel_rezone_one_float, 1, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_rezone_one_float, 2, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_rezone_one_float, 3, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_rezone_one_float, 4, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_rezone_one_float, 5, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_rezone_one_float, 6, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_rezone_one_float, 7, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_rezone_one_float, 8, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_set_kernel_arg(kernel_rezone_one_float, 9, sizeof(cl_mem), (void *)&dev_indexoffset);
ezcl_set_kernel_arg(kernel_rezone_one_float,10, sizeof(cl_mem), (void *)&dev_state_mem_ptr);
ezcl_set_kernel_arg(kernel_rezone_one_float,11, sizeof(cl_mem), (void *)&dev_state_var_new);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_one_float, 1, NULL, &global_work_size, &local_work_size, NULL);
gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
}
}
if (neighbor_remap & ! parallel) {
size_t mem_request = (int)((float)new_ncells*mem_factor);
cl_mem dev_nlft_new = ezcl_malloc(NULL, const_cast<char *>("dev_nlft_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nrht_new = ezcl_malloc(NULL, const_cast<char *>("dev_nrht_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nbot_new = ezcl_malloc(NULL, const_cast<char *>("dev_nbot_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_ntop_new = ezcl_malloc(NULL, const_cast<char *>("dev_ntop_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_neighbor_init, 0, sizeof(cl_int), (void *)&new_ncells);
ezcl_set_kernel_arg(kernel_neighbor_init, 1, sizeof(cl_mem), (void *)&dev_nlft_new);
ezcl_set_kernel_arg(kernel_neighbor_init, 2, sizeof(cl_mem), (void *)&dev_nrht_new);
ezcl_set_kernel_arg(kernel_neighbor_init, 3, sizeof(cl_mem), (void *)&dev_nbot_new);
ezcl_set_kernel_arg(kernel_neighbor_init, 4, sizeof(cl_mem), (void *)&dev_ntop_new);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_neighbor_init, 1, NULL, &global_work_size, &local_work_size, NULL);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 0, sizeof(cl_int), (void *)&old_ncells);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 1, sizeof(cl_mem), (void *)&dev_mpot);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 2, sizeof(cl_mem), (void *)&dev_indexoffset);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 3, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 4, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 5, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 6, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 7, sizeof(cl_mem), (void *)&dev_celltype_new);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 8, sizeof(cl_mem), (void *)&dev_nlft_new);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 9, sizeof(cl_mem), (void *)&dev_nrht_new);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 10, sizeof(cl_mem), (void *)&dev_nbot_new);
ezcl_set_kernel_arg(kernel_rezone_neighbors, 11, sizeof(cl_mem), (void *)&dev_ntop_new);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_neighbors, 1, NULL, &global_work_size, &local_work_size, NULL);
ezcl_device_memory_swap(&dev_nlft, &dev_nlft_new);
ezcl_device_memory_swap(&dev_nrht, &dev_nrht_new);
ezcl_device_memory_swap(&dev_nbot, &dev_nbot_new);
ezcl_device_memory_swap(&dev_ntop, &dev_ntop_new);
ezcl_device_memory_delete(dev_nlft_new);
ezcl_device_memory_delete(dev_nrht_new);
ezcl_device_memory_delete(dev_nbot_new);
ezcl_device_memory_delete(dev_ntop_new);
} else {
ezcl_device_memory_delete(dev_nlft);
ezcl_device_memory_delete(dev_nrht);
ezcl_device_memory_delete(dev_nbot);
ezcl_device_memory_delete(dev_ntop);
dev_nlft = NULL;
dev_nrht = NULL;
dev_nbot = NULL;
dev_ntop = NULL;
}
ezcl_device_memory_delete(dev_indexoffset);
if (new_ncells != old_ncells){
resize_old_device_memory(new_ncells);
}
ezcl_device_memory_swap(&dev_celltype, &dev_celltype_new);
ezcl_device_memory_swap(&dev_level, &dev_level_new);
ezcl_device_memory_swap(&dev_i, &dev_i_new);
ezcl_device_memory_swap(&dev_j, &dev_j_new);
ezcl_device_memory_delete(dev_mpot);
ezcl_device_memory_delete(dev_ijadd);
ezcl_device_memory_delete(dev_ioffset);
ezcl_device_memory_delete(dev_i_new);
ezcl_device_memory_delete(dev_j_new);
ezcl_device_memory_delete(dev_celltype_new);
ezcl_device_memory_delete(dev_level_new);
#ifdef HAVE_MPI
if (parallel) {
int new_ncells = ncells + add_ncells;
MPI_Allgather(&new_ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
ndispl[0]=0;
for (int ip=1; ip<numpe; ip++){
ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
}
noffset=ndispl[mype];
ncells_global = ndispl[numpe-1]+nsizes[numpe-1];
}
#endif
gpu_timers[MESH_TIMER_REZONE_ALL] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
}
#endif
/*
 * Mesh::calc_neighbors
 *
 * When do_rezone is set, makes sure the nlft/nrht/nbot/ntop arrays hold at
 * least ncells entries and fills them with the left/right/bottom/top neighbor
 * cell index of each of the first ncells cells. The search method is selected
 * by calc_neighbor_type (HASH_TABLE or KDTREE); any other value leaves the
 * arrays untouched apart from the allocation/initialisation step.
 *
 * Parameters:
 *    ncells -- number of cells to compute neighbors for.
 *
 * Side effects visible in this routine:
 *    - increments cpu_counters[MESH_COUNTER_CALC_NEIGH];
 *    - may free and reallocate nlft/nrht/nbot/ntop through mesh_memory;
 *    - sets ncells_ghost to ncells;
 *    - KDTREE path sets ibase to 0 and rebuilds/destroys the k-D tree;
 *    - accumulates timings into cpu_timers[].
 *
 * The OpenMP pragmas are orphaned directives, so the routine is presumably
 * meant to be callable from inside a parallel region by every thread of the
 * team -- NOTE(review): confirm against callers.
 */
void Mesh::calc_neighbors(int ncells)
{
   struct timeval tstart_cpu;
   cpu_timer_start(&tstart_cpu);

   if (do_rezone) {

      int flags = INDEX_ARRAY_MEMORY;

#if defined (HAVE_J7)
      if (parallel) flags |= LOAD_BALANCE_MEMORY;
#endif

      // Static so that the value written inside the master block below is
      // the one every thread tests after the barrier.
      static int nlft_size = 0;

#ifdef _OPENMP
#pragma omp master
      {
#endif
      cpu_counters[MESH_COUNTER_CALC_NEIGH]++;

      // Refresh the cached size on every call. Because nlft_size is static,
      // it must be reset to zero when the arrays are currently unallocated;
      // otherwise a stale size from an earlier call could be >= ncells, the
      // allocation below would be skipped and the loops further down would
      // dereference NULL neighbor arrays.
      if (nlft != NULL){
         nlft_size = mesh_memory.get_memory_size(nlft);
      } else {
         nlft_size = 0;
      }

      // Arrays are missing or too small: drop the old ones and allocate new.
      if (nlft_size < ncells){
         if (nlft != NULL){
            nlft = (int *)mesh_memory.memory_delete(nlft);
            nrht = (int *)mesh_memory.memory_delete(nrht);
            nbot = (int *)mesh_memory.memory_delete(nbot);
            ntop = (int *)mesh_memory.memory_delete(ntop);
         }
         nlft = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nlft", flags);
         nrht = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nrht", flags);
         nbot = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nbot", flags);
         ntop = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "ntop", flags);
      }
#ifdef _OPENMP
      }
#pragma omp barrier
#endif

      // nlft_size is deliberately NOT updated after the allocation above, so
      // this test is true exactly when the arrays were just (re)allocated.
      // In that case mark every entry as "unknown" (-1). When the arrays were
      // kept, their previous contents are left in place and only entries that
      // are -1 get recomputed by the hash path below.
      // NOTE(review): the range comes from get_bounds(); this routine does not
      // call set_bounds() itself, so it presumably relies on an earlier call.
      if (nlft_size < ncells){
         int lowerBounds, upperBounds;
         get_bounds(lowerBounds, upperBounds);
         for(int ic=lowerBounds; ic<upperBounds; ic++){
            nlft[ic] = -1;
            nrht[ic] = -1;
            nbot[ic] = -1;
            ntop[ic] = -1;
         }
      }

      if (calc_neighbor_type == HASH_TABLE) {

         struct timeval tstart_lev2;
         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);

         // Hash dimensions: (imax+1) x (jmax+1) cells scaled by IPOW2(levmx),
         // the same scaling that is applied to each cell's i/j below.
         int jmaxsize = (jmax+1)*IPOW2(levmx);
         int imaxsize = (imax+1)*IPOW2(levmx);

         int *hash;

#ifdef _OPENMP
         hash = compact_hash_init_openmp(ncells, imaxsize, jmaxsize, 0);
#else
         hash = compact_hash_init(ncells, imaxsize, jmaxsize, 0);
#endif

         // Hash setup: write a cell into the hash only if it is needed,
         // i.e. one of its own neighbors is still unknown, or one of its
         // neighbors is at a finer level and that neighbor's ntop (for
         // left/right) or nrht (for bottom/top) entry is still unknown.
#ifdef _OPENMP
#pragma omp for
#endif
         for(int ic=0; ic<ncells; ic++){
            int lev = level[ic];

            bool need_hash = (nlft[ic] == -1 || nrht[ic] == -1 || nbot[ic] == -1 || ntop[ic] == -1) ? true : false;

            if (! need_hash){
               if ( (level[nlft[ic]] > lev && ntop[nlft[ic]] == -1) ||
                    (level[nrht[ic]] > lev && ntop[nrht[ic]] == -1) ||
                    (level[nbot[ic]] > lev && nrht[nbot[ic]] == -1) ||
                    (level[ntop[ic]] > lev && nrht[ntop[ic]] == -1) ) need_hash = true;
            }

            if (need_hash) {
               int levmult = IPOW2(levmx-lev);
               int ii = i[ic]*levmult;
               int jj = j[ic]*levmult;

               write_hash(ic,jj*imaxsize+ii,hash);
            }
         }

         if (TIMING_LEVEL >= 2) {
            // The master pragma covers only the statement that follows it;
            // every thread then restarts its own local tstart_lev2.
#ifdef _OPENMP
#pragma omp master
#endif
            cpu_timers[MESH_TIMER_HASH_SETUP] += cpu_timer_stop(tstart_lev2);
            cpu_timer_start(&tstart_lev2);
         }

         //fprintf(fp,"DEBUG ncells is %lu\n",ncells);

         // Hash query: resolve every neighbor that is still negative.
#ifdef _OPENMP
#pragma omp for
#endif
         for (int ic=0; ic<(int)ncells; ic++){
            int ii = i[ic];
            int jj = j[ic];
            int lev = level[ic];
            int levmult = IPOW2(levmx-lev);

            // Hash coordinates of this cell and of the same-size cells next
            // to it, clamped to the hash extents.
            int iicur = ii*levmult;
            int iilft = max( (ii-1)*levmult, 0 );
            int iirht = min( (ii+1)*levmult, imaxsize-1);
            int jjcur = jj*levmult;
            int jjbot = max( (jj-1)*levmult, 0 );
            int jjtop = min( (jj+1)*levmult, jmaxsize-1);

            int nlftval = nlft[ic];
            int nrhtval = nrht[ic];
            int nbotval = nbot[ic];
            int ntopval = ntop[ic];

            // Taking care of boundary cells
            // Force each boundary cell to point to itself on its boundary direction
            if (nlftval < 0 && iicur < 1*IPOW2(levmx) ) nlftval = ic;
            if (nbotval < 0 && jjcur < 1*IPOW2(levmx) ) nbotval = ic;
            if (nrhtval < 0 && iicur > imax*IPOW2(levmx)-1) nrhtval = ic;
            if (ntopval < 0 && jjcur > jmax*IPOW2(levmx)-1) ntopval = ic;

            // Boundary cells next to corner boundary need special checks
            if (nlftval < 0 && iicur == 1*IPOW2(levmx) && (jjcur < 1*IPOW2(levmx) || jjcur >= jmax*IPOW2(levmx) ) ) nlftval = ic;
            if (nbotval < 0 && jjcur == 1*IPOW2(levmx) && (iicur < 1*IPOW2(levmx) || iicur >= imax*IPOW2(levmx) ) ) nbotval = ic;
            if (nrhtval < 0 && iirht == imax*IPOW2(levmx) && (jjcur < 1*IPOW2(levmx) || jjcur >= jmax*IPOW2(levmx) ) ) nrhtval = ic;
            if (ntopval < 0 && jjtop == jmax*IPOW2(levmx) && (iicur < 1*IPOW2(levmx) || iicur >= imax*IPOW2(levmx) ) ) ntopval = ic;

            // need to check for finer neighbor first
            // Right and top neighbor don't change for finer, so drop through to same size
            // Left and bottom need to be half of same size index for finer test
            if (lev != levmx) {
               int iilftfiner = iicur-(iicur-iilft)/2;
               //int iirhtfiner = (iicur+iirht)/2;
               int jjbotfiner = jjcur-(jjcur-jjbot)/2;
               //int jjtopfiner = (jjcur+jjtop)/2;
               if (nlftval < 0) nlftval = read_hash(jjcur*imaxsize+iilftfiner, hash);
               if (nbotval < 0) nbotval = read_hash(jjbotfiner*imaxsize+iicur, hash);
            }

            // same size neighbor
            if (nlftval < 0) nlftval = read_hash(jjcur*imaxsize+iilft, hash);
            if (nrhtval < 0) nrhtval = read_hash(jjcur*imaxsize+iirht, hash);
            if (nbotval < 0) nbotval = read_hash(jjbot*imaxsize+iicur, hash);
            if (ntopval < 0) ntopval = read_hash(jjtop*imaxsize+iicur, hash);

            // Now we need to take care of special case where bottom and left boundary need adjustment since
            // expected cell doesn't exist on these boundaries if it is finer than current cell
            if (lev != levmx) {
               if (jjcur < 1*IPOW2(levmx)) {
                  if (nrhtval < 0) {
                     int jjtopfiner = (jjcur+jjtop)/2;
                     nrhtval = read_hash(jjtopfiner*imaxsize+iirht, hash);
                  }
                  if (nlftval < 0) {
                     int iilftfiner = iicur-(iicur-iilft)/2;
                     int jjtopfiner = (jjcur+jjtop)/2;
                     nlftval = read_hash(jjtopfiner*imaxsize+iilftfiner, hash);
                  }
               }

               if (iicur < 1*IPOW2(levmx)) {
                  if (ntopval < 0) {
                     int iirhtfiner = (iicur+iirht)/2;
                     ntopval = read_hash(jjtop*imaxsize+iirhtfiner, hash);
                  }
                  if (nbotval < 0) {
                     int iirhtfiner = (iicur+iirht)/2;
                     int jjbotfiner = jjcur-(jjcur-jjbot)/2;
                     nbotval = read_hash(jjbotfiner*imaxsize+iirhtfiner, hash);
                  }
               }
            }

            // coarser neighbor
            // Left/bottom probes step out by one more cell width; the other
            // coordinate is rounded down to an even multiple of the cell size
            // via (x/2)*2*levmult.
            if (lev != 0){
               if (nlftval < 0) {
                  iilft -= iicur-iilft;
                  int jjlft = (jj/2)*2*levmult;
                  nlftval = read_hash(jjlft*imaxsize+iilft, hash);
               }
               if (nrhtval < 0) {
                  int jjrht = (jj/2)*2*levmult;
                  nrhtval = read_hash(jjrht*imaxsize+iirht, hash);
               }
               if (nbotval < 0) {
                  jjbot -= jjcur-jjbot;
                  int iibot = (ii/2)*2*levmult;
                  nbotval = read_hash(jjbot*imaxsize+iibot, hash);
               }
               if (ntopval < 0) {
                  int iitop = (ii/2)*2*levmult;
                  ntopval = read_hash(jjtop*imaxsize+iitop, hash);
               }
            }

            nlft[ic] = nlftval;
            nrht[ic] = nrhtval;
            nbot[ic] = nbotval;
            ntop[ic] = ntopval;

            //printf("neighbors[%d] = %d %d %d %d\n",ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
         }

         // Reporting, hash teardown and query timing are done once, by the
         // master thread, after all threads have finished querying.
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
         {
#endif
         write_hash_collision_report();
         read_hash_collision_report();

         compact_hash_delete(hash);

         if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_HASH_QUERY] += cpu_timer_stop(tstart_lev2);
#ifdef _OPENMP
         } // master block
#endif

      } else if (calc_neighbor_type == KDTREE) {

         struct timeval tstart_lev2;
         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);

         // The whole k-D tree path runs on the master thread only.
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
         {
#endif
         TBounds box;
         // NOTE(review): the result buffer is sized IPOW2(levmx*levmx);
         // confirm this is the intended upper bound on hits per query.
         vector<int> index_list(IPOW2(levmx*levmx) );

         int num;

         ibase = 0;
         calc_spatial_coordinates(ibase);

         kdtree_setup();

         if (TIMING_LEVEL >= 2) {
            cpu_timers[MESH_TIMER_KDTREE_SETUP] += cpu_timer_stop(tstart_lev2);
            cpu_timer_start(&tstart_lev2);
         }

         // For each direction the query box is collapsed to a single point
         // (min == max) offset from (x[ic], y[ic]) by fractions of dx/dy.
         // The neighbor defaults to the cell itself and is replaced only
         // when the query returns exactly one hit.
         for (int ic=0; ic<ncells; ic++) {

            //left
            nlft[ic] = ic;
            box.min.x = x[ic]-0.25*dx[ic];
            box.max.x = x[ic]-0.25*dx[ic];
            box.min.y = y[ic]+0.25*dy[ic];
            box.max.y = y[ic]+0.25*dy[ic];
            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
            if (num == 1) nlft[ic]=index_list[0];

            //right
            nrht[ic] = ic;
            box.min.x = x[ic]+1.25*dx[ic];
            box.max.x = x[ic]+1.25*dx[ic];
            box.min.y = y[ic]+0.25*dy[ic];
            box.max.y = y[ic]+0.25*dy[ic];
            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
            if (num == 1) nrht[ic]=index_list[0];

            //bot
            nbot[ic] = ic;
            box.min.x = x[ic]+0.25*dx[ic];
            box.max.x = x[ic]+0.25*dx[ic];
            box.min.y = y[ic]-0.25*dy[ic];
            box.max.y = y[ic]-0.25*dy[ic];
            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
            if (num == 1) nbot[ic]=index_list[0];

            //top
            ntop[ic] = ic;
            box.min.x = x[ic]+0.25*dx[ic];
            box.max.x = x[ic]+0.25*dx[ic];
            box.min.y = y[ic]+1.25*dy[ic];
            box.max.y = y[ic]+1.25*dy[ic];
            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
            if (num == 1) ntop[ic]=index_list[0];
         } // End main loop over cells.

         KDTree_Destroy(&tree);

         if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_KDTREE_QUERY] += cpu_timer_stop(tstart_lev2);

#ifdef _OPENMP
         }
#pragma omp barrier
#endif
      } // calc_neighbor_type

      // Only the master thread writes the shared ncells_ghost member.
#ifdef _OPENMP
#pragma omp master
#endif
      ncells_ghost = ncells;

   }

   // Total time is accumulated by the master thread, whether or not
   // do_rezone was set.
#ifdef _OPENMP
#pragma omp master
#endif
   cpu_timers[MESH_TIMER_CALC_NEIGHBORS] += cpu_timer_stop(tstart_cpu);
}
void Mesh::calc_neighbors_local(void)
{
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
if (do_rezone) {
int flags = INDEX_ARRAY_MEMORY;
#if defined (HAVE_J7)
if (parallel) flags |= LOAD_BALANCE_MEMORY;
#endif
#ifdef _OPENMP
#pragma omp master
{
#endif
cpu_counters[MESH_COUNTER_CALC_NEIGH]++;
if (mesh_memory.get_memory_size(nlft) < ncells){
if (nlft != NULL) nlft = (int *)mesh_memory.memory_delete(nlft);
if (nrht != NULL) nrht = (int *)mesh_memory.memory_delete(nrht);
if (nbot != NULL) nbot = (int *)mesh_memory.memory_delete(nbot);
if (ntop != NULL) ntop = (int *)mesh_memory.memory_delete(ntop);
nlft = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nlft", flags);
nrht = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nrht", flags);
nbot = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nbot", flags);
ntop = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "ntop", flags);
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
int lowerBound, upperBound;
set_bounds(ncells);
get_bounds(lowerBound, upperBound);
for (int ic = lowerBound; ic < upperBound; ic++){
nlft[ic] = -98;
nrht[ic] = -98;
nbot[ic] = -98;
ntop[ic] = -98;
}
if (calc_neighbor_type == HASH_TABLE) {
struct timeval tstart_lev2;
if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
ncells_ghost = ncells;
// Find maximum i column and j row for this processor
static int jmintile, imintile, jmaxtile, imaxtile;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
jmintile = (jmax+1)*IPOW2(levmx);
imintile = (imax+1)*IPOW2(levmx);
jmaxtile = 0;
imaxtile = 0;
#ifdef _OPENMP
}
#pragma omp barrier
#endif
int my_jmintile = jmintile;
int my_imintile = imintile;
int my_jmaxtile = 0;
int my_imaxtile = 0;
#ifdef _OPENMP
#pragma omp for
#endif
for(uint ic=0; ic<ncells; ic++){
int lev = level[ic];
// if (lev < 0 || lev > levmx) printf("DEBUG -- cell %d lev %d\n",ic,level[ic]);
if ( j[ic] *IPOW2(levmx-lev) < my_jmintile) my_jmintile = j[ic] *IPOW2(levmx-lev) ;
if ((j[ic]+1)*IPOW2(levmx-lev)-1 > my_jmaxtile) my_jmaxtile = (j[ic]+1)*IPOW2(levmx-lev)-1;
if ( i[ic] *IPOW2(levmx-lev) < my_imintile) my_imintile = i[ic] *IPOW2(levmx-lev) ;
if ((i[ic]+1)*IPOW2(levmx-lev)-1 > my_imaxtile) my_imaxtile = (i[ic]+1)*IPOW2(levmx-lev)-1;
}
#ifdef _OPENMP
#pragma omp critical
{
#endif
if (my_jmintile < jmintile) jmintile = my_jmintile;
if (my_imintile < imintile) imintile = my_imintile;
if (my_jmaxtile > jmaxtile) jmaxtile = my_jmaxtile;
if (my_imaxtile > imaxtile) imaxtile = my_imaxtile;
#ifdef _OPENMP
} // end critical region
#pragma omp barrier
#endif
//if (DEBUG) fprintf(fp,"%d: Tile Sizes are imin %d imax %d jmin %d jmax %d\n",mype,imintile,imaxtile,jmintile,jmaxtile);
// Expand size by 2*coarse_cells for ghost cells
int jminsize = max(jmintile-2*IPOW2(levmx),0);
int jmaxsize = min(jmaxtile+2*IPOW2(levmx),(jmax+1)*IPOW2(levmx));
int iminsize = max(imintile-2*IPOW2(levmx),0);
int imaxsize = min(imaxtile+2*IPOW2(levmx),(imax+1)*IPOW2(levmx));
//if (DEBUG) fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
//fprintf(fp,"DEBUG -- ncells %lu\n",ncells);
static int *hash;
#ifdef _OPENMP
hash = compact_hash_init_openmp(ncells, imaxsize-iminsize, jmaxsize-jminsize, 0);
#else
hash = compact_hash_init(ncells, imaxsize-iminsize, jmaxsize-jminsize, 0);
#endif
//printf("%d: DEBUG -- noffset %d cells %d\n",mype,noffset,ncells);
if (DEBUG) {
#ifdef _OPENMP
#pragma omp master
#endif
fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
}
static int imaxcalc, jmaxcalc;
#ifdef _OPENMP
#pragma omp for
#endif
for(uint ic=0; ic<ncells; ic++){
int cellnumber = ic+noffset;
int lev = level[ic];
int levmult = IPOW2(levmx-lev);
int ii = i[ic]*levmult-iminsize;
int jj = j[ic]*levmult-jminsize;
write_hash(cellnumber, jj*(imaxsize-iminsize)+ii, hash);
} // end for loop
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_HASH_SETUP] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
#ifdef _OPENMP
#pragma omp master
{
#endif
// Set neighbors to global cell numbers from hash
jmaxcalc = (jmax+1)*IPOW2(levmx);
imaxcalc = (imax+1)*IPOW2(levmx);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells; ic++){
int ii = i[ic];
int jj = j[ic];
int lev = level[ic];
int levmult = IPOW2(levmx-lev);
int iicur = ii*levmult-iminsize;
int iilft = max( (ii-1)*levmult, 0 )-iminsize;
int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
int jjcur = jj*levmult-jminsize;
int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
int nlftval = -1;
int nrhtval = -1;
int nbotval = -1;
int ntopval = -1;
// Taking care of boundary cells
// Force each boundary cell to point to itself on its boundary direction
if (iicur < 1*IPOW2(levmx) -iminsize) nlftval = ic+noffset;
if (jjcur < 1*IPOW2(levmx) -jminsize) nbotval = ic+noffset;
if (iicur > imax*IPOW2(levmx)-1-iminsize) nrhtval = ic+noffset;
if (jjcur > jmax*IPOW2(levmx)-1-jminsize) ntopval = ic+noffset;
// Boundary cells next to corner boundary need special checks
if (iicur == 1*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nlftval = ic+noffset;
if (jjcur == 1*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) nbotval = ic+noffset;
if (iirht == imax*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nrhtval = ic+noffset;
if (jjtop == jmax*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) ntopval = ic+noffset;
// need to check for finer neighbor first
// Right and top neighbor don't change for finer, so drop through to same size
// Left and bottom need to be half of same size index for finer test
if (lev != levmx) {
int iilftfiner = iicur-(iicur-iilft)/2;
int jjbotfiner = jjcur-(jjcur-jjbot)/2;
if (nlftval < 0) nlftval = read_hash(jjcur *(imaxsize-iminsize)+iilftfiner, hash);
if (nbotval < 0) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
}
// same size neighbor
if (nlftval < 0) {
int nlfttry = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
if (nlfttry >= 0 && nlfttry < (int)ncells && level[nlfttry] == lev) nlftval = nlfttry;
}
if (nrhtval < 0) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
if (nbotval < 0) {
int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
if (nbottry >= 0 && nbottry < (int)ncells && level[nbottry] == lev) nbotval = nbottry;
}
if (ntopval < 0) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
// Now we need to take care of special case where bottom and left boundary need adjustment since
// expected cell doesn't exist on these boundaries if it is finer than current cell
if (lev != levmx) {
if (jjcur < 1*IPOW2(levmx)) {
if (nrhtval < 0) {
int jjtopfiner = (jjcur+jjtop)/2;
nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
}
if (nlftval < 0) {
int iilftfiner = iicur-(iicur-iilft)/2;
int jjtopfiner = (jjcur+jjtop)/2;
nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
}
}
if (iicur < 1*IPOW2(levmx)) {
if (ntopval < 0) {
int iirhtfiner = (iicur+iirht)/2;
ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
}
if (nbotval < 0) {
int iirhtfiner = (iicur+iirht)/2;
int jjbotfiner = jjcur-(jjcur-jjbot)/2;
nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
}
}
}
// coarser neighbor
if (lev != 0){
if (nlftval < 0) {
iilft -= iicur-iilft;
int jjlft = (jj/2)*2*levmult-jminsize;
int nlfttry = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
if (nlfttry >= 0 && nlfttry < (int)ncells && level[nlfttry] == lev-1) nlftval = nlfttry;
}
if (nrhtval < 0) {
int jjrht = (jj/2)*2*levmult-jminsize;
int nrhttry = read_hash(jjrht*(imaxsize-iminsize)+iirht, hash);
if (nrhttry >= 0 && nrhttry < (int)ncells && level[nrhttry] == lev-1) nrhtval = nrhttry;
}
if (nbotval < 0) {
jjbot -= jjcur-jjbot;
int iibot = (ii/2)*2*levmult-iminsize;
int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
if (nbottry >= 0 && nbottry < (int)ncells && level[nbottry] == lev-1) nbotval = nbottry;
}
if (ntopval < 0) {
int iitop = (ii/2)*2*levmult-iminsize;
int ntoptry = read_hash(jjtop*(imaxsize-iminsize)+iitop, hash);
if (ntoptry >= 0 && ntoptry < (int)ncells && level[ntoptry] == lev-1) ntopval = ntoptry;
}
}
nlft[ic] = nlftval;
nrht[ic] = nrhtval;
nbot[ic] = nbotval;
ntop[ic] = ntopval;
//fprintf(fp,"%d: neighbors[%d] = %d %d %d %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
print_local();
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH 0 numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash));
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nlft numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nlft[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nrht numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nrht[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nbot numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nbot[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n ntop numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",ntop[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
#ifdef _OPENMP
}
#pragma omp barrier
#endif
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_HASH_QUERY] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
#ifdef HAVE_MPI
if (numpe > 1) {
static int num_comm_partners;
static vector<int> iminsize_global;
static vector<int> imaxsize_global;
static vector<int> jminsize_global;
static vector<int> jmaxsize_global;
static vector<int> comm_partner;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
iminsize_global.resize(numpe);
imaxsize_global.resize(numpe);
jminsize_global.resize(numpe);
jmaxsize_global.resize(numpe);
comm_partner.resize(numpe,-1);
MPI_Allgather(&iminsize, 1, MPI_INT, &iminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&imaxsize, 1, MPI_INT, &imaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&jminsize, 1, MPI_INT, &jminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&jmaxsize, 1, MPI_INT, &jmaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
num_comm_partners = 0;
for (int ip = 0; ip < numpe; ip++){
if (ip == mype) continue;
if (iminsize_global[ip] > imaxtile) continue;
if (imaxsize_global[ip] < imintile) continue;
if (jminsize_global[ip] > jmaxtile) continue;
if (jmaxsize_global[ip] < jmintile) continue;
comm_partner[num_comm_partners] = ip;
num_comm_partners++;
//if (DEBUG) fprintf(fp,"%d: overlap with processor %d bounding box is %d %d %d %d\n",mype,ip,iminsize_global[ip],imaxsize_global[ip],jminsize_global[ip],jmaxsize_global[ip]);
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
static vector<int> border_cell;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
border_cell.resize(ncells);
#ifdef BOUNDS_CHECK
for (uint ic=0; ic<ncells; ic++){
int nl = nlft[ic];
if (nl != -1){
nl -= noffset;
if (nl<0 || nl>= (int)ncells) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
}
int nr = nrht[ic];
if (nr != -1){
nr -= noffset;
if (nr<0 || nr>= (int)ncells) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
}
int nb = nbot[ic];
if (nb != -1){
nb -= noffset;
if (nb<0 || nb>= (int)ncells) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
}
int nt = ntop[ic];
if (nt != -1){
nt -= noffset;
if (nt<0 || nt>= (int)ncells) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
}
}
#endif
#ifdef _OPENMP
}
#pragma omp barrier
#endif
static vector<int> border_cell_out;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
border_cell_out.resize(ncells);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells; ic++){
int iborder_cell = 0;
// left neighbor is undefined -- or -- if left is at finer level check left top for undefined
if (nlft[ic] == -1 || (level[nlft[ic]-noffset] > level[ic] && ntop[nlft[ic]-noffset] == -1) ){
iborder_cell |= 0x0001;
}
if (nrht[ic] == -1 || (level[nrht[ic]-noffset] > level[ic] && ntop[nrht[ic]-noffset] == -1) ){
iborder_cell |= 0x0002;
}
if (nbot[ic] == -1 || (level[nbot[ic]-noffset] > level[ic] && nrht[nbot[ic]-noffset] == -1) ) {
iborder_cell |= 0x0004;
}
if (ntop[ic] == -1 || (level[ntop[ic]-noffset] > level[ic] && nrht[ntop[ic]-noffset] == -1) ) {
iborder_cell |= 0x0008;
}
border_cell[ic] = iborder_cell;
}
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells; ic++){
int iborder_cell = border_cell[ic];
if (iborder_cell == 0) {
int nl = nlft[ic]-noffset;
if (nl >= 0 && nl < (int)ncells) {
if ((border_cell[nl] & 0x0001) == 0x0001) {
iborder_cell |= 0x0016;
} else if (level[nl] > level[ic]){
int ntl = ntop[nl]-noffset;
if (ntl >= 0 && ntl < (int)ncells && (border_cell[ntl] & 0x0001) == 0x0001) {
iborder_cell |= 0x0016;
}
}
}
int nr = nrht[ic]-noffset;
if (nr >= 0 && nr < (int)ncells) {
if ((border_cell[nrht[ic]-noffset] & 0x0002) == 0x0002) {
iborder_cell |= 0x0032;
} else if (level[nr] > level[ic]){
int ntr = ntop[nr]-noffset;
if (ntr >= 0 && ntr < (int)ncells && (border_cell[ntr] & 0x0002) == 0x0002) {
iborder_cell |= 0x0032;
}
}
}
int nb = nbot[ic]-noffset;
if (nb >= 0 && nb < (int)ncells) {
if ((border_cell[nb] & 0x0004) == 0x0004) {
iborder_cell |= 0x0064;
} else if (level[nb] > level[ic]){
int nrb = nrht[nb]-noffset;
if (nrb >= 0 && nrb < (int)ncells && (border_cell[nrb] & 0x0004) == 0x0004) {
iborder_cell |= 0x0064;
}
}
}
int nt = ntop[ic]-noffset;
if (nt >= 0 && nt < (int)ncells) {
if ((border_cell[nt] & 0x0008) == 0x0008) {
iborder_cell |= 0x0128;
} else if (level[nt] > level[ic]){
int nrt = nrht[nt]-noffset;
if (nrt >= 0 && nrt < (int)ncells && (border_cell[nrt] & 0x0008) == 0x0008) {
iborder_cell |= 0x0128;
}
}
}
}
border_cell_out[ic] = iborder_cell;
}
// indent offset
vector<int> border_cell_num;
static int nbsize_local;
static vector<int> border_cell_i;
static vector<int> border_cell_j;
static vector<int> border_cell_level;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
for (int ic=0; ic<(int)ncells; ic++){
if (border_cell_out[ic] > 0) border_cell_num.push_back(ic+noffset);
}
//printf("%d: border cell size is %d\n",mype,border_cell_num.size());
nbsize_local = border_cell_num.size();
border_cell_i.resize(nbsize_local);
border_cell_j.resize(nbsize_local);
border_cell_level.resize(nbsize_local);
for (int ic = 0; ic <nbsize_local; ic++){
int cell_num = border_cell_num[ic]-noffset;
border_cell_i[ic] = i[cell_num];
border_cell_j[ic] = j[cell_num];
border_cell_level[ic] = level[cell_num];
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
fprintf(fp,"%d: Border cell size is %d\n",mype,nbsize_local);
for (int ib = 0; ib <nbsize_local; ib++){
fprintf(fp,"%d: Border cell %d is %d i %d j %d level %d\n",mype,ib,border_cell_num[ib],
border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_FIND_BOUNDARY] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
// Allocate push database
static int **send_database;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
send_database = (int**)malloc(num_comm_partners*sizeof(int *));
for (int ip = 0; ip < num_comm_partners; ip++){
send_database[ip] = (int *)malloc(nbsize_local*sizeof(int));
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
// Compute the overlap between processor bounding boxes and set up push database
static vector<int> send_buffer_count;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
send_buffer_count.resize(num_comm_partners);
for (int ip = 0; ip < num_comm_partners; ip++){
int icount = 0;
for (int ib = 0; ib <nbsize_local; ib++){
int lev = border_cell_level[ib];
int levmult = IPOW2(levmx-lev);
if (border_cell_i[ib]*levmult >= iminsize_global[comm_partner[ip]] &&
border_cell_i[ib]*levmult <= imaxsize_global[comm_partner[ip]] &&
border_cell_j[ib]*levmult >= jminsize_global[comm_partner[ip]] &&
border_cell_j[ib]*levmult <= jmaxsize_global[comm_partner[ip]] ) {
// border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
send_database[ip][icount] = ib;
icount++;
}
}
send_buffer_count[ip]=icount;
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
// Initialize L7_Push_Setup with num_comm_partners, comm_partner, send_database and
// send_buffer_count. L7_Push_Setup will copy data and determine recv_buffer_counts.
// It will return receive_count_total for use in allocations
static int receive_count_total;
int i_push_handle = 0;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
i_push_handle = 0;
L7_Push_Setup(num_comm_partners, &comm_partner[0], &send_buffer_count[0],
send_database, &receive_count_total, &i_push_handle);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
fprintf(fp,"DEBUG num_comm_partners %d\n",num_comm_partners);
for (int ip = 0; ip < num_comm_partners; ip++){
fprintf(fp,"DEBUG comm partner is %d data count is %d\n",comm_partner[ip],send_buffer_count[ip]);
for (int ic = 0; ic < send_buffer_count[ip]; ic++){
int ib = send_database[ip][ic];
fprintf(fp,"DEBUG \t index %d cell number %d i %d j %d level %d\n",ib,border_cell_num[ib],
border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
}
}
#ifdef _OPENMP
}
#endif
}
// Can now free the send database. Other arrays are vectors and will automatically
// deallocate
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
for (int ip = 0; ip < num_comm_partners; ip++){
free(send_database[ip]);
}
free(send_database);
#ifdef _OPENMP
}
#endif
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_PUSH_SETUP] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
// Push the data needed to the adjacent processors
static int *border_cell_num_local;
static int *border_cell_i_local;
static int *border_cell_j_local;
static int *border_cell_level_local;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
border_cell_num_local = (int *)malloc(receive_count_total*sizeof(int));
border_cell_i_local = (int *)malloc(receive_count_total*sizeof(int));
border_cell_j_local = (int *)malloc(receive_count_total*sizeof(int));
border_cell_level_local = (int *)malloc(receive_count_total*sizeof(int));
L7_Push_Update(&border_cell_num[0], border_cell_num_local, i_push_handle);
L7_Push_Update(&border_cell_i[0], border_cell_i_local, i_push_handle);
L7_Push_Update(&border_cell_j[0], border_cell_j_local, i_push_handle);
L7_Push_Update(&border_cell_level[0], border_cell_level_local, i_push_handle);
L7_Push_Free(&i_push_handle);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
nbsize_local = receive_count_total;
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
for (int ic = 0; ic < nbsize_local; ic++) {
fprintf(fp,"%d: Local Border cell %d is %d i %d j %d level %d\n",mype,ic,border_cell_num_local[ic],
border_cell_i_local[ic],border_cell_j_local[ic],border_cell_level_local[ic]);
}
#ifdef _OPENMP
}
#endif
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_PUSH_BOUNDARY] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_LOCAL_LIST] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering before layer 1\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash));
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
#ifdef _OPENMP
}
#endif
}
vector<int> border_cell_needed_local;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
border_cell_needed_local.resize(nbsize_local, 0);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
// Layer 1
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
for (int ic =0; ic<nbsize_local; ic++){
int jj = border_cell_j_local[ic];
int ii = border_cell_i_local[ic];
int lev = border_cell_level_local[ic];
int levmult = IPOW2(levmx-lev);
int iicur = ii*levmult-iminsize;
int iilft = max( (ii-1)*levmult, 0 )-iminsize;
int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
int jjcur = jj*levmult-jminsize;
int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
//fprintf(fp,"DEBUG layer ic %d num %d i %d j %d lev %d\n",ic,border_cell_num_local[ic],ii,jj,lev);
int iborder = 0;
// Test for cell to left
if (iicur-(iicur-iilft)/2 >= 0 && iicur-(iicur-iilft)/2 < imaxsize-iminsize && jjcur >= 0 && (jjcur+jjtop)/2 < jmaxsize-jminsize){
int nlftval = -1;
// Check for finer cell left and bottom side
if (lev != levmx){ // finer neighbor
int iilftfiner = iicur-(iicur-iilft)/2;
nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
// Also check for finer cell left and top side
if (nlftval < 0) {
int jjtopfiner = (jjcur+jjtop)/2;
nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
}
}
if (nlftval < 0 && iilft >= 0) { // same size
int nlfttry = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
// we have to test for same level or it could be a finer cell one cell away that it is matching
if (nlfttry-noffset >= 0 && nlfttry-noffset < (int)ncells && level[nlfttry-noffset] == lev) {
nlftval = nlfttry;
}
}
if (lev != 0 && nlftval < 0 && iilft-(iicur-iilft) >= 0){ // coarser neighbor
iilft -= iicur-iilft;
int jjlft = (jj/2)*2*levmult-jminsize;
int nlfttry = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
// we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
if (nlfttry-noffset >= 0 && nlfttry-noffset < (int)ncells && level[nlfttry-noffset] == lev-1) {
nlftval = nlfttry;
}
}
if (nlftval >= 0) iborder |= 0x0001;
}
// Test for cell to right
// Sets bit 0x0002 in iborder when the hash holds a cell at the right-neighbor
// location of this received border cell (finer, same size or coarser).
if (iirht < imaxsize-iminsize && iirht >= 0 && jjcur >= 0 && jjtop < jmaxsize-jminsize) {
   int nrhtval = -1;
   // right neighbor -- finer, same size and coarser
   nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
   // right neighbor -- finer right top test
   if (nrhtval < 0 && lev != levmx){
      int jjtopfiner = (jjcur+jjtop)/2;
      nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
   }
   if (nrhtval < 0 && lev != 0) { // test for coarser, but not directly to the right
      int jjrhtcoarser = (jj/2)*2*levmult-jminsize;
      if (jjrhtcoarser != jjcur) {
         int nrhttry = read_hash(jjrhtcoarser*(imaxsize-iminsize)+iirht, hash);
         // only accept the hit if it is a local cell exactly one level coarser
         if (nrhttry-noffset >= 0 && nrhttry-noffset < (int)ncells && level[nrhttry-noffset] == lev-1) {
            nrhtval = nrhttry;
         }
      }
   }
   // The left and bottom tests treat a hash value of 0 as a valid cell (>= 0);
   // use the same test here so that cell number 0 is not missed as a right neighbor.
   if (nrhtval >= 0) iborder |= 0x0002;
}
// Test for cell to bottom
if (iicur >= 0 && (iicur+iirht)/2 < imaxsize-iminsize && jjcur-(jjcur-jjbot)/2 >= 0 && jjcur-(jjcur-jjbot)/2 < jmaxsize-jminsize){
int nbotval = -1;
// Check for finer cell below and left side
if (lev != levmx){ // finer neighbor
int jjbotfiner = jjcur-(jjcur-jjbot)/2;
nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
// Also check for finer cell below and right side
if (nbotval < 0) {
int iirhtfiner = (iicur+iirht)/2;
nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
}
}
if (nbotval < 0 && jjbot >= 0) { // same size
int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
// we have to test for same level or it could be a finer cell one cell away that it is matching
if (nbottry-noffset >= 0 && nbottry-noffset < (int)ncells && level[nbottry-noffset] == lev) {
nbotval = nbottry;
}
}
if (lev != 0 && nbotval < 0 && jjbot-(jjcur-jjbot) >= 0){ // coarser neighbor
jjbot -= jjcur-jjbot;
int iibot = (ii/2)*2*levmult-iminsize;
int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
// we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
if (nbottry-noffset >= 0 && nbottry-noffset < (int)ncells && level[nbottry-noffset] == lev-1) {
nbotval = nbottry;
}
}
if (nbotval >= 0) iborder |= 0x0004;
}
// Test for cell to top
// Sets bit 0x0008 in iborder when the hash holds a cell at the top-neighbor
// location of this received border cell (finer, same size or coarser).
if (iirht < imaxsize-iminsize && iicur >= 0 && jjtop >= 0 && jjtop < jmaxsize-jminsize) {
   int ntopval = -1;
   // top neighbor -- finer, same size and coarser
   ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
   // top neighbor -- finer top right test
   if (ntopval < 0 && lev != levmx){
      int iirhtfiner = (iicur+iirht)/2;
      ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
   }
   if (ntopval < 0 && lev != 0) { // test for coarser, but not directly above
      int iitopcoarser = (ii/2)*2*levmult-iminsize;
      if (iitopcoarser != iicur) {
         int ntoptry = read_hash(jjtop*(imaxsize-iminsize)+iitopcoarser, hash);
         // only accept the hit if it is a local cell exactly one level coarser
         if (ntoptry-noffset >= 0 && ntoptry-noffset < (int)ncells && level[ntoptry-noffset] == lev-1) {
            ntopval = ntoptry;
         }
      }
   }
   // The left and bottom tests treat a hash value of 0 as a valid cell (>= 0);
   // use the same test here so that cell number 0 is not missed as a top neighbor.
   if (ntopval >= 0) iborder |= 0x0008;
}
if (iborder) border_cell_needed_local[ic] = iborder;
}
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] == 0) continue;
fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
}
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
}
// Walk through cell array and set hash to border local index plus ncells+noffset for next pass
//fprintf(fp,"%d: DEBUG new hash jminsize %d jmaxsize %d iminsize %d imaxsize %d\n",mype,jminsize,jmaxsize,iminsize,imaxsize);
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] == 0) continue;
//fprintf(fp,"%d: index %d cell %d i %d j %d\n",mype,ic,border_cell_num_local[ic],border_cell_i_local[ic],border_cell_j_local[ic]);
int lev = border_cell_level_local[ic];
int levmult = IPOW2(levmx-lev);
int ii = border_cell_i_local[ic]*levmult-iminsize;
int jj = border_cell_j_local[ic]*levmult-jminsize;
write_hash(ncells+noffset+ic, jj*(imaxsize-iminsize)+ii, hash);
}
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_LAYER1] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
print_local();
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering for 1 layer\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
}
// Layer 2
#ifdef _OPENMP
#pragma omp master
{
#endif
for (int ic =0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] > 0) continue;
int jj = border_cell_j_local[ic];
int ii = border_cell_i_local[ic];
int lev = border_cell_level_local[ic];
int levmult = IPOW2(levmx-lev);
int iicur = ii*levmult-iminsize;
int iilft = max( (ii-1)*levmult, 0 )-iminsize;
int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
int jjcur = jj*levmult-jminsize;
int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
//fprintf(fp," DEBUG layer2 ic %d num %d i %d j %d lev %d\n",ic,border_cell_num_local[ic],ii,jj,lev);
int iborder = 0;
// Test for cell to left
if (iicur-(iicur-iilft)/2 >= 0 && iicur-(iicur-iilft)/2 < imaxsize-iminsize && jjcur >= 0 && (jjcur+jjtop)/2 < jmaxsize-jminsize){
// Check for finer cell left and bottom side
if (lev != levmx){ // finer neighbor
int iilftfiner = iicur-(iicur-iilft)/2;
int nl = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
if (nl >= (int)(ncells+noffset) && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
iborder = 0x0001;
} else {
// Also check for finer cell left and top side
int jjtopfiner = (jjcur+jjtop)/2;
int nlt = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
if ( nlt >= (int)(ncells+noffset) && (border_cell_needed_local[nlt-ncells-noffset] & 0x0001) == 0x0001) {
iborder = 0x0001;
}
}
}
if ( (iborder & 0x0001) == 0 && iilft >= 0) { //same size
int nl = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
int levcheck = -1;
if (nl-noffset >= 0 && nl-noffset < (int)ncells) {
levcheck = level[nl-noffset];
} else if (nl >= 0 && (int)(nl-ncells-noffset) >= 0 && (int)(nl-ncells-noffset) < nbsize_local) {
levcheck = border_cell_level_local[nl-ncells-noffset];
}
if (nl >= (int)(ncells+noffset) && levcheck == lev && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
iborder = 0x0001;
} else if (lev != 0 && iilft-(iicur-iilft) >= 0){ // coarser neighbor
iilft -= iicur-iilft;
int jjlft = (jj/2)*2*levmult-jminsize;
nl = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
levcheck = -1;
if (nl-noffset >= 0 && nl-noffset < (int)ncells) {
levcheck = level[nl-noffset];
} else if (nl >= 0 && (int)(nl-ncells-noffset) >= 0 && (int)(nl-ncells-noffset) < nbsize_local) {
levcheck = border_cell_level_local[nl-ncells-noffset];
}
// we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
if (nl >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
iborder = 0x0001;
}
}
}
}
// Test for cell to right
if (iirht < imaxsize-iminsize && iirht >= 0 && jjcur >= 0 && jjtop < jmaxsize-jminsize) {
// right neighbor -- finer, same size and coarser
int nr = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
if (nr >= (int)(ncells+noffset) && (border_cell_needed_local[nr-ncells-noffset] & 0x0002) == 0x0002) {
iborder = 0x0002;
} else if (lev != levmx){
// right neighbor -- finer right top test
int jjtopfiner = (jjcur+jjtop)/2;
int nrt = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
if (nrt >= (int)(ncells+noffset) && (border_cell_needed_local[nrt-ncells-noffset] & 0x0002) == 0x0002) {
iborder = 0x0002;
}
}
if ( (iborder & 0x0002) == 0 && lev != 0) { // test for coarser, but not directly right
int jjrhtcoarser = (jj/2)*2*levmult-jminsize;
if (jjrhtcoarser != jjcur) {
int nr = read_hash(jjrhtcoarser*(imaxsize-iminsize)+iirht, hash);
int levcheck = -1;
if (nr-noffset >= 0 && nr-noffset < (int)ncells) {
levcheck = level[nr-noffset];
} else if (nr >= 0 && (int)(nr-ncells-noffset) >= 0 && (int)(nr-ncells-noffset) < nbsize_local) {
levcheck = border_cell_level_local[nr-ncells-noffset];
}
if (nr >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nr-ncells-noffset] & 0x0002) == 0x0002) {
iborder = 0x0002;
}
}
}
}
// Test for cell to bottom
if (iicur >= 0 && (iicur+iirht)/2 < imaxsize-iminsize && jjcur-(jjcur-jjbot)/2 >= 0 && jjcur-(jjcur-jjbot)/2 < jmaxsize-jminsize){
// Check for finer cell below and left side
if (lev != levmx){ // finer neighbor
int jjbotfiner = jjcur-(jjcur-jjbot)/2;
int nb = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
if (nb >= (int)(ncells+noffset) && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
iborder = 0x0004;
} else {
// Also check for finer cell below and right side
int iirhtfiner = (iicur+iirht)/2;
int nbr = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
if (nbr >= (int)(ncells+noffset) && (border_cell_needed_local[nbr-ncells-noffset] & 0x0004) == 0x0004) {
iborder = 0x0004;
}
}
}
if ( (iborder & 0x0004) == 0 && jjbot >= 0) { //same size
int nb = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
int levcheck = -1;
if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
levcheck = level[nb-noffset];
} else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
levcheck = border_cell_level_local[nb-ncells-noffset];
}
if (nb >= (int)(ncells+noffset) && levcheck == lev && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
iborder = 0x0004;
} else if (lev != 0 && jjbot-(jjcur-jjbot) >= 0){ // coarser neighbor
jjbot -= jjcur-jjbot;
int iibot = (ii/2)*2*levmult-iminsize;
nb = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
levcheck = -1;
if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
levcheck = level[nb-noffset];
} else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
levcheck = border_cell_level_local[nb-ncells-noffset];
}
// we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
if (nb >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
iborder = 0x0004;
}
}
}
}
// Test for cell to top
// Layer 2: this border cell is needed if its top neighbor is a layer-1 border cell
// (hash value >= ncells+noffset) that was itself flagged with the top bit 0x0008.
if (iirht < imaxsize-iminsize && iicur >= 0 && jjtop >= 0 && jjtop < jmaxsize-jminsize) {
   // top neighbor -- finer, same size and coarser
   int nt = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
   if (nt >= (int)(ncells+noffset) && (border_cell_needed_local[nt-ncells-noffset] & 0x0008) == 0x0008) {
      iborder = 0x0008;
   } else if (lev != levmx){
      // top neighbor -- finer top right test
      int iirhtfiner = (iicur+iirht)/2;
      int ntr = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
      if ( ntr >= (int)(ncells+noffset) && (border_cell_needed_local[ntr-ncells-noffset] & 0x0008) == 0x0008) {
         iborder = 0x0008;
      }
   }
   if ( (iborder & 0x0008) == 0 && lev != 0) { // test for coarser, but not directly above
      int iitopcoarser = (ii/2)*2*levmult-iminsize;
      if (iitopcoarser != iicur) {
         int ntc = read_hash(jjtop*(imaxsize-iminsize)+iitopcoarser, hash);
         int levcheck = -1;
         if (ntc-noffset >= 0 && ntc-noffset < (int)ncells) {
            // hit is a local cell
            levcheck = level[ntc-noffset];
         } else if (ntc >= 0 && (int)(ntc-ncells-noffset) >= 0 && (int)(ntc-ncells-noffset) < nbsize_local) {
            // hit is a layer-1 border cell
            levcheck = border_cell_level_local[ntc-ncells-noffset];
         }
         // The hit must be a border-cell entry (>= ncells+noffset), as in the left, right
         // and bottom tests. The previous guard reduced to ntc >= ncells, which let a local
         // cell number through when noffset > 0 and produced an out-of-range index into
         // border_cell_needed_local.
         if (ntc >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[ntc-ncells-noffset] & 0x0008) == 0x0008) {
            iborder = 0x0008;
         }
      }
   }
}
// NOTE(review): 0x0016 is hexadecimal (22 decimal, bits 0x10|0x04|0x02), so this also sets
// the right and bottom direction bits. Presumably 0x0010 was intended as the layer-2 marker;
// the DEBUG print after this loop compares against the same 0x0016 threshold, so change
// both together if this is corrected.
if (iborder) border_cell_needed_local[ic] = iborder |= 0x0016;
}
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
vector<int> indices_needed;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
if (DEBUG) {
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] < 0x0016) fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
if (border_cell_needed_local[ic] >= 0x0016) fprintf(fp,"%d: Second set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
}
}
int inew = 0;
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] <= 0) continue;
indices_needed.push_back(border_cell_num_local[ic]);
border_cell_num_local[inew] = border_cell_num_local[ic];
border_cell_i_local[inew] = border_cell_i_local[ic];
border_cell_j_local[inew] = border_cell_j_local[ic];
border_cell_level_local[inew] = border_cell_level_local[ic];
// border_cell_num_local is not used after -- could be commented out?
// border_cell_needed_local[inew] = 1;
inew++;
}
nbsize_local = inew;
free(border_cell_num_local);
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
// Walk through cell array and set hash to global cell values
//fprintf(fp,"%d: DEBUG new hash jminsize %d jmaxsize %d iminsize %d imaxsize %d\n",mype,jminsize,jmaxsize,iminsize,imaxsize);
#ifdef _OPENMP
#pragma omp for
#endif
for(int ic=0; ic<nbsize_local; ic++){
int lev = border_cell_level_local[ic];
int levmult = IPOW2(levmx-lev);
int ii = border_cell_i_local[ic]*levmult-iminsize;
int jj = border_cell_j_local[ic]*levmult-jminsize;
write_hash(-(ncells+ic), jj*(imaxsize-iminsize)+ii, hash);
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_LAYER2] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
print_local();
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering for 2 layer\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
#ifdef _OPENMP
} // end master region
#endif
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_LAYER_LIST] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
int nghost = nbsize_local;
ncells_ghost = ncells + nghost;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
celltype = (int *)mesh_memory.memory_realloc(ncells_ghost, celltype);
i = (int *)mesh_memory.memory_realloc(ncells_ghost, i);
j = (int *)mesh_memory.memory_realloc(ncells_ghost, j);
level = (int *)mesh_memory.memory_realloc(ncells_ghost, level);
nlft = (int *)mesh_memory.memory_realloc(ncells_ghost, nlft);
nrht = (int *)mesh_memory.memory_realloc(ncells_ghost, nrht);
nbot = (int *)mesh_memory.memory_realloc(ncells_ghost, nbot);
ntop = (int *)mesh_memory.memory_realloc(ncells_ghost, ntop);
memory_reset_ptrs();
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (int ic = ncells; ic < (int)ncells_ghost; ic++){
nlft[ic] = -1;
nrht[ic] = -1;
nbot[ic] = -1;
ntop[ic] = -1;
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_COPY_MESH_DATA] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
#ifdef _OPENMP
#pragma omp for
#endif
// Copy the retained border cells into the ghost slots of the mesh arrays:
// entry ic of the border_cell_*_local lists goes to index ncells+ic of i, j and level.
for(int ic=0; ic<nbsize_local; ic++){
int ii = border_cell_i_local[ic];
int jj = border_cell_j_local[ic];
int lev = border_cell_level_local[ic];
// celltype is assigned only when the cell lies outside the lev_ibegin/lev_iend or
// lev_jbegin/lev_jend range for its level; a later matching test overrides an earlier one.
// NOTE(review): a ghost cell inside that range gets no celltype assignment in this loop --
// confirm that the reallocated celltype storage holds a defined value for it.
if (ii < lev_ibegin[lev]) celltype[ncells+ic] = LEFT_BOUNDARY;
if (ii > lev_iend[lev]) celltype[ncells+ic] = RIGHT_BOUNDARY;
if (jj < lev_jbegin[lev]) celltype[ncells+ic] = BOTTOM_BOUNDARY;
if (jj > lev_jend[lev]) celltype[ncells+ic] = TOP_BOUNDARY;
i[ncells+ic] = ii;
j[ncells+ic] = jj;
level[ncells+ic] = lev;
}
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
free(border_cell_i_local);
free(border_cell_j_local);
free(border_cell_level_local);
#ifdef _OPENMP
} // end master region
#endif
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_FILL_MESH_GHOST] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
fprintf(fp,"After copying i,j, level to ghost cells\n");
print_local();
#ifdef _OPENMP
} // end master region
#endif
}
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells_ghost; ic++){
int ii = i[ic];
int jj = j[ic];
int lev = level[ic];
int levmult = IPOW2(levmx-lev);
int iicur = ii*levmult-iminsize;
int iilft = max( (ii-1)*levmult, 0 )-iminsize;
int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
int jjcur = jj*levmult-jminsize;
int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
//fprintf(fp,"DEBUG neigh ic %d nlft %d ii %d levmult %d iminsize %d icheck %d\n",ic,nlft[ic],ii,levmult,iminsize,(max( ii *levmult-1, 0))-iminsize);
int nlftval = nlft[ic];
int nrhtval = nrht[ic];
int nbotval = nbot[ic];
int ntopval = ntop[ic];
if (nlftval == -1){
// Taking care of boundary cells
// Force each boundary cell to point to itself on its boundary direction
if (iicur < 1*IPOW2(levmx) -iminsize) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// Boundary cells next to corner boundary need special checks
if (iicur == 1*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// need to check for finer neighbor first
// Right and top neighbor don't change for finer, so drop through to same size
// Left and bottom need to be half of same size index for finer test
if (lev != levmx) {
int iilftfiner = iicur-(iicur-iilft)/2;
if (nlftval == -1 && iilftfiner >= 0) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
}
// same size neighbor
if (nlftval == -1 && iilft >= 0) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
// Now we need to take care of special case where bottom and left boundary need adjustment since
// expected cell doesn't exist on these boundaries if it is finer than current cell
if (jjcur < 1*IPOW2(levmx) && lev != levmx) {
if (nlftval == -1) {
int iilftfiner = iicur-(iicur-iilft)/2;
int jjtopfiner = (jjcur+jjtop)/2;
if (jjtopfiner < jmaxsize-jminsize && iilftfiner >= 0) nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
}
}
// coarser neighbor
if (lev != 0){
if (nlftval == -1) {
int iilftcoarser = iilft - (iicur-iilft);
int jjlft = (jj/2)*2*levmult-jminsize;
if (iilftcoarser >=0) nlftval = read_hash(jjlft*(imaxsize-iminsize)+iilftcoarser, hash);
}
}
if (nlftval != -1) nlft[ic] = nlftval;
}
if (nrhtval == -1) {
// Taking care of boundary cells
// Force each boundary cell to point to itself on its boundary direction
if (iicur > imax*IPOW2(levmx)-1-iminsize) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// Boundary cells next to corner boundary need special checks
if (iirht == imax*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// same size neighbor
if (nrhtval == -1 && iirht < imaxsize-iminsize) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
// Now we need to take care of special case where bottom and left boundary need adjustment since
// expected cell doesn't exist on these boundaries if it is finer than current cell
if (jjcur < 1*IPOW2(levmx) && lev != levmx) {
if (nrhtval == -1) {
int jjtopfiner = (jjcur+jjtop)/2;
if (jjtopfiner < jmaxsize-jminsize && iirht < imaxsize-iminsize) nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
}
}
// coarser neighbor
if (lev != 0){
if (nrhtval == -1) {
int jjrht = (jj/2)*2*levmult-jminsize;
if (iirht < imaxsize-iminsize) nrhtval = read_hash(jjrht*(imaxsize-iminsize)+iirht, hash);
}
}
if (nrhtval != -1) nrht[ic] = nrhtval;
}
if (nbotval == -1) {
// Taking care of boundary cells
// Force each boundary cell to point to itself on its boundary direction
if (jjcur < 1*IPOW2(levmx) -jminsize) nbotval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// Boundary cells next to corner boundary need special checks
if (jjcur == 1*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) nbotval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// need to check for finer neighbor first
// Right and top neighbor don't change for finer, so drop through to same size
// Left and bottom need to be half of same size index for finer test
if (lev != levmx) {
int jjbotfiner = jjcur-(jjcur-jjbot)/2;
if (nbotval == -1 && jjbotfiner >= 0) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
}
// same size neighbor
if (nbotval == -1 && jjbot >=0) nbotval = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
// Now we need to take care of special case where bottom and left boundary need adjustment since
// expected cell doesn't exist on these boundaries if it is finer than current cell
if (iicur < 1*IPOW2(levmx) && lev != levmx) {
if (nbotval == -1) {
int iirhtfiner = (iicur+iirht)/2;
int jjbotfiner = jjcur-(jjcur-jjbot)/2;
if (jjbotfiner >= 0 && iirhtfiner < imaxsize-iminsize) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
}
}
// coarser neighbor
if (lev != 0){
if (nbotval == -1) {
int jjbotcoarser = jjbot - (jjcur-jjbot);
int iibot = (ii/2)*2*levmult-iminsize;
if (jjbotcoarser >= 0 && iibot >= 0) nbotval = read_hash(jjbotcoarser*(imaxsize-iminsize)+iibot, hash);
}
}
if (nbotval != -1) nbot[ic] = nbotval;
}
if (ntopval == -1) {
// Taking care of boundary cells
// Force each boundary cell to point to itself on its boundary direction
if (jjcur > jmax*IPOW2(levmx)-1-jminsize) ntopval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// Boundary cells next to corner boundary need special checks
if (jjtop == jmax*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) ntopval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
// same size neighbor
if (ntopval == -1 && jjtop < jmaxsize-jminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
if (iicur < 1*IPOW2(levmx)) {
if (ntopval == -1) {
int iirhtfiner = (iicur+iirht)/2;
if (jjtop < jmaxsize-jminsize && iirhtfiner < imaxsize-iminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
}
}
// coarser neighbor
if (lev != 0){
if (ntopval == -1) {
int iitop = (ii/2)*2*levmult-iminsize;
if (jjtop < jmaxsize-jminsize && iitop < imaxsize-iminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iitop, hash);
}
}
if (ntopval != -1) ntop[ic] = ntopval;
}
//fprintf(fp,"%d: neighbors[%d] = %d %d %d %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_FILL_NEIGH_GHOST] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
fprintf(fp,"After setting neighbors through ghost cells\n");
print_local();
#ifdef _OPENMP
} // end master region
#endif
}
/*
// Set neighbors to global cell numbers from hash
for (uint ic=0; ic<ncells; ic++){
ii = i[ic];
jj = j[ic];
lev = level[ic];
levmult = IPOW2(levmx-lev);
//fprintf(fp,"%d:Neighbors input for ic %d ii %d jj %d levmult %d lev %d\n",mype,ic, ii, jj, levmult,lev);
//fprintf(fp,"%d:Neighbors befor ic %d nlft %d nrht %d nbot %d ntop %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
if (nlft[ic] == -1) nlft[ic] = hash[( jj *levmult )-jminsize][(max( ii *levmult-1, 0 ))-iminsize];
if (celltype[ic] == BOTTOM_BOUNDARY && nlft[ic] == -1){
if (nlft[ic] == -1) nlft[ic] = hash[(jj+1)*levmult-jminsize][(min( (ii+1)*levmult, imaxcalc-1))-iminsize];
}
if (nrht[ic] == -1) nrht[ic] = hash[( jj *levmult )-jminsize][(min( (ii+1)*levmult, imaxcalc-1))-iminsize];
if (celltype[ic] == BOTTOM_BOUNDARY && nrht[ic] == -1){
if (nrht[ic] == -1) nrht[ic] = hash[(jj+1)*levmult-jminsize][(min( (ii+1)*levmult, imaxcalc-1))-iminsize];
//if (ic == 3 && mype == 0) printf("DEBUG line %d -- ic %d celltype %d nrht %d\n",__line__,ic,celltype[ic],nrht[ic]);
//printf("DEBUG line %d -- ic %d celltype %d nrht %d jj %d ii %d\n",__line__,ic,celltype[ic],nrht[ic],(jj+1)*levmult-jminsize,(min( (ii+1)*levmult, imaxcalc-1))-iminsize);
}
if (nbot[ic] == -1) nbot[ic] = hash[(max( jj *levmult-1, 0) )-jminsize][( ii *levmult )-iminsize];
if (celltype[ic] == LEFT_BOUNDARY && nbot[ic] == -1){
if (nbot[ic] == -1) nbot[ic] = hash[(max( jj *levmult-1, 0) )-jminsize][( ii *levmult+1 )-iminsize];
}
if (ntop[ic] == -1) ntop[ic] = hash[(min( (jj+1)*levmult, jmaxcalc-1))-jminsize][( ii *levmult )-iminsize];
if (celltype[ic] == LEFT_BOUNDARY && ntop[ic] == -1){
if (ntop[ic] == -1) ntop[ic] = hash[(min( (jj+1)*levmult, jmaxcalc-1))-jminsize][( ii *levmult+1 )-iminsize];
}
//fprintf(fp,"%d:Neighbors after ic %d nlft %d nrht %d nbot %d ntop %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
}
*/
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_SET_CORNER_NEIGH] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
fprintf(fp,"After setting corner neighbors\n");
print_local();
#ifdef _OPENMP
} // end master region
#endif
}
// Adjusting neighbors to local indices
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells_ghost; ic++){
//fprintf(fp,"%d: ic %d nlft %d noffset %d ncells %ld\n",mype,ic,nlft[ic],noffset,ncells);
if (nlft[ic] <= -(int)ncells && nlft[ic] > -(int)ncells_ghost){
nlft[ic] = abs(nlft[ic]);
} else if (nlft[ic] >= noffset && nlft[ic] < (int)(noffset+ncells)) {
nlft[ic] -= noffset;
}
if (nrht[ic] <= -(int)ncells && nrht[ic] > -(int)ncells_ghost){
nrht[ic] = abs(nrht[ic]);
} else if (nrht[ic] >= noffset && nrht[ic] < (int)(noffset+ncells)) {
nrht[ic] -= noffset;
}
if (nbot[ic] <= -(int)ncells && nbot[ic] > -(int)ncells_ghost){
nbot[ic] = abs(nbot[ic]);
} else if (nbot[ic] >= noffset && nbot[ic] < (int)(noffset+ncells)) {
nbot[ic] -= noffset;
}
if (ntop[ic] <= -(int)ncells && ntop[ic] > -(int)ncells_ghost){
ntop[ic] = abs(ntop[ic]);
} else if (ntop[ic] >= noffset && ntop[ic] < (int)(noffset+ncells)) {
ntop[ic] -= noffset;
}
}
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
fprintf(fp,"After adjusting neighbors to local indices\n");
print_local();
#ifdef _OPENMP
} // end master region
#endif
}
if (TIMING_LEVEL >= 2) {
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_NEIGH_ADJUST] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
offtile_ratio_local = (offtile_ratio_local*(double)offtile_local_count) + ((double)nghost / (double)ncells);
offtile_local_count++;
offtile_ratio_local /= offtile_local_count;
//printf("%d ncells size is %ld ncells_ghost size is %ld nghost %d\n",mype,ncells,ncells_ghost,nghost);
//fprintf(fp,"%d ncells_ghost size is %ld nghost %d\n",mype,ncells_ghost,nghost);
if (cell_handle) L7_Free(&cell_handle);
cell_handle=0;
if (DEBUG) {
fprintf(fp,"%d: SETUP ncells %ld noffset %d nghost %d\n",mype,ncells,noffset,nghost);
for (int ig = 0; ig<nghost; ig++){
fprintf(fp,"%d: indices needed ic %d index %d\n",mype,ig,indices_needed[ig]);
}
}
L7_Setup(0, noffset, ncells, &indices_needed[0], nghost, &cell_handle);
if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_SETUP_COMM] += cpu_timer_stop(tstart_lev2);
#ifdef _OPENMP
} // end master region
#endif
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
print_local();
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nlft numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
if ( (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",nlft[hashval]);
} else {
fprintf(fp," ");
}
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nrht numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if ( ii >= iminsize && ii < imaxsize ) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
if ( hashval >= 0 && hashval < (int)ncells ) {
fprintf(fp,"%5d",nrht[hashval]);
} else {
fprintf(fp," ");
}
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nbot numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if ( ii >= iminsize && ii < imaxsize ) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
if ( hashval >= 0 && hashval < (int)ncells ) {
fprintf(fp,"%5d",nbot[hashval]);
} else {
fprintf(fp," ");
}
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n ntop numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if ( ii >= iminsize && ii < imaxsize ) {
int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
if ( hashval >= 0 && hashval < (int)ncells ) {
fprintf(fp,"%5d",ntop[hashval]);
} else {
fprintf(fp," ");
}
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
#ifdef _OPENMP
} // end master region
#endif
} // end DEBUG
if (DEBUG) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
print_local();
for (uint ic=0; ic<ncells; ic++){
fprintf(fp,"%d: before update ic %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
mype,ic,i[ic],j[ic],level[ic],nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
}
int ig=0;
for (uint ic=ncells; ic<ncells_ghost; ic++, ig++){
fprintf(fp,"%d: after update ic %d off %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
mype,ic,indices_needed[ig],i[ic],j[ic],level[ic],nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
}
#ifdef _OPENMP
} // end master region
#endif
} // end DEBUG
} // if numpe > 1
#endif
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
write_hash_collision_report();
read_hash_collision_report();
compact_hash_delete(hash);
#ifdef BOUNDS_CHECK
{
for (uint ic=0; ic<ncells; ic++){
int nl = nlft[ic];
if (nl<0 || nl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
if (level[nl] > level[ic]){
int ntl = ntop[nl];
if (ntl<0 || ntl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mype,__LINE__,ic,ic+noffset,nl,ntl);
}
int nr = nrht[ic];
if (nr<0 || nr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
if (level[nr] > level[ic]){
int ntr = ntop[nr];
if (ntr<0 || ntr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mype,__LINE__,ic,ntr);
}
int nb = nbot[ic];
if (nb<0 || nb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
if (level[nb] > level[ic]){
int nrb = nrht[nb];
if (nrb<0 || nrb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mype,__LINE__,ic,nrb);
}
int nt = ntop[ic];
if (nt<0 || nt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
if (level[nt] > level[ic]){
int nrt = nrht[nt];
if (nrt<0 || nrt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mype,__LINE__,ic,nrt);
}
}
}
#endif
#ifdef _OPENMP
} // end master region
#pragma omp barrier
#endif
} else if (calc_neighbor_type == KDTREE) {
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
struct timeval tstart_lev2;
if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
TBounds box;
vector<int> index_list(IPOW2(levmx*levmx) );
int num;
ibase = 0;
calc_spatial_coordinates(ibase);
kdtree_setup();
if (TIMING_LEVEL >= 2) {
cpu_timers[MESH_TIMER_KDTREE_SETUP] += cpu_timer_stop(tstart_lev2);
cpu_timer_start(&tstart_lev2);
}
for (uint ic=0; ic<ncells; ic++) {
//left
nlft[ic] = ic;
box.min.x = x[ic]-0.25*dx[ic];
box.max.x = x[ic]-0.25*dx[ic];
box.min.y = y[ic]+0.25*dy[ic];
box.max.y = y[ic]+0.25*dy[ic];
KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
if (num == 1) nlft[ic]=index_list[0];
//right
nrht[ic] = ic;
box.min.x = x[ic]+1.25*dx[ic];
box.max.x = x[ic]+1.25*dx[ic];
box.min.y = y[ic]+0.25*dy[ic];
box.max.y = y[ic]+0.25*dy[ic];
KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
if (num == 1) nrht[ic]=index_list[0];
//bot
nbot[ic] = ic;
box.min.x = x[ic]+0.25*dx[ic];
box.max.x = x[ic]+0.25*dx[ic];
box.min.y = y[ic]-0.25*dy[ic];
box.max.y = y[ic]-0.25*dy[ic];
KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
if (num == 1) nbot[ic]=index_list[0];
//top
ntop[ic] = ic;
box.min.x = x[ic]+0.25*dx[ic];
box.max.x = x[ic]+0.25*dx[ic];
box.min.y = y[ic]+1.25*dy[ic];
box.max.y = y[ic]+1.25*dy[ic];
KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
if (num == 1) ntop[ic]=index_list[0];
} // End main loop over cells.
KDTree_Destroy(&tree);
if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_KDTREE_QUERY] += cpu_timer_stop(tstart_lev2);
#ifdef _OPENMP
}
#pragma omp barrier
#endif
} // calc_neighbor_type
}
#ifdef _OPENMP
#pragma omp master
#endif
cpu_timers[MESH_TIMER_CALC_NEIGHBORS] += cpu_timer_stop(tstart_cpu);
}
#ifdef HAVE_OPENCL
void Mesh::gpu_calc_neighbors(void)
{
if (! gpu_do_rezone) return;
ulong gpu_hash_table_size = 0;
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
struct timeval tstart_lev2;
cpu_timer_start(&tstart_lev2);
cl_command_queue command_queue = ezcl_get_command_queue();
gpu_counters[MESH_COUNTER_CALC_NEIGH]++;
assert(dev_levtable);
assert(dev_level);
assert(dev_i);
assert(dev_j);
size_t mem_request = (int)((float)ncells*mem_factor);
size_t local_work_size = MIN(ncells, TILE_SIZE);
size_t global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
//printf("DEBUG file %s line %d dev_nlft %p size %d\n",__FILE__,__LINE__,dev_nlft,ezcl_get_device_mem_nelements(dev_nlft));
if (dev_nlft == NULL || ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells) {
dev_nlft = ezcl_malloc(NULL, const_cast<char *>("dev_nlft"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_nrht = ezcl_malloc(NULL, const_cast<char *>("dev_nrht"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_nbot = ezcl_malloc(NULL, const_cast<char *>("dev_nbot"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_ntop = ezcl_malloc(NULL, const_cast<char *>("dev_ntop"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_neighbor_init, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_neighbor_init, 1, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_neighbor_init, 2, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_neighbor_init, 3, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_neighbor_init, 4, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_neighbor_init, 1, NULL, &global_work_size, &local_work_size, NULL);
}
int imaxsize = (imax+1)*IPOW2(levmx);
int jmaxsize = (jmax+1)*IPOW2(levmx);
int gpu_hash_method = METHOD_UNSET;
// allow input.c to control hash types and methods
if (choose_hash_method != METHOD_UNSET) gpu_hash_method = choose_hash_method;
//=========
size_t hashsize;
uint hash_report_level = 1;
cl_mem dev_hash_header = NULL;
cl_mem dev_hash = gpu_compact_hash_init(ncells, imaxsize, jmaxsize, gpu_hash_method, hash_report_level,
&gpu_hash_table_size, &hashsize, &dev_hash_header);
/*
const int isize, // 0
const int levmx, // 1
const int imaxsize, // 2
__global const int *levtable, // 3
__global const int *level, // 4
__global const int *i, // 5
__global const int *j, // 6
__global const ulong *hash_header, // 7
__global int *hash) // 8
*/
cl_event hash_setup_event;
ezcl_set_kernel_arg(kernel_hash_setup, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_hash_setup, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_hash_setup, 2, sizeof(cl_int), (void *)&imaxsize);
ezcl_set_kernel_arg(kernel_hash_setup, 3, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_hash_setup, 4, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_hash_setup, 5, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_hash_setup, 6, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_hash_setup, 7, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_hash_setup, 8, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_hash_setup, 9, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_hash_setup, 10, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_hash_setup, 11, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_hash_setup, 12, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_setup, 1, NULL, &global_work_size, &local_work_size, &hash_setup_event);
ezcl_wait_for_events(1, &hash_setup_event);
ezcl_event_release(hash_setup_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_HASH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
/*
const int isize, // 0
const int levmx, // 1
const int imax, // 2
const int jmax, // 3
const int imaxsize, // 4
const int jmaxsize, // 5
__global const int *levtable, // 6
__global const int *level, // 7
__global const int *i, // 8
__global const int *j, // 9
__global int *nlft, // 10
__global int *nrht, // 11
__global int *nbot, // 12
__global int *ntop, // 13
__global const ulong *hash_header, // 14
__global int *hash) // 15
*/
cl_event calc_neighbors_event;
ezcl_set_kernel_arg(kernel_calc_neighbors, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_neighbors, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_neighbors, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_neighbors, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_neighbors, 4, sizeof(cl_int), (void *)&imaxsize);
ezcl_set_kernel_arg(kernel_calc_neighbors, 5, sizeof(cl_int), (void *)&jmaxsize);
ezcl_set_kernel_arg(kernel_calc_neighbors, 6, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_neighbors, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_neighbors, 8, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_calc_neighbors, 9, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_calc_neighbors, 10, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_calc_neighbors, 11, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_calc_neighbors, 12, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_calc_neighbors, 13, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_calc_neighbors, 14, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_neighbors, 15, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_neighbors, 1, NULL, &global_work_size, &local_work_size, &calc_neighbors_event);
ezcl_wait_for_events(1, &calc_neighbors_event);
ezcl_event_release(calc_neighbors_event);
gpu_compact_hash_delete(dev_hash, dev_hash_header);
if (TIMING_LEVEL >= 2) gpu_timers[MESH_TIMER_HASH_QUERY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
gpu_timers[MESH_TIMER_CALC_NEIGHBORS] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
}
void Mesh::gpu_calc_neighbors_local(void)
{
if (! gpu_do_rezone) return;
ulong gpu_hash_table_size = 0;
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
struct timeval tstart_lev2;
if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
cl_command_queue command_queue = ezcl_get_command_queue();
gpu_counters[MESH_COUNTER_CALC_NEIGH]++;
ncells_ghost = ncells;
assert(dev_levtable);
assert(dev_level);
assert(dev_i);
assert(dev_j);
size_t one = 1;
cl_mem dev_check = ezcl_malloc(NULL, const_cast<char *>("dev_check"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t mem_request = (int)((float)ncells*mem_factor);
dev_nlft = ezcl_malloc(NULL, const_cast<char *>("dev_nlft"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_nrht = ezcl_malloc(NULL, const_cast<char *>("dev_nrht"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_nbot = ezcl_malloc(NULL, const_cast<char *>("dev_nbot"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_ntop = ezcl_malloc(NULL, const_cast<char *>("dev_ntop"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t local_work_size = 64;
size_t global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
size_t block_size = global_work_size/local_work_size;
//printf("DEBUG file %s line %d lws = %d gws %d bs %d ncells %d\n",__FILE__,__LINE__,
// local_work_size, global_work_size, block_size, ncells);
cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int4), CL_MEM_READ_WRITE, 0);
cl_mem dev_sizes = ezcl_malloc(NULL, const_cast<char *>("dev_sizes"), &one, sizeof(cl_int4), CL_MEM_READ_WRITE, 0);
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_j) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_level) < (int)ncells ){
printf("%d: Warning ncells %ld size dev_i %d dev_j %d dev_level %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
}
#endif
/*
__kernel void calc_hash_size_cl(
const int ncells, // 0
const int levmx, // 1
__global int *levtable, // 2
__global int *level, // 3
__global int *i, // 4
__global int *j, // 5
__global int4 *redscratch, // 6
__global int4 *sizes, // 7
__local int4 *tile) // 8
*/
ezcl_set_kernel_arg(kernel_hash_size, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_hash_size, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_hash_size, 2, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_hash_size, 3, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_hash_size, 4, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_hash_size, 5, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_hash_size, 6, sizeof(cl_mem), (void *)&dev_redscratch);
ezcl_set_kernel_arg(kernel_hash_size, 7, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_hash_size, 8, local_work_size*sizeof(cl_int4), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_size, 1, NULL, &global_work_size, &local_work_size, NULL);
if (block_size > 1) {
/*
__kernel void finish_reduction_minmax4_cl(
const int isize, // 0
__global int4 *redscratch, // 1
__global int4 *sizes, // 2
__local int4 *tile) // 3
*/
ezcl_set_kernel_arg(kernel_finish_hash_size, 0, sizeof(cl_int), (void *)&block_size);
ezcl_set_kernel_arg(kernel_finish_hash_size, 1, sizeof(cl_mem), (void *)&dev_redscratch);
ezcl_set_kernel_arg(kernel_finish_hash_size, 2, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_finish_hash_size, 3, local_work_size*sizeof(cl_int4), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_hash_size, 1, NULL, &local_work_size, &local_work_size, NULL);
}
ezcl_device_memory_delete(dev_redscratch);
cl_int sizes[4];
ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes, NULL);
int imintile = sizes[0];
int imaxtile = sizes[1];
int jmintile = sizes[2];
int jmaxtile = sizes[3];
// Expand size by 2*coarse_cells for ghost cells
// TODO: May want to get fancier here and calc based on cell level
int jminsize = max(jmintile-2*IPOW2(levmx),0);
int jmaxsize = min(jmaxtile+2*IPOW2(levmx),(jmax+1)*IPOW2(levmx));
int iminsize = max(imintile-2*IPOW2(levmx),0);
int imaxsize = min(imaxtile+2*IPOW2(levmx),(imax+1)*IPOW2(levmx));
//fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
//ezcl_enqueue_write_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes, NULL);
int gpu_hash_method = METHOD_UNSET;
// allow input.c to control hash types and methods
if (choose_hash_method != METHOD_UNSET) gpu_hash_method = choose_hash_method;
//=========
size_t hashsize;
uint hash_report_level = 1;
cl_mem dev_hash_header = NULL;
cl_mem dev_hash = gpu_compact_hash_init(ncells, imaxsize-iminsize, jmaxsize-jminsize, gpu_hash_method, hash_report_level, &gpu_hash_table_size, &hashsize, &dev_hash_header);
int csize = corners_i.size();
#ifdef BOUNDS_CHECK
for (int ic=0; ic<csize; ic++){
if (corners_i[ic] >= iminsize) continue;
if (corners_j[ic] >= jminsize) continue;
if (corners_i[ic] < imaxsize) continue;
if (corners_j[ic] < jmaxsize) continue;
if ( (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) < 0 ||
(corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) > (int)hashsize){
printf("%d: Warning corners i %d j %d hash %d\n",mype,corners_i[ic],corners_j[ic],
(corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize));
}
}
#endif
size_t corners_local_work_size = MIN(csize, TILE_SIZE);
size_t corners_global_work_size = ((csize+corners_local_work_size - 1) /corners_local_work_size) * corners_local_work_size;
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 0, sizeof(cl_int), (void *)&csize);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 4, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 5, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_adjust_sizes, 1, NULL, &corners_global_work_size, &corners_local_work_size, NULL);
if (DEBUG){
vector<int> sizes_tmp(4);
ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
int iminsize_tmp = sizes_tmp[0];
int imaxsize_tmp = sizes_tmp[1];
int jminsize_tmp = sizes_tmp[2];
int jmaxsize_tmp = sizes_tmp[3];
fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
}
local_work_size = 128;
global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
#ifdef BOUNDS_CHECK
{
vector<int> i_tmp(ncells);
vector<int> j_tmp(ncells);
vector<int> level_tmp(ncells);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
for (int ic=0; ic<(int)ncells; ic++){
int lev = level_tmp[ic];
for ( int jj = j_tmp[ic]*IPOW2(levmx-lev)-jminsize; jj < (j_tmp[ic]+1)*IPOW2(levmx-lev)-jminsize; jj++) {
for (int ii = i_tmp[ic]*IPOW2(levmx-lev)-iminsize; ii < (i_tmp[ic]+1)*IPOW2(levmx-lev)-iminsize; ii++) {
if (jj < 0 || jj >= (jmaxsize-jminsize) || ii < 0 || ii >= (imaxsize-iminsize) ) {
printf("%d: Warning ncell %d writes to hash out-of-bounds at line %d ii %d jj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,ic,__LINE__,ii,jj,iminsize,imaxsize,jminsize,jmaxsize);
}
}
}
}
}
#endif
//printf("%d: lws %d gws %d \n",mype,local_work_size,global_work_size);
cl_event hash_setup_local_event;
/*
const int isize, // 0
const int levmx, // 1
const int imax, // 2
const int jmax, // 3
const int noffset, // 4
__global int *sizes, // 5
__global int *levtable, // 6
__global int *level, // 7
__global int *i, // 8
__global int *j, // 9
__global const ulong *hash_header, // 10
__global int *hash) // 11
*/
ezcl_set_kernel_arg(kernel_hash_setup_local, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_hash_setup_local, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_hash_setup_local, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_hash_setup_local, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_hash_setup_local, 4, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_hash_setup_local, 5, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_hash_setup_local, 6, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_hash_setup_local, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_hash_setup_local, 8, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_hash_setup_local, 9, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_hash_setup_local, 10, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_hash_setup_local, 11, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_setup_local, 1, NULL, &global_work_size, &local_work_size, &hash_setup_local_event);
ezcl_wait_for_events(1, &hash_setup_local_event);
ezcl_event_release(hash_setup_local_event);
if (DEBUG){
vector<int> sizes_tmp(4);
ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
int iminsize_tmp = sizes_tmp[0];
int imaxsize_tmp = sizes_tmp[1];
int jminsize_tmp = sizes_tmp[2];
int jmaxsize_tmp = sizes_tmp[3];
fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
}
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_HASH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
#ifdef BOUNDS_CHECK
{
if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_i) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_j) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_level) < (int)ncells ) {
printf("%d: Warning -- sizes for dev_neigh too small ncells %ld neigh %d %d %d %d %d %d %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_nlft),ezcl_get_device_mem_nelements(dev_nrht),ezcl_get_device_mem_nelements(dev_nbot),ezcl_get_device_mem_nelements(dev_ntop),ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
}
vector<int> level_tmp(ncells);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
int iflag = 0;
for (int ic=0; ic<ncells; ic++){
if (levmx-level_tmp[ic] < 0 || levmx-level_tmp[ic] > levmx) {
printf("%d: Warning level value bad ic %d level %d ncells %d\n",mype,ic,level_tmp[ic],ncells);
iflag++;
}
}
if (ezcl_get_device_mem_nelements(dev_levtable) < levmx+1) printf("%d Warning levtable too small levmx is %d devtable size is %d\n",mype,levmx,ezcl_get_device_mem_nelements(dev_levtable));
#ifdef HAVE_MPI
if (iflag > 20) {fflush(stdout); L7_Terminate(); exit(0);}
#endif
}
#endif
#ifdef BOUNDS_CHECK
{
int jmaxcalc = (jmax+1)*IPOW2(levmx);
int imaxcalc = (imax+1)*IPOW2(levmx);
vector<int> i_tmp(ncells);
vector<int> j_tmp(ncells);
vector<int> level_tmp(ncells);
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
for (int ic=0; ic<(int)ncells; ic++){
int ii = i_tmp[ic];
int jj = j_tmp[ic];
int lev = level_tmp[ic];
int levmult = IPOW2(levmx-lev);
int jjj=jj *levmult-jminsize;
int iii=max( ii *levmult-1, 0 )-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
jjj=jj *levmult-jminsize;
iii=min( (ii+1)*levmult, imaxcalc-1)-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
jjj=max( jj *levmult-1, 0) -jminsize;
iii=ii *levmult -iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
jjj=min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
iii=ii *levmult -iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
int nlftval = hash_tmp[(( jj *levmult )-jminsize)*(imaxsize-iminsize)+((max( ii *levmult-1, 0 ))-iminsize)];
int nrhtval = hash_tmp[(( jj *levmult )-jminsize)*(imaxsize-iminsize)+((min( (ii+1)*levmult, imaxcalc-1))-iminsize)];
int nbotval = hash_tmp[((max( jj *levmult-1, 0) )-jminsize)*(imaxsize-iminsize)+(( ii *levmult )-iminsize)];
int ntopval = hash_tmp[((min( (jj+1)*levmult, jmaxcalc-1))-jminsize)*(imaxsize-iminsize)+(( ii *levmult )-iminsize)];
if (nlftval == INT_MIN){
jjj = jj*levmult-jminsize;
iii = ii*levmult-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
if (nrhtval == INT_MIN){
jjj = jj*levmult-jminsize;
iii = ii*levmult-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
if (nbotval == INT_MIN) {
iii = ii*levmult-iminsize;
jjj = jj*levmult-jminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
if (ntopval == INT_MIN) {
iii = ii*levmult-iminsize;
jjj = jj*levmult-jminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
}
}
#endif
cl_event calc_neighbors_local_event;
/*
const int isize, // 0
const int levmx, // 1
const int imaxsize, // 2
const int jmaxsize, // 3
const int noffset, // 4
__global int *sizes, // 5
__global int *levtable, // 6
__global int *level, // 7
__global int *i, // 8
__global int *j, // 9
__global int *nlft, // 10
__global int *nrht, // 11
__global int *nbot, // 12
__global int *ntop, // 13
__global const ulong *hash_header, // 14
__global int *hash) // 15
*/
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 4, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 5, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 6, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 8, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 9, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 10, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 11, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 12, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 13, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 14, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 15, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_neighbors_local, 1, NULL, &global_work_size, &local_work_size, &calc_neighbors_local_event);
ezcl_wait_for_events(1, &calc_neighbors_local_event);
ezcl_event_release(calc_neighbors_local_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_HASH_QUERY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// DEBUG only: copy the hash table and the four neighbor arrays back to the host and
// print each of them to fp as a 2-D grid covering the finest-level index space.
if (DEBUG) {
print_dev_local();
// Non-blocking read of the hash table; the blocking header read that follows is what
// the code relies on before hash_tmp is used.
// NOTE(review): this assumes an in-order command queue -- confirm.
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
// Header slots 0..3 are passed to read_dev_hash as method, table size, AA and BB.
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
// Host copies of the neighbor arrays, ncells_ghost entries each; the last read is
// blocking.
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
// Extent of the printed grid in each direction.
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
// Grid 1: raw hash values. Rows are printed from the highest jj down to 0; a lookup is
// done only for (ii,jj) inside [iminsize,imaxsize) x [jminsize,jmaxsize), using the
// row-major key (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize).
fprintf(fp,"\n HASH 0 numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
// Footer row with the column indices.
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
// Grid 2: left neighbor. The hash value minus noffset is used as an index into
// nlft_tmp, and is printed only when it falls in [0, ncells).
fprintf(fp,"\n nlft numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nlft_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
// Grid 3: right neighbor, same layout as the nlft grid.
fprintf(fp,"\n nrht numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nrht_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
// Grid 4: bottom neighbor, same layout as the nlft grid.
fprintf(fp,"\n nbot numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nbot_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
// Grid 5: top neighbor, same layout as the nlft grid.
fprintf(fp,"\n ntop numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",ntop_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
#ifdef HAVE_MPI
if (numpe > 1) {
// Gather every rank's iminsize/imaxsize/jminsize/jmaxsize so that each rank knows the
// index bounds of all others.
vector<int> iminsize_global(numpe);
vector<int> imaxsize_global(numpe);
vector<int> jminsize_global(numpe);
vector<int> jmaxsize_global(numpe);
// Partner ranks are packed at the front of comm_partner; unused slots stay at -1.
vector<int> comm_partner(numpe,-1);
MPI_Allgather(&iminsize, 1, MPI_INT, &iminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&imaxsize, 1, MPI_INT, &imaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&jminsize, 1, MPI_INT, &jminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&jmaxsize, 1, MPI_INT, &jmaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
// A rank becomes a communication partner unless it is this rank or its gathered box
// lies entirely outside [imintile, imaxtile] x [jmintile, jmaxtile].
int num_comm_partners = 0;
for (int ip = 0; ip < numpe; ip++){
if (ip == mype) continue;
if (iminsize_global[ip] > imaxtile) continue;
if (imaxsize_global[ip] < imintile) continue;
if (jminsize_global[ip] > jmaxtile) continue;
if (jmaxsize_global[ip] < jmintile) continue;
comm_partner[num_comm_partners] = ip;
num_comm_partners++;
//if (DEBUG) fprintf(fp,"%d: overlap with processor %d bounding box is %d %d %d %d\n",mype,ip,iminsize_global[ip],imaxsize_global[ip],jminsize_global[ip],jmaxsize_global[ip]);
}
// Optional sanity check (compiled only with BOUNDS_CHECK): read the first ncells entries
// of each neighbor array back to the host and warn on stdout about any entry that,
// after subtracting noffset, falls outside [0, ncells). Entries equal to -1 are skipped.
#ifdef BOUNDS_CHECK
{
// Host buffers are sized ncells_ghost, but only ncells entries are read and checked.
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells*sizeof(cl_int), &ntop_tmp[0], NULL);
for (uint ic=0; ic<ncells; ic++){
// The "value < 0" test comes first in each check, so the comparison against ncells
// is only evaluated for non-negative values.
int nl = nlft_tmp[ic];
if (nl != -1){
nl -= noffset;
if (nl<0 || nl>= ncells) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
}
int nr = nrht_tmp[ic];
if (nr != -1){
nr -= noffset;
if (nr<0 || nr>= ncells) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
}
int nb = nbot_tmp[ic];
if (nb != -1){
nb -= noffset;
if (nb<0 || nb>= ncells) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
}
int nt = ntop_tmp[ic];
if (nt != -1){
nt -= noffset;
if (nt<0 || nt>= ncells) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
}
}
}
#endif
// Border-cell detection, pass 1: kernel_calc_border_cells writes into dev_border_cell
// (ncells ints) from the neighbor arrays and cell levels. No event is requested; later
// commands on the same queue follow it.
cl_mem dev_border_cell = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell1"), &ncells, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_calc_border_cells, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_border_cells, 1, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_border_cells, 2, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_calc_border_cells, 3, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_calc_border_cells, 4, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_calc_border_cells, 5, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_calc_border_cells, 6, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_border_cells, 7, sizeof(cl_mem), (void *)&dev_border_cell);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells, 1, NULL, &global_work_size, &local_work_size, NULL);
// Pass 2: kernel_calc_border_cells2 takes dev_border_cell as input (arg 7) and writes
// dev_border_cell_new (arg 8), plus dev_ioffset and the one-element dev_nbsize.
// Arg 11 is local scratch of local_work_size ints.
cl_mem dev_border_cell_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell2"), &ncells, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t one = 1;
cl_mem dev_nbsize = ezcl_malloc(NULL, const_cast<char *>("dev_nbsize"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 1, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 2, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 3, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 4, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 5, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 6, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 7, sizeof(cl_mem), (void *)&dev_border_cell);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 8, sizeof(cl_mem), (void *)&dev_border_cell_new);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 9, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 10, sizeof(cl_mem), (void *)&dev_nbsize);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 11, local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells2, 1, NULL, &global_work_size, &local_work_size, NULL);
// Keep the pass-2 output under the name dev_border_cell and delete the other buffer
// (presumably ezcl_device_memory_swap exchanges the two handles -- confirm).
ezcl_device_memory_swap(&dev_border_cell, &dev_border_cell_new);
ezcl_device_memory_delete(dev_border_cell_new);
// kernel_finish_scan runs as a single work-group (global size == local size) and is
// given the number of work-groups used by the launches above.
int group_size = (int)(global_work_size/local_work_size);
ezcl_set_kernel_arg(kernel_finish_scan, 0, sizeof(cl_int), (void *)&group_size);
ezcl_set_kernel_arg(kernel_finish_scan, 1, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_finish_scan, 2, sizeof(cl_mem), (void *)&dev_nbsize);
ezcl_set_kernel_arg(kernel_finish_scan, 3, local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &local_work_size, &local_work_size, NULL);
// Blocking read of the single int in dev_nbsize; it is used below as the number of
// border cells on this rank.
int nbsize_local;
ezcl_enqueue_read_buffer(command_queue, dev_nbsize, CL_TRUE, 0, 1*sizeof(cl_int), &nbsize_local, NULL);
ezcl_device_memory_delete(dev_nbsize);
//printf("%d: border cell size is %d global is %ld\n",mype,nbsize_local,nbsize_global);
// Host-side arrays for the border cells found on this rank (nbsize_local entries each).
vector<int> border_cell_num(nbsize_local);
vector<int> border_cell_i(nbsize_local);
vector<int> border_cell_j(nbsize_local);
vector<int> border_cell_level(nbsize_local);
// allocate new border memory
size_t nbsize_long = nbsize_local;
cl_mem dev_border_cell_i = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_j = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_level = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_num = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
// kernel_get_border_data reads dev_ioffset, dev_border_cell and the per-cell i/j/level
// arrays, and writes the four dev_border_cell_* buffers (args 7-10). Arg 11 is local
// scratch of local_work_size uints.
ezcl_set_kernel_arg(kernel_get_border_data, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_get_border_data, 1, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_get_border_data, 2, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_get_border_data, 3, sizeof(cl_mem), (void *)&dev_border_cell);
ezcl_set_kernel_arg(kernel_get_border_data, 4, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_get_border_data, 5, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_get_border_data, 6, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_get_border_data, 7, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_get_border_data, 8, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_get_border_data, 9, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_get_border_data, 10, sizeof(cl_mem), (void *)&dev_border_cell_num);
ezcl_set_kernel_arg(kernel_get_border_data, 11, local_work_size*sizeof(cl_uint), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data, 1, NULL, &global_work_size, &local_work_size, NULL);
// The scan offsets and border flags are not referenced again after this launch.
ezcl_device_memory_delete(dev_ioffset);
ezcl_device_memory_delete(dev_border_cell);
// read gpu border cell data
// Three non-blocking reads followed by a blocking one.
// NOTE(review): if nbsize_local is 0 the vectors are empty and &v[0] is taken on an
// empty vector, with zero-byte transfers requested -- confirm this case cannot occur
// or is handled by the ezcl wrappers.
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_i, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_j, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_level, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_level[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_num, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_num[0], NULL);
// Charge the interval since the previous timer restart to the find-boundary timer.
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_FIND_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// Allocate push database: one index list per communication partner, each with room for
// all nbsize_local border cells (the upper bound on what can be sent to one partner).
// NOTE(review): the malloc results are not checked.
int **send_database = (int**)malloc(num_comm_partners*sizeof(int *));
for (int ip = 0; ip < num_comm_partners; ip++){
send_database[ip] = (int *)malloc(nbsize_local*sizeof(int));
}
// Compute the overlap between processor bounding boxes and set up push database.
// A border cell's (i,j) is scaled by IPOW2(levmx-lev) and is queued for a partner when
// the scaled coordinates lie within that partner's gathered min/max bounds (inclusive
// on both ends).
vector<int> send_buffer_count(num_comm_partners);
for (int ip = 0; ip < num_comm_partners; ip++){
int icount = 0;
for (int ib = 0; ib <nbsize_local; ib++){
int lev = border_cell_level[ib];
int levmult = IPOW2(levmx-lev);
if (border_cell_i[ib]*levmult >= iminsize_global[comm_partner[ip]] &&
border_cell_i[ib]*levmult <= imaxsize_global[comm_partner[ip]] &&
border_cell_j[ib]*levmult >= jminsize_global[comm_partner[ip]] &&
border_cell_j[ib]*levmult <= jmaxsize_global[comm_partner[ip]] ) {
send_database[ip][icount] = ib;
icount++;
}
}
send_buffer_count[ip]=icount;
}
// Initialize L7_Push_Setup with num_comm_partners, comm_partner, send_database and
// send_buffer_count. L7_Push_Setup will copy data and determine recv_buffer_counts.
// It will return receive_count_total for use in allocations
int receive_count_total;
int i_push_handle = 0;
L7_Push_Setup(num_comm_partners, &comm_partner[0], &send_buffer_count[0],
send_database, &receive_count_total, &i_push_handle);
// DEBUG only: list, per partner, every border cell queued for it.
if (DEBUG) {
fprintf(fp,"DEBUG num_comm_partners %d\n",num_comm_partners);
for (int ip = 0; ip < num_comm_partners; ip++){
fprintf(fp,"DEBUG comm partner is %d data count is %d\n",comm_partner[ip],send_buffer_count[ip]);
for (int ic = 0; ic < send_buffer_count[ip]; ic++){
int ib = send_database[ip][ic];
fprintf(fp,"DEBUG \t index %d cell number %d i %d j %d level %d\n",ib,border_cell_num[ib],
border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
}
}
}
// Can now free the send database. Other arrays are vectors and will automatically
// deallocate
for (int ip = 0; ip < num_comm_partners; ip++){
free(send_database[ip]);
}
free(send_database);
// Charge the interval since the previous timer restart to the push-setup timer.
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_PUSH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// Push the data needed to the adjacent processors.
// Receive buffers hold receive_count_total ints each, the total reported by
// L7_Push_Setup. NOTE(review): the malloc results are not checked.
int *border_cell_num_local = (int *)malloc(receive_count_total*sizeof(int));
int *border_cell_i_local = (int *)malloc(receive_count_total*sizeof(int));
int *border_cell_j_local = (int *)malloc(receive_count_total*sizeof(int));
int *border_cell_level_local = (int *)malloc(receive_count_total*sizeof(int));
// One push per attribute, all through the same handle, which is released afterwards.
L7_Push_Update(&border_cell_num[0], border_cell_num_local, i_push_handle);
L7_Push_Update(&border_cell_i[0], border_cell_i_local, i_push_handle);
L7_Push_Update(&border_cell_j[0], border_cell_j_local, i_push_handle);
L7_Push_Update(&border_cell_level[0], border_cell_level_local, i_push_handle);
L7_Push_Free(&i_push_handle);
// The device copies of this rank's own border cells are released here.
ezcl_device_memory_delete(dev_border_cell_i);
ezcl_device_memory_delete(dev_border_cell_j);
ezcl_device_memory_delete(dev_border_cell_level);
ezcl_device_memory_delete(dev_border_cell_num);
// From here on nbsize_local is the number of border cells RECEIVED from partners, not
// the number found locally.
nbsize_local = receive_count_total;
if (DEBUG) {
for (int ic = 0; ic < nbsize_local; ic++) {
fprintf(fp,"%d: Local Border cell %d is %d i %d j %d level %d\n",mype,ic,border_cell_num_local[ic],
border_cell_i_local[ic],border_cell_j_local[ic],border_cell_level_local[ic]);
}
}
// Charge the interval since the previous timer restart to the push-boundary timer.
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_PUSH_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// Re-create the device border arrays at the received size, plus two "needed" buffers
// that the layer kernels use.
nbsize_long = nbsize_local;
dev_border_cell_num = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_border_cell_i = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_border_cell_j = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_border_cell_level = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_needed = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_needed_out = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed_out"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
// Upload the received data: three non-blocking writes, then a blocking one, after which
// the host buffers are freed.
// NOTE(review): freeing right after the last blocking write assumes the earlier
// non-blocking writes have completed too, i.e. an in-order command queue -- confirm.
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_num, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_num_local[0], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_i, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i_local[0], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_j, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j_local[0], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_level, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_level_local[0], NULL);
//ezcl_enqueue_write_buffer(command_queue, dev_border_cell_needed, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
// border_cell_num_local is intentionally not freed here: the DEBUG output after the
// layer-1 and layer-2 kernels still reads it, and it is freed after that.
free(border_cell_i_local);
free(border_cell_j_local);
free(border_cell_level_local);
// Charge the interval since the previous timer restart to the local-list timer.
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LOCAL_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// DEBUG only: print the hash table as a 2-D grid before the layer-1 kernels run.
if (DEBUG) {
// Blocking reads of the hash table and its header.
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
// Header slots 0..3 are passed to read_dev_hash as method, table size, AA and BB.
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
// Rows are printed from the highest jj down to 0; lookups happen only inside
// [iminsize,imaxsize) x [jminsize,jmaxsize) using a row-major key.
fprintf(fp,"\n HASH numbering before layer 1\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
// Footer row with the column indices.
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
// Launch geometry for kernels that iterate over the received border cells: work-groups
// of 128, with the global size rounded up to a multiple of the work-group size.
size_t nb_local_work_size = 128;
size_t nb_global_work_size = ((nbsize_local + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
// kernel_calc_layer1: takes the received border-cell i/j/level arrays plus the hash and
// is given dev_border_cell_needed as arg 12 (read back below in DEBUG mode).
ezcl_set_kernel_arg(kernel_calc_layer1, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_calc_layer1, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer1, 2, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer1, 3, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_layer1, 4, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_layer1, 5, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer1, 6, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer1, 7, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer1, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_layer1, 9, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer1, 10, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer1, 11, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer1, 12, sizeof(cl_mem), (void *)&dev_border_cell_needed);
ezcl_set_kernel_arg(kernel_calc_layer1, 13, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer1, 14, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL);
// DEBUG only: print every received cell whose "needed" value is non-zero.
if (DEBUG){
vector<int> border_cell_needed_local(nbsize_local);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] == 0) continue;
fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
}
}
// kernel_calc_layer1_sethash: same launch geometry; receives the "needed" buffer and
// the hash table. The host blocks on its completion event before timing.
cl_event calc_layer1_sethash_event;
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 3, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 4, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 5, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 6, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 7, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 8, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 9, sizeof(cl_mem), (void *)&dev_border_cell_needed);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 10, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 11, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1_sethash, 1, NULL, &nb_global_work_size, &nb_local_work_size, &calc_layer1_sethash_event);
ezcl_wait_for_events(1, &calc_layer1_sethash_event);
ezcl_event_release(calc_layer1_sethash_event);
// Charge the interval since the previous timer restart to the layer-1 timer.
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LAYER1] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// DEBUG only: print the hash table as a 2-D grid after the layer-1 kernels have run.
if (DEBUG) {
print_dev_local();
// Blocking reads of the hash table and its header.
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
// Header slots 0..3 are passed to read_dev_hash as method, table size, AA and BB.
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
// Rows are printed from the highest jj down to 0; lookups happen only inside
// [iminsize,imaxsize) x [jminsize,jmaxsize) using a row-major key.
fprintf(fp,"\n HASH numbering for 1 layer\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
// Footer row with the column indices.
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
// Layer 2: group_size is now the number of work-groups in the border-cell launch, and
// dev_ioffset is re-created with one int per work-group. dev_nbpacked holds one int.
group_size = (int)(nb_global_work_size/nb_local_work_size);
cl_mem dev_nbpacked = ezcl_malloc(NULL, const_cast<char *>("dev_nbpacked"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t group_size_long = group_size;
dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &group_size_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
// kernel_calc_layer2 receives the layer-1 "needed" buffer (arg 12) and a second
// buffer dev_border_cell_needed_out (arg 13), plus dev_ioffset and dev_nbpacked.
ezcl_set_kernel_arg(kernel_calc_layer2, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_calc_layer2, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer2, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer2, 3, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer2, 4, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_layer2, 5, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_layer2, 6, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer2, 7, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer2, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_layer2, 9, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer2, 10, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer2, 11, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer2, 12, sizeof(cl_mem), (void *)&dev_border_cell_needed);
ezcl_set_kernel_arg(kernel_calc_layer2, 13, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
ezcl_set_kernel_arg(kernel_calc_layer2, 14, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer2, 15, sizeof(cl_mem), (void *)&dev_hash);
ezcl_set_kernel_arg(kernel_calc_layer2, 16, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_calc_layer2, 17, sizeof(cl_mem), (void *)&dev_nbpacked);
// NOTE(review): this local scratch is sized with sizeof(cl_mem), whereas the other
// local-scratch arguments in this routine use sizeof(cl_int)/sizeof(cl_uint). This
// presumably only over-allocates -- confirm against the kernel's __local parameter.
ezcl_set_kernel_arg(kernel_calc_layer2, 18, nb_local_work_size*sizeof(cl_mem), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL);
// DEBUG only: print cells with a positive "needed" value; values below 0x0016 are
// labelled as the first set, values at or above 0x0016 as the second set.
if (DEBUG){
vector<int> border_cell_needed_local(nbsize_local);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed_out, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] <= 0) continue;
if (border_cell_needed_local[ic] < 0x0016) fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
if (border_cell_needed_local[ic] >= 0x0016) fprintf(fp,"%d: Second set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
}
}
// Last use of the host copy of the received cell numbers was the DEBUG output above.
free(border_cell_num_local);
ezcl_device_memory_delete(dev_border_cell_needed);
// kernel_finish_scan runs as a single work-group over dev_ioffset and dev_nbpacked.
ezcl_set_kernel_arg(kernel_finish_scan, 0, sizeof(cl_int), (void *)&group_size);
ezcl_set_kernel_arg(kernel_finish_scan, 1, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_finish_scan, 2, sizeof(cl_mem), (void *)&dev_nbpacked);
ezcl_set_kernel_arg(kernel_finish_scan, 3, nb_local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &nb_local_work_size, &nb_local_work_size, NULL);
// Blocking read of the single int in dev_nbpacked; it is used below as the number of
// cells kept after packing.
int nbpacked;
ezcl_enqueue_read_buffer(command_queue, dev_nbpacked, CL_TRUE, 0, 1*sizeof(cl_int), &nbpacked, NULL);
ezcl_device_memory_delete(dev_nbpacked);
// Charge the interval since the previous timer restart to the layer-2 timer.
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LAYER2] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// Allocate output buffers for kernel_get_border_data2, each with room for all
// nbsize_local received border cells.
nbsize_long = nbsize_local;
cl_mem dev_border_cell_i_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_j_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_indices_needed = ezcl_malloc(NULL, const_cast<char *>("dev_indices_needed"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
// kernel_get_border_data2 reads dev_ioffset, dev_border_cell_needed_out and the current
// border-cell arrays (args 1-6) and writes the *_new arrays and dev_indices_needed
// (args 7-10). Its completion event is kept so the host can wait on it later.
cl_event get_border_data2_event;
ezcl_set_kernel_arg(kernel_get_border_data2, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_get_border_data2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_get_border_data2, 2, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
ezcl_set_kernel_arg(kernel_get_border_data2, 3, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_get_border_data2, 4, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_get_border_data2, 5, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_get_border_data2, 6, sizeof(cl_mem), (void *)&dev_border_cell_num);
ezcl_set_kernel_arg(kernel_get_border_data2, 7, sizeof(cl_mem), (void *)&dev_border_cell_i_new);
ezcl_set_kernel_arg(kernel_get_border_data2, 8, sizeof(cl_mem), (void *)&dev_border_cell_j_new);
ezcl_set_kernel_arg(kernel_get_border_data2, 9, sizeof(cl_mem), (void *)&dev_border_cell_level_new);
ezcl_set_kernel_arg(kernel_get_border_data2, 10, sizeof(cl_mem), (void *)&dev_indices_needed);
// The local scratch must be sized for the work-group size this kernel is actually
// launched with (nb_local_work_size), not the cell-loop local_work_size; otherwise the
// scratch is too small whenever local_work_size < nb_local_work_size.
ezcl_set_kernel_arg(kernel_get_border_data2, 11, nb_local_work_size*sizeof(cl_uint), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data2, 1, NULL, &nb_global_work_size, &nb_local_work_size, &get_border_data2_event);
ezcl_device_memory_delete(dev_border_cell_num);
// After the swaps the dev_border_cell_{i,j,level} names refer to the buffers written
// by the kernel above, and the *_new names refer to the previous buffers (presumably
// ezcl_device_memory_swap exchanges the two handles -- confirm).
ezcl_device_memory_swap(&dev_border_cell_i, &dev_border_cell_i_new);
ezcl_device_memory_swap(&dev_border_cell_j, &dev_border_cell_j_new);
ezcl_device_memory_swap(&dev_border_cell_level, &dev_border_cell_level_new);
// Launch geometry over the nbpacked cells: work-groups of 128, global size rounded up
// to a multiple of the work-group size.
size_t nbp_local_work_size = 128;
size_t nbp_global_work_size = ((nbpacked + nbp_local_work_size - 1) /nbp_local_work_size) * nbp_local_work_size;
// kernel_calc_layer2_sethash receives the packed border-cell arrays, dev_indices_needed,
// dev_border_cell_needed_out, the per-level begin/end tables and the hash table.
cl_event calc_layer2_sethash_event;
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 0, sizeof(cl_int), (void *)&nbpacked);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 3, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 4, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 5, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 6, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 7, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 8, sizeof(cl_mem), (void *)&dev_levibeg);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 9, sizeof(cl_mem), (void *)&dev_leviend);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 10, sizeof(cl_mem), (void *)&dev_levjbeg);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 11, sizeof(cl_mem), (void *)&dev_levjend);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 12, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 13, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 14, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 15, sizeof(cl_mem), (void *)&dev_indices_needed);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 16, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 17, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 18, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2_sethash, 1, NULL, &nbp_global_work_size, &nbp_local_work_size, &calc_layer2_sethash_event);
// Block until the sethash kernel has finished, then release its event.
ezcl_wait_for_events(1, &calc_layer2_sethash_event);
ezcl_event_release(calc_layer2_sethash_event);
ezcl_device_memory_delete(dev_ioffset);
// Also wait on the event recorded for the earlier kernel_get_border_data2 launch before
// its outputs are read back on the host.
ezcl_wait_for_events(1, &get_border_data2_event);
ezcl_event_release(get_border_data2_event);
// Charge the interval since the previous timer restart to the layer-list timer.
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LAYER_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// Host copy of the first nbpacked entries of dev_indices_needed.
// NOTE(review): if nbpacked is 0, &indices_needed[0] is taken on an empty vector and a
// zero-byte read is requested -- confirm this case cannot occur or is handled.
vector<int> indices_needed(nbpacked);
// read gpu border cell data
ezcl_enqueue_read_buffer(command_queue, dev_indices_needed, CL_TRUE, 0, nbpacked*sizeof(cl_int), &indices_needed[0], NULL);
// After the earlier handle swaps these *_new names refer to the pre-packing buffers,
// which are released here.
ezcl_device_memory_delete(dev_border_cell_i_new);
ezcl_device_memory_delete(dev_border_cell_j_new);
ezcl_device_memory_delete(dev_border_cell_level_new);
// DEBUG only: print the hash table as a 2-D grid after the layer-2 kernels have run,
// and flush fp.
if (DEBUG) {
print_dev_local();
// Blocking reads of the hash table and its header.
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
// Header slots 0..3 are passed to read_dev_hash as method, table size, AA and BB.
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
// Rows are printed from the highest jj down to 0; lookups happen only inside
// [iminsize,imaxsize) x [jminsize,jmaxsize) using a row-major key.
fprintf(fp,"\n HASH numbering for 2 layer\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
// Footer row with the column indices.
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fflush(fp);
}
ezcl_device_memory_delete(dev_border_cell_needed_out);
int nghost = nbpacked;
ncells_ghost = ncells + nghost;
//if (mype == 1) printf("%d: DEBUG before expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
if (ezcl_get_device_mem_capacity(dev_celltype) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_i) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_j) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_level) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_nlft) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_nrht) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_nbot) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_ntop) < ncells_ghost ) {
//if (mype == 0) printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
//printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
mem_factor = (float)(ncells_ghost/ncells);
cl_mem dev_celltype_old = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_i_old = ezcl_malloc(NULL, const_cast<char *>("dev_i_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_j_old = ezcl_malloc(NULL, const_cast<char *>("dev_j_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_level_old = ezcl_malloc(NULL, const_cast<char *>("dev_level_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nlft_old = ezcl_malloc(NULL, const_cast<char *>("dev_nlft_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nrht_old = ezcl_malloc(NULL, const_cast<char *>("dev_nrht_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nbot_old = ezcl_malloc(NULL, const_cast<char *>("dev_nbot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_ntop_old = ezcl_malloc(NULL, const_cast<char *>("dev_ntop_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_device_memory_swap(&dev_celltype_old, &dev_celltype);
ezcl_device_memory_swap(&dev_i_old, &dev_i );
ezcl_device_memory_swap(&dev_j_old, &dev_j );
ezcl_device_memory_swap(&dev_level_old, &dev_level );
ezcl_device_memory_swap(&dev_nlft_old, &dev_nlft );
ezcl_device_memory_swap(&dev_nrht_old, &dev_nrht );
ezcl_device_memory_swap(&dev_nbot_old, &dev_nbot );
ezcl_device_memory_swap(&dev_ntop_old, &dev_ntop );
cl_event copy_mesh_data_event;
ezcl_set_kernel_arg(kernel_copy_mesh_data, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 1, sizeof(cl_mem), (void *)&dev_celltype_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 2, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 3, sizeof(cl_mem), (void *)&dev_i_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 4, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 5, sizeof(cl_mem), (void *)&dev_j_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 6, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 7, sizeof(cl_mem), (void *)&dev_level_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 9, sizeof(cl_mem), (void *)&dev_nlft_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 10, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 11, sizeof(cl_mem), (void *)&dev_nrht_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 12, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 13, sizeof(cl_mem), (void *)&dev_nbot_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 14, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 15, sizeof(cl_mem), (void *)&dev_ntop_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 16, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_copy_mesh_data, 1, NULL, &global_work_size, &local_work_size, &copy_mesh_data_event);
ezcl_device_memory_delete(dev_celltype_old);
ezcl_device_memory_delete(dev_i_old);
ezcl_device_memory_delete(dev_j_old);
ezcl_device_memory_delete(dev_level_old);
ezcl_device_memory_delete(dev_nlft_old);
ezcl_device_memory_delete(dev_nrht_old);
ezcl_device_memory_delete(dev_nbot_old);
ezcl_device_memory_delete(dev_ntop_old);
ezcl_wait_for_events(1, &copy_mesh_data_event);
ezcl_event_release(copy_mesh_data_event);
}
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_COPY_MESH_DATA] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
nb_global_work_size = ((nbpacked + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_j) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_level) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_celltype) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
printf("DEBUG size issue at %d\n",__LINE__);
}
if (ezcl_get_device_mem_nelements(dev_border_cell_i) < nbpacked ||
ezcl_get_device_mem_nelements(dev_border_cell_j) < nbpacked ||
ezcl_get_device_mem_nelements(dev_border_cell_level) < nbpacked ){
printf("DEBUG size issue at %d\n",__LINE__);
}
#endif
cl_event fill_mesh_ghost_event;
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 0, sizeof(cl_int), (void *)&nbpacked);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 2, sizeof(cl_mem), (void *)&dev_levibeg);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 3, sizeof(cl_mem), (void *)&dev_leviend);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 4, sizeof(cl_mem), (void *)&dev_levjbeg);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 5, sizeof(cl_mem), (void *)&dev_levjend);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 6, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 7, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 8, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 9, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 10, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 11, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 12, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 13, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 14, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 15, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 16, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_mesh_ghost, 1, NULL, &nb_global_work_size, &nb_local_work_size, &fill_mesh_ghost_event);
ezcl_wait_for_events(1, &fill_mesh_ghost_event);
ezcl_event_release(fill_mesh_ghost_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_FILL_MESH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG){
fprintf(fp,"After copying i,j, level to ghost cells\n");
print_dev_local();
}
ezcl_device_memory_delete(dev_border_cell_i);
ezcl_device_memory_delete(dev_border_cell_j);
ezcl_device_memory_delete(dev_border_cell_level);
size_t ghost_local_work_size = 128;
size_t ghost_global_work_size = ((ncells_ghost + ghost_local_work_size - 1) /ghost_local_work_size) * ghost_local_work_size;
cl_event fill_neighbor_ghost_event;
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 0, sizeof(cl_int), (void *)&ncells_ghost);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 4, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 5, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 6, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 7, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 9, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 10, sizeof(cl_mem), (void *)&dev_hash);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 11, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 12, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 13, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 14, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_neighbor_ghost, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &fill_neighbor_ghost_event);
ezcl_wait_for_events(1, &fill_neighbor_ghost_event);
ezcl_event_release(fill_neighbor_ghost_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_FILL_NEIGH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG){
fprintf(fp,"After setting neighbors through ghost cells\n");
print_dev_local();
}
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
printf("%d: Warning sizes for set_corner_neighbor not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
}
#endif
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_SET_CORNER_NEIGH] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG){
fprintf(fp,"After setting corner neighbors\n");
print_dev_local();
}
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
printf("%d: Warning sizes for adjust neighbors not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
}
if (ezcl_get_device_mem_nelements(dev_indices_needed) < (int)(ncells_ghost-ncells) ){
printf("%d: Warning indices size wrong nghost %d size indices_needed\n",mype,ncells_ghost-ncells,ezcl_get_device_mem_nelements(dev_indices_needed));
}
#endif
cl_event adjust_neighbors_local_event;
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 0, sizeof(cl_int), (void *)&ncells_ghost);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 3, sizeof(cl_mem), (void *)&dev_indices_needed);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 4, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 5, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 6, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 7, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_adjust_neighbors_local, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &adjust_neighbors_local_event);
ezcl_device_memory_delete(dev_indices_needed);
if (DEBUG){
fprintf(fp,"After adjusting neighbors to local indices\n");
print_dev_local();
}
ezcl_wait_for_events(1, &adjust_neighbors_local_event);
ezcl_event_release(adjust_neighbors_local_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_NEIGH_ADJUST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
offtile_ratio_local = (offtile_ratio_local*(double)offtile_local_count) + ((double)nghost / (double)ncells);
offtile_local_count++;
offtile_ratio_local /= offtile_local_count;
if (cell_handle) L7_Free(&cell_handle);
cell_handle=0;
if (DEBUG){
fprintf(fp,"%d: SETUP ncells %ld noffset %d nghost %d\n",mype,ncells,noffset,nghost);
for (int ic=0; ic<nghost; ic++){
fprintf(fp,"%d: indices needed ic %d index %d\n",mype,ic,indices_needed[ic]);
}
}
L7_Dev_Setup(0, noffset, ncells, &indices_needed[0], nghost, &cell_handle);
#ifdef BOUNDS_CHECK
{
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
vector<int> level_tmp(ncells_ghost);
vector<real_t> H_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
for (uint ic=0; ic<ncells; ic++){
int nl = nlft_tmp[ic];
if (nl<0 || nl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
if (level_tmp[nl] > level_tmp[ic]){
int ntl = ntop_tmp[nl];
if (ntl<0 || ntl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mype,__LINE__,ic,ic+noffset,nl,ntl);
}
int nr = nrht_tmp[ic];
if (nr<0 || nr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
if (level_tmp[nr] > level_tmp[ic]){
int ntr = ntop_tmp[nr];
if (ntr<0 || ntr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mype,__LINE__,ic,ntr);
}
int nb = nbot_tmp[ic];
if (nb<0 || nb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
if (level_tmp[nb] > level_tmp[ic]){
int nrb = nrht_tmp[nb];
if (nrb<0 || nrb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mype,__LINE__,ic,nrb);
}
int nt = ntop_tmp[ic];
if (nt<0 || nt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d ntop %d ncells %ld ncells_ghost %ld\n",mype,__LINE__,ic,ic+noffset,nt,ncells,ncells_ghost);
if (level_tmp[nt] > level_tmp[ic]){
int nrt = nrht_tmp[nt];
if (nrt<0 || nrt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mype,__LINE__,ic,nrt);
}
}
}
#endif
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_SETUP_COMM] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
}
if (DEBUG) {
print_dev_local();
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nlft numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",nlft_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nrht numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",nrht_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nbot numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",nbot_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n ntop numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",ntop_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
if (DEBUG) {
print_dev_local();
vector<int> i_tmp(ncells_ghost);
vector<int> j_tmp(ncells_ghost);
vector<int> level_tmp(ncells_ghost);
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
for (uint ic=0; ic<ncells; ic++){
fprintf(fp,"%d: before update ic %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
mype,ic,i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
}
int ig=0;
for (uint ic=ncells; ic<ncells_ghost; ic++, ig++){
fprintf(fp,"%d: after update ic %d off %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
mype,ic,indices_needed[ig],i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
}
}
}
#endif
ezcl_device_memory_delete(dev_sizes);
ezcl_device_memory_delete(dev_check);
gpu_compact_hash_delete(dev_hash, dev_hash_header);
gpu_timers[MESH_TIMER_CALC_NEIGHBORS] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
}
#endif
/*
 * Prints which neighbor-finding method is selected by calc_neighbor_type.
 *
 * HASH_TABLE: rank 0 prints a one-line notice and, when running on a single
 * process (numpe == 1), also calls final_hash_collision_report().
 * Otherwise: prints a size estimate followed (on rank 0) by the k-D tree notice.
 */
void Mesh::print_calc_neighbor_type(void)
{
   if ( calc_neighbor_type == HASH_TABLE ) {
      if (mype == 0) printf("Using hash tables to calculate neighbors\n");
      if (mype == 0 && numpe == 1) final_hash_collision_report();
   } else {
      // log(0) is -infinity and converting that to int is undefined, so an
      // empty mesh is reported with a zero factor instead.
      int log_ncells = (ncells > 0) ? (int)log((double)ncells) : 0;

      // The product involves sizeof(), so it is a size_t; it must be printed
      // with %zu rather than %ld.
      size_t table_bytes = ncells*log_ncells*sizeof(int);

      // NOTE(review): unlike the messages around it, this line is printed by
      // every rank, not only mype == 0 -- confirm that is intended.
      printf("hash table size %zu\n",table_bytes);
      if (mype == 0) printf("Using k-D tree to calculate neighbors\n");
   }
}
/*
 * Accessor for the neighbor-finding method.
 *
 * Returns the current value of calc_neighbor_type (compared against
 * HASH_TABLE in print_calc_neighbor_type).
 */
int Mesh::get_calc_neighbor_type(void)
{
   return calc_neighbor_type;
}
void Mesh::calc_celltype_threaded(size_t ncells)
{
int flags=0;
#ifdef HAVE_J7
if (parallel) flags = LOAD_BALANCE_MEMORY;
#endif
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
{
#endif
if (celltype == NULL || mesh_memory.get_memory_size(celltype) < ncells) {
if (celltype != NULL) celltype = (int *)mesh_memory.memory_delete(celltype);
celltype = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "celltype", flags);
}
#ifdef _OPENMP
}
#pragma omp barrier
#endif
#ifdef _OPENMP
#pragma omp for
#endif
for (uint ic=0; ic<ncells; ++ic) {
celltype[ic] = REAL_CELL;
if (is_left_boundary(ic) ) celltype[ic] = LEFT_BOUNDARY;
if (is_right_boundary(ic) ) celltype[ic] = RIGHT_BOUNDARY;
if (is_bottom_boundary(ic) ) celltype[ic] = BOTTOM_BOUNDARY;
if (is_top_boundary(ic)) celltype[ic] = TOP_BOUNDARY;
}
}
void Mesh::calc_celltype(size_t ncells)
{
int flags = 0;
#ifdef HAVE_J7
if (parallel) flags = LOAD_BALANCE_MEMORY;
#endif
if (celltype == NULL || mesh_memory.get_memory_size(celltype) < ncells) {
if (celltype != NULL) celltype = (int *)mesh_memory.memory_delete(celltype);
celltype = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "celltype", flags);
}
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (uint ic=0; ic<ncells; ++ic) {
celltype[ic] = REAL_CELL;
if (is_left_boundary(ic) ) celltype[ic] = LEFT_BOUNDARY;
if (is_right_boundary(ic) ) celltype[ic] = RIGHT_BOUNDARY;
if (is_bottom_boundary(ic) ) celltype[ic] = BOTTOM_BOUNDARY;
if (is_top_boundary(ic)) celltype[ic] = TOP_BOUNDARY;
}
}
/*
 * For every cell, looks up the cell found at its mirrored position and
 * records that cell's index.
 *
 * dsym -- result for mirroring in both x and y (the "diagonal" partner)
 * xsym -- result for mirroring in x only
 * ysym -- result for mirroring in y only
 *
 * Each entry starts out as the cell's own index and is replaced only when the
 * k-D tree query reports exactly one hit.  The arrays are indexed up to
 * ncells-1, so the caller must have sized them accordingly.
 *
 * The query point is x[ic]+0.5*dx[ic], y[ic]+0.5*dy[ic] with the sign of one
 * or both coordinates flipped; the query box is degenerate (min == max).
 */
void Mesh::calc_symmetry(vector<int> &dsym, vector<int> &xsym, vector<int> &ysym)
{
   TBounds probe;
   // NOTE(review): the result buffer holds IPOW2(levmx*levmx) entries --
   // confirm this is enough for what KDTree_QueryBoxIntersect may write.
   vector<int> hits( IPOW2(levmx*levmx) );
   int nhits;

   for (uint icell = 0; icell < ncells; icell++) {
      dsym[icell] = icell;
      xsym[icell] = icell;
      ysym[icell] = icell;

      // diagonal symmetry: both coordinates negated
      probe.min.x = -1.0*(x[icell]+0.5*dx[icell]);
      probe.max.x = probe.min.x;
      probe.min.y = -1.0*(y[icell]+0.5*dy[icell]);
      probe.max.y = probe.min.y;
      KDTree_QueryBoxIntersect(&tree, &nhits, &hits[0], &probe);
      if (nhits == 1) {
         dsym[icell] = hits[0];
      }

      // x symmetry: only x negated
      probe.min.x = -1.0*(x[icell]+0.5*dx[icell]);
      probe.max.x = probe.min.x;
      probe.min.y = y[icell]+0.5*dy[icell];
      probe.max.y = probe.min.y;
      KDTree_QueryBoxIntersect(&tree, &nhits, &hits[0], &probe);
      if (nhits == 1) {
         xsym[icell] = hits[0];
      }

      // y symmetry: only y negated
      probe.min.x = x[icell]+0.5*dx[icell];
      probe.max.x = probe.min.x;
      probe.min.y = -1.0*(y[icell]+0.5*dy[icell]);
      probe.max.y = probe.min.y;
      KDTree_QueryBoxIntersect(&tree, &nhits, &hits[0], &probe);
      if (nhits == 1) {
         ysym[icell] = hits[0];
      }
   }
}
#ifdef HAVE_MPI
void Mesh::do_load_balance_local(size_t numcells, float *weight, MallocPlus &state_memory)
{
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
// To get rid of compiler warning
if (DEBUG && weight != NULL) printf("DEBUG weight[0] = %f\n",weight[0]);
int ncells_old = numcells;
int noffset_old = ndispl[mype];
// Need to add weight array to load balance if it is not NULL
// Need to add tolerance to when load balance is done
int do_load_balance_global = 0;
int nsizes_old = 0;
for (int ip=0; ip<numpe; ip++){
nsizes_old = nsizes[ip];
// Calc new,even partition of data across processors
nsizes[ip] = ncells_global/numpe;
// Account for leftover cells
if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
if (nsizes_old != nsizes[ip]) do_load_balance_global = 1;
}
if (do_load_balance_global) {
cpu_counters[MESH_COUNTER_LOAD_BALANCE]++;
mesh_memory.memory_delete(celltype);
mesh_memory.memory_delete(nlft);
mesh_memory.memory_delete(nrht);
mesh_memory.memory_delete(nbot);
mesh_memory.memory_delete(ntop);
ndispl[0]=0;
for (int ip=1; ip<numpe; ip++){
ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
}
ncells = nsizes[mype];
noffset=ndispl[mype];
// Indices of blocks to be added to load balance
int lower_block_start = noffset;
int lower_block_end = min(noffset_old-1, (int)(noffset+ncells-1));
int upper_block_start = max((int)(noffset_old+ncells_old), noffset);
int upper_block_end = noffset+ncells-1;
int lower_block_size = max(lower_block_end-lower_block_start+1,0);
if(lower_block_end < 0) lower_block_size = 0; // Handles segfault at start of array
int upper_block_size = max(upper_block_end-upper_block_start+1,0);
int indices_needed_count = lower_block_size + upper_block_size;
int in = 0;
vector<int> indices_needed(indices_needed_count);
for (int iz = lower_block_start; iz <= lower_block_end; iz++, in++){
indices_needed[in]=iz;
}
for (int iz = upper_block_start; iz <= upper_block_end; iz++, in++){
indices_needed[in]=iz;
}
int load_balance_handle = 0;
L7_Setup(0, noffset_old, ncells_old, &indices_needed[0], indices_needed_count, &load_balance_handle);
//printf("\n%d: DEBUG load balance report\n",mype);
state_memory.memory_realloc_all(ncells_old+indices_needed_count);
MallocPlus state_memory_old = state_memory;
malloc_plus_memory_entry *memory_item;
for (memory_item = state_memory_old.memory_entry_by_name_begin();
memory_item != state_memory_old.memory_entry_by_name_end();
memory_item = state_memory_old.memory_entry_by_name_next() ) {
//if (mype == 0) printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
if (memory_item->mem_elsize == 8) {
double *mem_ptr_double = (double *)memory_item->mem_ptr;
int flags = state_memory.get_memory_flags(mem_ptr_double);
double *state_temp_double = (double *) state_memory.memory_malloc(ncells, sizeof(double),
"state_temp_double", flags);
//printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
L7_Update(mem_ptr_double, L7_DOUBLE, load_balance_handle);
in = 0;
if(lower_block_size > 0) {
for(; in < MIN(lower_block_size, (int)ncells); in++) {
state_temp_double[in] = mem_ptr_double[ncells_old + in];
}
}
for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
state_temp_double[in] = mem_ptr_double[ic];
}
if(upper_block_size > 0) {
int ic = ncells_old + lower_block_size;
for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
state_temp_double[in] = mem_ptr_double[ic+k];
}
}
state_memory.memory_replace(mem_ptr_double, state_temp_double);
} else if (memory_item->mem_elsize == 4) {
float *mem_ptr_float = (float *)memory_item->mem_ptr;
int flags = state_memory.get_memory_flags(mem_ptr_float);
float *state_temp_float = (float *) state_memory.memory_malloc(ncells, sizeof(float),
"state_temp_float", flags);
//printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
L7_Update(mem_ptr_float, L7_FLOAT, load_balance_handle);
in = 0;
if(lower_block_size > 0) {
for(; in < MIN(lower_block_size, (int)ncells); in++) {
state_temp_float[in] = mem_ptr_float[ncells_old + in];
}
}
for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
state_temp_float[in] = mem_ptr_float[ic];
}
if(upper_block_size > 0) {
int ic = ncells_old + lower_block_size;
for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
state_temp_float[in] = mem_ptr_float[ic+k];
}
}
state_memory.memory_replace(mem_ptr_float, state_temp_float);
}
}
mesh_memory.memory_realloc_all(ncells_old+indices_needed_count);
MallocPlus mesh_memory_old = mesh_memory;
for (memory_item = mesh_memory_old.memory_entry_by_name_begin();
memory_item != mesh_memory_old.memory_entry_by_name_end();
memory_item = mesh_memory_old.memory_entry_by_name_next() ) {
//if (mype == 0) printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
if (memory_item->mem_elsize == 8) {
long long *mem_ptr_long = (long long *)memory_item->mem_ptr;
int flags = mesh_memory.get_memory_flags(mem_ptr_long);
long long *mesh_temp_long = (long long *)mesh_memory.memory_malloc(ncells, sizeof(long long), "mesh_temp_long", flags);
//printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
L7_Update(mem_ptr_long, L7_LONG_LONG_INT, load_balance_handle);
in = 0;
if(lower_block_size > 0) {
for(; in < MIN(lower_block_size, (int)ncells); in++) {
mesh_temp_long[in] = mem_ptr_long[ncells_old + in];
}
}
for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
mesh_temp_long[in] = mem_ptr_long[ic];
}
if(upper_block_size > 0) {
int ic = ncells_old + lower_block_size;
for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
mesh_temp_long[in] = mem_ptr_long[ic+k];
}
}
mesh_memory.memory_replace(mem_ptr_long, mesh_temp_long);
} else {
int *mem_ptr_int = (int *)memory_item->mem_ptr;
int flags = mesh_memory.get_memory_flags(mem_ptr_int);
int *mesh_temp_int = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "mesh_temp_int", flags);
//printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
L7_Update(mem_ptr_int, L7_INT, load_balance_handle);
in = 0;
if(lower_block_size > 0) {
for(; in < MIN(lower_block_size, (int)ncells); in++) {
mesh_temp_int[in] = mem_ptr_int[ncells_old + in];
}
}
for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
mesh_temp_int[in] = mem_ptr_int[ic];
}
if(upper_block_size > 0) {
int ic = ncells_old + lower_block_size;
for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
mesh_temp_int[in] = mem_ptr_int[ic+k];
}
}
mesh_memory.memory_replace(mem_ptr_int, mesh_temp_int);
}
}
L7_Free(&load_balance_handle);
load_balance_handle = 0;
memory_reset_ptrs();
//mesh_memory.memory_report();
//state_memory.memory_report();
//printf("%d: DEBUG end load balance report\n\n",mype);
calc_celltype(ncells);
}
cpu_timers[MESH_TIMER_LOAD_BALANCE] += cpu_timer_stop(tstart_cpu);
}
#endif
#ifdef HAVE_OPENCL
#ifdef HAVE_MPI
int Mesh::gpu_do_load_balance_local(size_t numcells, float *weight, MallocPlus &gpu_state_memory)
{
int do_load_balance_global = 0;
if (! gpu_do_rezone) return(do_load_balance_global);
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
// To get rid of compiler warning
if (DEBUG && weight != NULL) printf("DEBUG weight[0] = %f\n",weight[0]);
int ncells_old = numcells;
int noffset_old = ndispl[mype];
// Need to add weight array to load balance if it is not NULL
// Need to add tolerance to when load balance is done
int nsizes_old = 0;
for (int ip=0; ip<numpe; ip++){
nsizes_old = nsizes[ip];
nsizes[ip] = ncells_global/numpe;
if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
if (nsizes_old != nsizes[ip]) do_load_balance_global = 1;
}
if(do_load_balance_global) {
cl_command_queue command_queue = ezcl_get_command_queue();
gpu_counters[MESH_COUNTER_LOAD_BALANCE]++;
ndispl[0]=0;
for (int ip=1; ip<numpe; ip++){
ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
}
ncells = nsizes[mype];
noffset=ndispl[mype];
// Indices of blocks to be added to load balance
int lower_block_start = noffset;
int lower_block_end = min(noffset_old-1, (int)(noffset+ncells-1));
int upper_block_start = max((int)(noffset_old+ncells_old), noffset);
int upper_block_end = noffset+ncells-1;
//printf("%d: lbs %d lbe %d ubs %d ube %d\n",mype,lower_block_start-noffset_old,lower_block_end-noffset_old,upper_block_start-noffset_old,upper_block_end-noffset_old);
size_t lower_block_size = max(lower_block_end-lower_block_start+1,0);
if(lower_block_end < 0) lower_block_size = 0; // Handles segfault at start of array
size_t upper_block_size = max(upper_block_end-upper_block_start+1,0);
int indices_needed_count = lower_block_size + upper_block_size;
size_t middle_block_size = ncells - lower_block_size - upper_block_size;
int middle_block_start = max(noffset - noffset_old, 0);
int lower_segment_size = noffset-noffset_old;
int do_whole_segment = 0;
if (lower_segment_size > ncells_old) do_whole_segment = 1;
int upper_segment_size = ( (noffset_old+ncells_old) - (noffset+ncells) );
int upper_segment_start = (noffset_old+ncells_old) - upper_segment_size - noffset_old;
if (upper_segment_size > ncells_old) do_whole_segment=1;
int in = 0;
vector<int> indices_needed(indices_needed_count);
for (int iz = lower_block_start; iz <= lower_block_end; iz++, in++){
indices_needed[in]=iz;
}
for (int iz = upper_block_start; iz <= upper_block_end; iz++, in++){
indices_needed[in]=iz;
}
int load_balance_handle = 0;
L7_Setup(0, noffset_old, ncells_old, &indices_needed[0], indices_needed_count, &load_balance_handle);
size_t local_work_size = 128;
size_t global_work_size = ((ncells + local_work_size - 1) / local_work_size) * local_work_size;
// printf("MYPE%d: \t ncells = %d \t ncells_old = %d \t ncells_global = %d \n", mype, ncells, ncells_old, ncells_global);
// Allocate lower block on GPU
size_t low_block_size = MAX(1, lower_block_size);
cl_mem dev_state_var_lower = ezcl_malloc(NULL, const_cast<char *>("dev_state_var_lower"), &low_block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
// Allocate upper block on GPU
size_t up_block_size = MAX(1, upper_block_size);
cl_mem dev_state_var_upper = ezcl_malloc(NULL, const_cast<char *>("dev_state_var_upper"), &up_block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
MallocPlus gpu_state_memory_old = gpu_state_memory;
malloc_plus_memory_entry *memory_item;
for (memory_item = gpu_state_memory_old.memory_entry_by_name_begin();
memory_item != gpu_state_memory_old.memory_entry_by_name_end();
memory_item = gpu_state_memory_old.memory_entry_by_name_next() ) {
//printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
cl_mem dev_state_mem_ptr = (cl_mem)memory_item->mem_ptr;
if (memory_item->mem_elsize == 8){
#ifndef MINIMUM_PRECISION
vector<double> state_var_tmp(ncells_old+indices_needed_count,0.0);
// Read current state values from GPU and write to CPU arrays
if (do_whole_segment) {
ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, ncells_old*sizeof(cl_double), &state_var_tmp[0], NULL);
} else {
// Read lower block from GPU
if (lower_segment_size > 0) {
ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, lower_segment_size*sizeof(cl_double), &state_var_tmp[0], NULL);
}
// Read upper block from GPU
if (upper_segment_size > 0) {
ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, upper_segment_start*sizeof(cl_double), upper_segment_size*sizeof(cl_double), &state_var_tmp[upper_segment_start], NULL);
}
}
// Update arrays with L7
L7_Update(&state_var_tmp[0], L7_DOUBLE, load_balance_handle);
// Set lower block on GPU
if(lower_block_size > 0) {
ezcl_enqueue_write_buffer(command_queue, dev_state_var_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_double), &state_var_tmp[ncells_old], NULL);
}
// Set upper block on GPU
if(upper_block_size > 0) {
ezcl_enqueue_write_buffer(command_queue, dev_state_var_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_double), &state_var_tmp[ncells_old+lower_block_size], NULL);
}
// Allocate space on GPU for temp arrays (used in double buffering)
cl_mem dev_state_var_new = ezcl_malloc(NULL, gpu_state_memory.get_memory_name(dev_state_mem_ptr), &ncells, sizeof(cl_double), CL_MEM_READ_WRITE, 0);
gpu_state_memory.memory_add(dev_state_var_new, ncells, sizeof(cl_double), "dev_state_var_new", DEVICE_REGULAR_MEMORY);
//printf("DEBUG memory for proc %d is %p dev_state_new is %p\n",mype,dev_state_mem_ptr,dev_state_var_new);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 0, sizeof(cl_int), &ncells);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 1, sizeof(cl_int), &lower_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 2, sizeof(cl_int), &middle_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 3, sizeof(cl_int), &middle_block_start);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 4, sizeof(cl_mem), &dev_state_mem_ptr);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 5, sizeof(cl_mem), &dev_state_var_lower);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 6, sizeof(cl_mem), &dev_state_var_upper);
ezcl_set_kernel_arg(kernel_do_load_balance_double, 7, sizeof(cl_mem), &dev_state_var_new);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_double, 1, NULL, &global_work_size, &local_work_size, NULL);
gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
#else
printf("ERROR -- can't have double type for state variable\n");
exit(1);
#endif
} else if (memory_item->mem_elsize == 4){
vector<float> state_var_tmp(ncells_old+indices_needed_count,0.0);
// Read current state values from GPU and write to CPU arrays
if (do_whole_segment) {
ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, ncells_old*sizeof(cl_float), &state_var_tmp[0], NULL);
} else {
// Read lower block from GPU
if (lower_segment_size > 0) {
ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, lower_segment_size*sizeof(cl_float), &state_var_tmp[0], NULL);
}
// Read upper block from GPU
if (upper_segment_size > 0) {
ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, upper_segment_start*sizeof(cl_float), upper_segment_size*sizeof(cl_float), &state_var_tmp[upper_segment_start], NULL);
}
}
// Update arrays with L7
L7_Update(&state_var_tmp[0], L7_FLOAT, load_balance_handle);
// Set lower block on GPU
if(lower_block_size > 0) {
ezcl_enqueue_write_buffer(command_queue, dev_state_var_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_float), &state_var_tmp[ncells_old], NULL);
}
// Set upper block on GPU
if(upper_block_size > 0) {
ezcl_enqueue_write_buffer(command_queue, dev_state_var_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_float), &state_var_tmp[ncells_old+lower_block_size], NULL);
}
// Allocate space on GPU for temp arrays (used in double buffering)
cl_mem dev_state_var_new = ezcl_malloc(NULL, gpu_state_memory.get_memory_name(dev_state_mem_ptr), &ncells, sizeof(cl_float), CL_MEM_READ_WRITE, 0);
gpu_state_memory.memory_add(dev_state_var_new, ncells, sizeof(cl_float), "dev_state_var_new", DEVICE_REGULAR_MEMORY);
//printf("DEBUG memory for proc %d is %p dev_state_new is %p\n",mype,dev_state_mem_ptr,dev_state_var_new);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 0, sizeof(cl_int), &ncells);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 1, sizeof(cl_int), &lower_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 2, sizeof(cl_int), &middle_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 3, sizeof(cl_int), &middle_block_start);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 4, sizeof(cl_mem), &dev_state_mem_ptr);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 5, sizeof(cl_mem), &dev_state_var_lower);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 6, sizeof(cl_mem), &dev_state_var_upper);
ezcl_set_kernel_arg(kernel_do_load_balance_float, 7, sizeof(cl_mem), &dev_state_var_new);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_float, 1, NULL, &global_work_size, &local_work_size, NULL);
gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
}
}
ezcl_device_memory_delete(dev_state_var_lower);
ezcl_device_memory_delete(dev_state_var_upper);
vector<int> i_tmp(ncells_old+indices_needed_count,0);
vector<int> j_tmp(ncells_old+indices_needed_count,0);
vector<int> level_tmp(ncells_old+indices_needed_count,0);
vector<int> celltype_tmp(ncells_old+indices_needed_count,0);
if (do_whole_segment) {
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_old*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_old*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_old*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE, 0, ncells_old*sizeof(cl_int), &celltype_tmp[0], NULL);
} else {
if (lower_segment_size > 0) {
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE, 0, lower_segment_size*sizeof(cl_int), &celltype_tmp[0], NULL);
}
if (upper_segment_size > 0) {
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &i_tmp[upper_segment_start], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &j_tmp[upper_segment_start], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &level_tmp[upper_segment_start], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &celltype_tmp[upper_segment_start], NULL);
}
}
L7_Update(&i_tmp[0], L7_INT, load_balance_handle);
L7_Update(&j_tmp[0], L7_INT, load_balance_handle);
L7_Update(&level_tmp[0], L7_INT, load_balance_handle);
L7_Update(&celltype_tmp[0], L7_INT, load_balance_handle);
L7_Free(&load_balance_handle);
load_balance_handle = 0;
// Allocate and set lower block on GPU
cl_mem dev_i_lower, dev_j_lower, dev_level_lower, dev_celltype_lower;
if(lower_block_size > 0) {
dev_i_lower = ezcl_malloc(NULL, const_cast<char *>("dev_i_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_j_lower = ezcl_malloc(NULL, const_cast<char *>("dev_j_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_level_lower = ezcl_malloc(NULL, const_cast<char *>("dev_level_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_celltype_lower = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_enqueue_write_buffer(command_queue, dev_i_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_int), &i_tmp[ncells_old], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_j_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_int), &j_tmp[ncells_old], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_level_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_int), &level_tmp[ncells_old], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_celltype_lower, CL_TRUE, 0, lower_block_size*sizeof(cl_int), &celltype_tmp[ncells_old], NULL);
}
// Allocate and set upper block on GPU
cl_mem dev_i_upper, dev_j_upper, dev_level_upper, dev_celltype_upper;
if(upper_block_size > 0) {
dev_i_upper = ezcl_malloc(NULL, const_cast<char *>("dev_i_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_j_upper = ezcl_malloc(NULL, const_cast<char *>("dev_j_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_level_upper = ezcl_malloc(NULL, const_cast<char *>("dev_level_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_celltype_upper = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_enqueue_write_buffer(command_queue, dev_i_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_int), &i_tmp[ncells_old+lower_block_size], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_j_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_int), &j_tmp[ncells_old+lower_block_size], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_level_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_int), &level_tmp[ncells_old+lower_block_size], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_celltype_upper, CL_TRUE, 0, upper_block_size*sizeof(cl_int), &celltype_tmp[ncells_old+lower_block_size], NULL);
}
local_work_size = 128;
// printf("MYPE%d: \t ncells = %d \t ncells_old = %d \t ncells_global = %d \n", mype, ncells, ncells_old, ncells_global);
// Allocate space on GPU for temp arrays (used in double buffering)
size_t mem_request = (int)((float)ncells*mem_factor);
cl_mem dev_i_new = ezcl_malloc(NULL, const_cast<char *>("dev_i_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_j_new = ezcl_malloc(NULL, const_cast<char *>("dev_j_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_level_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_celltype_new = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
// Set kernel arguments and call lower block kernel
if(lower_block_size > 0) {
size_t global_work_size = ((lower_block_size + local_work_size - 1) / local_work_size) * local_work_size;
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 0, sizeof(cl_mem), &dev_i_new);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 1, sizeof(cl_mem), &dev_j_new);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 2, sizeof(cl_mem), &dev_level_new);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 3, sizeof(cl_mem), &dev_celltype_new);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 4, sizeof(cl_mem), &dev_i_lower);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 5, sizeof(cl_mem), &dev_j_lower);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 6, sizeof(cl_mem), &dev_level_lower);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 7, sizeof(cl_mem), &dev_celltype_lower);
ezcl_set_kernel_arg(kernel_do_load_balance_lower, 8, sizeof(cl_int), &lower_block_size);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_lower, 1, NULL, &global_work_size, &local_work_size, NULL);
ezcl_device_memory_delete(dev_i_lower);
ezcl_device_memory_delete(dev_j_lower);
ezcl_device_memory_delete(dev_level_lower);
ezcl_device_memory_delete(dev_celltype_lower);
}
// Set kernel arguments and call middle block kernel
if(middle_block_size > 0) {
size_t global_work_size = ((middle_block_size + local_work_size - 1) / local_work_size) * local_work_size;
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 0, sizeof(cl_mem), &dev_i_new);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 1, sizeof(cl_mem), &dev_j_new);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 2, sizeof(cl_mem), &dev_level_new);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 3, sizeof(cl_mem), &dev_celltype_new);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 4, sizeof(cl_mem), &dev_i);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 5, sizeof(cl_mem), &dev_j);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 6, sizeof(cl_mem), &dev_level);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 7, sizeof(cl_mem), &dev_celltype);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 8, sizeof(cl_int), &lower_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 9, sizeof(cl_int), &middle_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_middle, 10, sizeof(cl_int), &middle_block_start);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_middle, 1, NULL, &global_work_size, &local_work_size, NULL);
}
// Set kernel arguments and call upper block kernel
if(upper_block_size > 0) {
size_t global_work_size = ((upper_block_size + local_work_size - 1) / local_work_size) * local_work_size;
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 0, sizeof(cl_mem), &dev_i_new);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 1, sizeof(cl_mem), &dev_j_new);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 2, sizeof(cl_mem), &dev_level_new);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 3, sizeof(cl_mem), &dev_celltype_new);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 4, sizeof(cl_mem), &dev_i_upper);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 5, sizeof(cl_mem), &dev_j_upper);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 6, sizeof(cl_mem), &dev_level_upper);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 7, sizeof(cl_mem), &dev_celltype_upper);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 8, sizeof(cl_int), &lower_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 9, sizeof(cl_int), &middle_block_size);
ezcl_set_kernel_arg(kernel_do_load_balance_upper, 10, sizeof(cl_int), &upper_block_size);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_upper, 1, NULL, &global_work_size, &local_work_size, NULL);
ezcl_device_memory_delete(dev_i_upper);
ezcl_device_memory_delete(dev_j_upper);
ezcl_device_memory_delete(dev_level_upper);
ezcl_device_memory_delete(dev_celltype_upper);
}
ezcl_device_memory_swap(&dev_i_new, &dev_i);
ezcl_device_memory_swap(&dev_j_new, &dev_j);
ezcl_device_memory_swap(&dev_level_new, &dev_level);
ezcl_device_memory_swap(&dev_celltype_new, &dev_celltype);
ezcl_device_memory_delete(dev_i_new);
ezcl_device_memory_delete(dev_j_new);
ezcl_device_memory_delete(dev_level_new);
ezcl_device_memory_delete(dev_celltype_new);
gpu_timers[MESH_TIMER_LOAD_BALANCE] += (long int)(cpu_timer_stop(tstart_cpu)*1.0e9);
}
return(do_load_balance_global);
}
#endif
#endif
#ifdef HAVE_OPENCL
int Mesh::gpu_count_BCs(void)
{
cl_event count_BCs_stage1_event, count_BCs_stage2_event;
size_t local_work_size = MIN(ncells, TILE_SIZE);
size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
//size_t block_size = (ncells + TILE_SIZE - 1) / TILE_SIZE; // For on-device global reduction kernel.
size_t block_size = global_work_size/local_work_size;
int bcount = 0;
if (! have_boundary) {
cl_command_queue command_queue = ezcl_get_command_queue();
cl_mem dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
/*
__kernel void count_BCs(
const int isize, // 0
__global const int *i, // 1
__global const int *j, // 2
__global const int *level, // 3
__global const int *lev_ibeg, // 4
__global const int *lev_iend, // 5
__global const int *lev_jbeg, // 6
__global const int *lev_jend, // 7
__global int *scratch, // 8
__local int *tile) // 9
*/
size_t shared_spd_sum_int = local_work_size * sizeof(cl_int);
ezcl_set_kernel_arg(kernel_count_BCs, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_count_BCs, 1, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_count_BCs, 2, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_count_BCs, 3, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_count_BCs, 4, sizeof(cl_mem), (void *)&dev_levibeg);
ezcl_set_kernel_arg(kernel_count_BCs, 5, sizeof(cl_mem), (void *)&dev_leviend);
ezcl_set_kernel_arg(kernel_count_BCs, 6, sizeof(cl_mem), (void *)&dev_levjbeg);
ezcl_set_kernel_arg(kernel_count_BCs, 7, sizeof(cl_mem), (void *)&dev_levjend);
ezcl_set_kernel_arg(kernel_count_BCs, 8, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_count_BCs, 9, shared_spd_sum_int, 0);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_count_BCs, 1, NULL, &global_work_size, &local_work_size, &count_BCs_stage1_event);
if (block_size > 1) {
ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 0, sizeof(cl_int), (void *)&block_size);
ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 2, shared_spd_sum_int, 0);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduce_sum_int_stage2of2, 1, NULL, &local_work_size, &local_work_size, &count_BCs_stage2_event);
}
ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, 1*sizeof(cl_int), &bcount, NULL);
//printf("DEBUG -- bcount is %d\n",bcount);
//state->gpu_time_read += ezcl_timer_calc(&start_read_event, &start_read_event);
ezcl_device_memory_delete(dev_ioffset);
gpu_timers[MESH_TIMER_COUNT_BCS] += ezcl_timer_calc(&count_BCs_stage1_event, &count_BCs_stage1_event);
if (block_size > 1) {
gpu_timers[MESH_TIMER_COUNT_BCS] += ezcl_timer_calc(&count_BCs_stage2_event, &count_BCs_stage2_event);
}
}
return(bcount);
}
#endif
// Allocate the i, j and level index arrays (ncells ints each) through
// mesh_memory. The arrays are tagged RESTART_DATA, except that a build with
// HAVE_J7 tags them LOAD_BALANCE_MEMORY instead when running in parallel.
void Mesh::allocate(size_t ncells)
{
#ifdef HAVE_J7
   const int mem_flags = parallel ? LOAD_BALANCE_MEMORY : RESTART_DATA;
#else
   const int mem_flags = RESTART_DATA;
#endif

   i     = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "i",     mem_flags);
   j     = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "j",     mem_flags);
   level = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "level", mem_flags);
}
// Grow-only resize: when new_ncells exceeds the size recorded for the i array,
// ask mesh_memory to reallocate all of its arrays to new_ncells. Otherwise do
// nothing.
void Mesh::resize(size_t new_ncells)
{
   const size_t allocated_cells = mesh_memory.get_memory_size(i);

   if (new_ncells <= allocated_cells) return;

   mesh_memory.memory_realloc_all(new_ncells);
}
// Re-fetch the member array pointers from mesh_memory by name, so that they
// refer to whatever buffers are currently registered under those names.
void Mesh::memory_reset_ptrs(void)
{
   // Cell indices and refinement level
   i        = static_cast<int *>(mesh_memory.get_memory_ptr("i"));
   j        = static_cast<int *>(mesh_memory.get_memory_ptr("j"));
   level    = static_cast<int *>(mesh_memory.get_memory_ptr("level"));

   // Cell type
   celltype = static_cast<int *>(mesh_memory.get_memory_ptr("celltype"));

   // Neighbor arrays
   nlft     = static_cast<int *>(mesh_memory.get_memory_ptr("nlft"));
   nrht     = static_cast<int *>(mesh_memory.get_memory_ptr("nrht"));
   nbot     = static_cast<int *>(mesh_memory.get_memory_ptr("nbot"));
   ntop     = static_cast<int *>(mesh_memory.get_memory_ptr("ntop"));
}
// Replace the device copies of level, i, j and celltype with newly allocated
// buffers sized for ncells scaled by mem_factor. The old buffers are deleted
// first; their contents are not copied. Without HAVE_OPENCL this does nothing.
void Mesh::resize_old_device_memory(size_t ncells)
{
#ifdef HAVE_OPENCL
   // Release the existing buffers.
   ezcl_device_memory_delete(dev_level);
   ezcl_device_memory_delete(dev_i);
   ezcl_device_memory_delete(dev_j);
   ezcl_device_memory_delete(dev_celltype);

   // Element count: ncells times mem_factor, truncated to an int.
   size_t padded_count = static_cast<int>(static_cast<float>(ncells)*mem_factor);

   // Allocate the replacements.
   dev_level    = ezcl_malloc(NULL, const_cast<char *>("dev_level"),    &padded_count, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
   dev_i        = ezcl_malloc(NULL, const_cast<char *>("dev_i"),        &padded_count, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
   dev_j        = ezcl_malloc(NULL, const_cast<char *>("dev_j"),        &padded_count, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
   dev_celltype = ezcl_malloc(NULL, const_cast<char *>("dev_celltype"), &padded_count, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
#else
   // Never-taken branch whose only purpose is to reference the parameter and
   // silence the unused-argument warning.
   if (1 == 2) printf("DEBUG -- ncells is %lu\n",ncells);
#endif
}
// Print a summary of this Mesh object to stdout: dimensionality, parallel
// layout, cell counts, and for each mesh array its address, element count and
// element size. With HAVE_OPENCL the device buffers are listed as well.
//
// Every argument is cast to the exact type its conversion specifier expects
// (%ld -> long, %p -> void *), so the calls stay well-defined regardless of
// the width of size_t or the pointer type being printed.
void Mesh::print_object_info(void)
{
   printf(" ---- Mesh object info -----\n");
   printf("Dimensionality : %d\n",ndim);
   printf("Parallel info : mype %d numpe %d noffset %d parallel %d\n",mype,numpe,noffset,parallel);
   printf("Sizes : ncells %ld ncells_ghost %ld\n\n",(long)ncells,(long)ncells_ghost);

#ifdef HAVE_OPENCL
   int num_elements, elsize;

   num_elements = ezcl_get_device_mem_nelements(dev_celltype);
   elsize = ezcl_get_device_mem_elsize(dev_celltype);
   printf("dev_celltype ptr : %p nelements %d elsize %d\n",(void *)dev_celltype,num_elements,elsize);

   num_elements = ezcl_get_device_mem_nelements(dev_level);
   elsize = ezcl_get_device_mem_elsize(dev_level);
   printf("dev_level ptr : %p nelements %d elsize %d\n",(void *)dev_level,num_elements,elsize);

   num_elements = ezcl_get_device_mem_nelements(dev_i);
   elsize = ezcl_get_device_mem_elsize(dev_i);
   printf("dev_i ptr : %p nelements %d elsize %d\n",(void *)dev_i,num_elements,elsize);

   num_elements = ezcl_get_device_mem_nelements(dev_j);
   elsize = ezcl_get_device_mem_elsize(dev_j);
   printf("dev_j ptr : %p nelements %d elsize %d\n",(void *)dev_j,num_elements,elsize);

   num_elements = ezcl_get_device_mem_nelements(dev_nlft);
   elsize = ezcl_get_device_mem_elsize(dev_nlft);
   printf("dev_nlft ptr : %p nelements %d elsize %d\n",(void *)dev_nlft,num_elements,elsize);

   num_elements = ezcl_get_device_mem_nelements(dev_nrht);
   elsize = ezcl_get_device_mem_elsize(dev_nrht);
   printf("dev_nrht ptr : %p nelements %d elsize %d\n",(void *)dev_nrht,num_elements,elsize);

   num_elements = ezcl_get_device_mem_nelements(dev_nbot);
   elsize = ezcl_get_device_mem_elsize(dev_nbot);
   printf("dev_nbot ptr : %p nelements %d elsize %d\n",(void *)dev_nbot,num_elements,elsize);

   num_elements = ezcl_get_device_mem_nelements(dev_ntop);
   elsize = ezcl_get_device_mem_elsize(dev_ntop);
   printf("dev_ntop ptr : %p nelements %d elsize %d\n",(void *)dev_ntop,num_elements,elsize);
#endif

   // Host arrays: address, element count as recorded by mesh_memory, element size.
   printf("vector celltype ptr : %p nelements %ld elsize %ld\n",(void *)&celltype[0],(long)mesh_memory.get_memory_size(celltype),(long)sizeof(celltype[0]));
   printf("vector level ptr : %p nelements %ld elsize %ld\n",(void *)&level[0],(long)mesh_memory.get_memory_size(level),(long)sizeof(level[0]));
   printf("vector i ptr : %p nelements %ld elsize %ld\n",(void *)&i[0],(long)mesh_memory.get_memory_size(i),(long)sizeof(i[0]));
   printf("vector j ptr : %p nelements %ld elsize %ld\n",(void *)&j[0],(long)mesh_memory.get_memory_size(j),(long)sizeof(j[0]));
   printf("vector nlft ptr : %p nelements %ld elsize %ld\n",(void *)&nlft[0],(long)mesh_memory.get_memory_size(nlft),(long)sizeof(nlft[0]));
   printf("vector nrht ptr : %p nelements %ld elsize %ld\n",(void *)&nrht[0],(long)mesh_memory.get_memory_size(nrht),(long)sizeof(nrht[0]));
   printf("vector nbot ptr : %p nelements %ld elsize %ld\n",(void *)&nbot[0],(long)mesh_memory.get_memory_size(nbot),(long)sizeof(nbot[0]));
   printf("vector ntop ptr : %p nelements %ld elsize %ld\n",(void *)&ntop[0],(long)mesh_memory.get_memory_size(ntop),(long)sizeof(ntop[0]));
}
void Mesh::set_refinement_order(int order[4], int ic, int ifirst, int ilast, int jfirst, int jlast,
int level_first, int level_last, int *i_old, int *j_old, int *level_old)
{
if (localStencil) {
// Store the coordinates of the cells before and after this one on
// the space-filling curve index.
#ifdef __OLD_STENCIL__
spatial_t nx[3], // x-coordinates of cells.
ny[3]; // y-coordinates of cells.
if (ic != 0) {
nx[0] = lev_deltax[level_old[ic-1]] * (spatial_t)i[ic-1];
ny[0] = lev_deltay[level_old[ic-1]] * (spatial_t)j[ic-1];
} else {
nx[0] = lev_deltax[level_first] * (spatial_t)ifirst;
ny[0] = lev_deltay[level_first] * (spatial_t)jfirst;
}
nx[1] = lev_deltax[level_old[ic ]] * (spatial_t)i[ic ];
ny[1] = lev_deltay[level_old[ic ]] * (spatial_t)j[ic ];
if (ic != ncells-1) {
nx[2] = lev_deltax[level_old[ic+1]] * (spatial_t)i[ic+1];
ny[2] = lev_deltay[level_old[ic+1]] * (spatial_t)j[ic+1];
} else {
nx[2] = lev_deltax[level_last] * (spatial_t)ilast;
ny[2] = lev_deltay[level_last] * (spatial_t)jlast;
}
// Figure out relative orientation of the neighboring cells. We are
// are aided in this because the Hilbert curve only has six possible
// ways across the cell: four Ls and two straight lines. Then
// refine the cell according to the relative orientation and order
// according to the four-point Hilbert stencil.
if (nx[0] < nx[1] and ny[2] < ny[1]) // southwest L, forward order
{ order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
else if (nx[2] < nx[1] and ny[0] < ny[1]) // southwest L, reverse order
{ order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW; }
else if (nx[0] > nx[1] and ny[2] < ny[1]) // southeast L, forward order
{ order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW; }
else if (nx[2] > nx[1] and ny[0] < ny[1]) // southeast L, reverse order
{ order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
else if (nx[0] > nx[1] and ny[2] > ny[1]) // northeast L, forward order
{ order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE; }
else if (nx[2] > nx[1] and ny[0] > ny[1]) // northeast L, reverse order
{ order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE; }
else if (nx[0] < nx[1] and ny[2] > ny[1]) // northwest L, forward order
{ order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW; }
else if (nx[2] < nx[1] and ny[0] > ny[1]) // northwest L, reverse order
{ order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW; }
else if (nx[0] > nx[1] and nx[1] > nx[2]) // straight horizontal, forward order
{ order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW; }
else if (nx[0] < nx[1] and nx[1] < nx[2]) // straight horizontal, reverse order
{ order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
else if (ny[0] > ny[1] and ny[1] > ny[2]) // straight vertical, forward order
{ order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE; }
else if (ny[0] < ny[1] and ny[1] < ny[2]) // straight vertical, reverse order
{ order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW; }
else // other, default to z-order
{ order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE; }
#endif
#ifdef __NEW_STENCIL__
int ir[3], // First i index at finest level of the mesh
jr[3]; // First j index at finest level of the mesh
// Cell's Radius at the Finest level of the mesh
int crf = IPOW2(levmx-level_old[ic]);
if (ic != 0) {
ir[0] = i_old[ic - 1] * IPOW2(levmx-level_old[ic - 1]);
jr[0] = j_old[ic - 1] * IPOW2(levmx-level_old[ic - 1]);
} else {
//printf("%d cell %d is a first\n",mype,ic);
ir[0] = ifirst * IPOW2(levmx-level_first);
jr[0] = jfirst * IPOW2(levmx-level_first);
}
ir[1] = i_old[ic ] * IPOW2(levmx-level_old[ic ]);
jr[1] = j_old[ic ] * IPOW2(levmx-level_old[ic ]);
if (ic != (int)ncells-1) {
ir[2] = i_old[ic + 1] * IPOW2(levmx-level_old[ic + 1]);
jr[2] = j_old[ic + 1] * IPOW2(levmx-level_old[ic + 1]);
} else {
//printf("%d cell %d is a last\n",mype,ic);
ir[2] = ilast * IPOW2(levmx-level_last);
jr[2] = jlast * IPOW2(levmx-level_last);
}
//if (parallel) fprintf(fp,"%d: DEBUG rezone top boundary -- ic %d global %d noffset %d nc %d i %d j %d level %d\n",mype,ic,ic+noffset,noffset,nc,i[nc],j[nc],level[nc]);
int dir_in = ir[1] - ir[0];
int dir_out = ir[1] - ir[2];
int djr_in = jr[1] - jr[0];
int djr_out = jr[1] - jr[2];
char in_direction = 'X';
char out_direction = 'X';
// Left In
if( (djr_in == 0 && (dir_in == crf*HALF || dir_in == crf || dir_in == crf*TWO)) || (djr_in == -crf*HALF && dir_in == crf*HALF) || (djr_in == crf && dir_in == crf*TWO) ) {
in_direction = 'L';
}
// Bottom In
else if( (dir_in == 0 && (djr_in == crf*HALF || djr_in == crf || djr_in == crf*TWO)) || (dir_in == -crf*HALF && djr_in == crf*HALF) || (dir_in == crf && djr_in == crf*TWO) ) {
in_direction = 'B';
}
// Right In
else if( (dir_in == -crf && (djr_in == -crf*HALF || djr_in == 0 || (djr_in == crf && level_old[ic-1] < level_old[ic]))) ) {
in_direction = 'R';
}
// Top In
else if( (djr_in == -crf && (dir_in == -crf*HALF || dir_in == 0 || (dir_in == crf && level_old[ic-1] < level_old[ic]))) ) {
in_direction = 'T';
}
// Further from the left
else if( dir_in > 0 && djr_in == 0 ) {
in_direction = 'L';
}
// Further from the right
else if( dir_in < 0 && djr_in == 0 ) {
in_direction = 'R';
}
// Further from the bottom
else if( djr_in > 0 && dir_in == 0 ) {
in_direction = 'B';
}
// Further from the top
else if( djr_in < 0 && dir_in == 0 ) {
in_direction = 'T';
}
// SW in; 'M'
else if( dir_in > 0 && djr_in > 0) {
in_direction = 'M';
}
// NW in; 'W'
else if( dir_in > 0 && djr_in < 0) {
in_direction = 'W';
}
// SE in; 'F'
else if( dir_in < 0 && djr_in > 0) {
in_direction = 'F';
}
// NE in; 'E'
else if( dir_in < 0 && djr_in < 0) {
in_direction = 'E';
}
// Left Out
if( (djr_out == 0 && (dir_out == crf*HALF || dir_out == crf || dir_out == crf*TWO)) || (djr_out == -crf*HALF && dir_out == crf*HALF) || (djr_out == crf && dir_out == crf*TWO) ) {
out_direction = 'L';
}
// Bottom Out
else if( (dir_out == 0 && (djr_out == crf*HALF || djr_out == crf || djr_out == crf*TWO)) || (dir_out == -crf*HALF && djr_out == crf*HALF) || (dir_out == crf && djr_out == crf*TWO) ) {
out_direction = 'B';
}
// Right Out
else if( (dir_out == -crf && (djr_out == -crf*HALF || djr_out == 0 || (djr_out == crf && level_old[ic+1] < level_old[ic]))) ) {
out_direction = 'R';
}
// Top Out
else if( (djr_out == -crf && (dir_out == -crf*HALF || dir_out == 0 || (dir_out == crf && level_old[ic+1] < level_old[ic]))) ) {
out_direction = 'T';
}
// Further from the left
else if( dir_out > 0 && djr_out == 0 ) {
out_direction = 'L';
}
// Further from the right
else if( dir_out < 0 && djr_out == 0 ) {
out_direction = 'R';
}
// Further from the bottom
else if( djr_out > 0 && dir_out == 0 ) {
out_direction = 'B';
}
// Further from the top
else if( djr_out < 0 && dir_out == 0 ) {
out_direction = 'T';
}
// SW out; 'M'
else if( dir_out > 0 && djr_out > 0) {
out_direction = 'M';
}
// NW out; 'W'
else if( dir_out > 0 && djr_out < 0) {
out_direction = 'W';
}
// SE out; 'F'
else if( dir_out < 0 && djr_out > 0) {
out_direction = 'F';
}
// NE out; 'E'
else if( dir_out < 0 && djr_out < 0) {
out_direction = 'E';
}
// Set the Stencil
if(in_direction == 'L' && (out_direction == 'B' || out_direction == 'R' || out_direction == 'F')) {
order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
}
else if(in_direction == 'L' && (out_direction == 'T' || out_direction == 'W' )) {
order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
}
else if(in_direction == 'L' && out_direction == 'M') {
order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW;
}
else if(in_direction == 'L' && out_direction == 'E') {
order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE;
}
else if(in_direction == 'B' && (out_direction == 'R' || out_direction == 'F' )) {
order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
}
else if(in_direction == 'B' && (out_direction == 'L' || out_direction == 'T' || out_direction == 'W' )) {
order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
}
else if(in_direction == 'B' && out_direction == 'M') {
order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW;
}
else if(in_direction == 'B' && out_direction == 'E') {
order[0] = SW; order[1] = NW; order[2] = SE; order[3] = NE;
}
else if(in_direction == 'R' && (out_direction == 'T' || out_direction == 'L' || out_direction == 'W' )) {
order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
}
else if(in_direction == 'R' && (out_direction == 'B' || out_direction == 'F' )) {
order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
}
else if(in_direction == 'R' && out_direction == 'M') {
order[0] = NE; order[1] = NW; order[2] = SE; order[3] = SW;
}
else if(in_direction == 'R' && out_direction == 'E') {
order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE;
}
else if(in_direction == 'T' && (out_direction == 'L' || out_direction == 'W' )) {
order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
}
else if(in_direction == 'T' && (out_direction == 'R' || out_direction == 'B' || out_direction == 'F' )) {
order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
}
else if(in_direction == 'T' && out_direction == 'M') {
order[0] = NE; order[1] = SE; order[2] = NW; order[3] = SW;
}
else if(in_direction == 'T' && out_direction == 'E') {
order[0] = NW; order[1] = SW; order[2] = SE; order[3] = NE;
}
else if(in_direction == 'M' && (out_direction == 'L' || out_direction == 'W' || out_direction == 'T') ) {
order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
}
else if(in_direction == 'M' && (out_direction == 'R' || out_direction == 'F' || out_direction == 'B') ) {
order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
}
else if(in_direction == 'M' && out_direction == 'E') {
order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE;
}
else if(in_direction == 'W' && (out_direction == 'L' || out_direction == 'M' || out_direction == 'B') ) {
order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW;
}
else if(in_direction == 'W' && (out_direction == 'R' || out_direction == 'E' || out_direction == 'T') ) {
order[0] = NW; order[1] = SW; order[2] = SE; order[3] = NE;
}
else if(in_direction == 'W' && out_direction == 'F') {
order[0] = NW; order[1] = NE; order[2] = SW; order[3] = SE;
}
else if(in_direction == 'F' && (out_direction == 'L' || out_direction == 'M' || out_direction == 'B') ) {
order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW;
}
else if(in_direction == 'F' && (out_direction == 'R' || out_direction == 'E' || out_direction == 'T') ) {
order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE;
}
else if(in_direction == 'F' && out_direction == 'W') {
order[0] = SE; order[1] = NE; order[2] = SW; order[3] = NW;
}
else if(in_direction == 'E' && (out_direction == 'L' || out_direction == 'W' || out_direction == 'T') ) {
order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
}
else if(in_direction == 'E' && (out_direction == 'R' || out_direction == 'F' || out_direction == 'B') ) {
order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
}
else if(in_direction == 'E' && out_direction == 'M') {
order[0] = NE; order[1] = SE; order[2] = NW; order[3] = SW;
}
else { // Default to a knot
order[0] = NW; order[1] = SE; order[2] = SW; order[3] = NE;
if (do_stencil_warning) {
printf("Nonlocal case for the stencil.\n");
}
}
// Determine the relative orientation of the neighboring cells.
// There are 12 possible ways across the cell: 4 Ls and 2 straight
// lines, each with 2 directions of traversal.
// Then the cell is refined and ordered according to the relative
// orientation and four-point Hilbert stencil.
// XXX NOTE that the four-point stencil varies depending upon
// the starting and ending point of the global Hilbert curve.
// The stencil applied here assumes the start at (0,0) and the end
// at (0,y_max). XXX WRONG
#endif
} // End local stencil version
else // Use Z-ordering for the curve.
{ order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE; }
}
void Mesh::calc_face_list(void)
{
xface_i.clear();
xface_j.clear();
xface_level.clear();
ixmin_level.clear();
ixmax_level.clear();
jxmin_level.clear();
jxmax_level.clear();
ixmin_level.resize(levmx+1, 9999999);
ixmax_level.resize(levmx+1, -9999999);
jxmin_level.resize(levmx+1, 9999999);
jxmax_level.resize(levmx+1, -9999999);
ixadjust.clear();
ixadjust.resize(levmx+1);
jxadjust.clear();
jxadjust.resize(levmx+1);
int iface=0;
for (int nz=0; nz<(int)ncells; nz++){
int nr = nrht[nz];
if (nr == nz) continue;
int ifactor = 1;
if (level[nr] < level[nz]) ifactor = 2;
// Have right face
//printf("DEBUG xface -- iface %d lower nz %d upper nr %d\n",iface,nz,nr);
xface_level.push_back(MAX(level[nz],level[nr]));
xface_i.push_back(i[nr]*ifactor);
if (level[nr] < level[nz] && is_upper(j[nz]) ) {
xface_j.push_back(j[nr]*ifactor+1);
} else {
xface_j.push_back(j[nr]*ifactor);
}
iface++;
if (level[nr] > level[nz] && is_lower(j[nr]) ){
int ntr = ntop[nr];
if (ntr != nr) {
//printf("DEBUG xface -- iface %d lower nz %d upper ntr %d\n",iface,nz,ntr);
xface_level.push_back(MAX(level[nz],level[ntr]));
xface_i.push_back(i[ntr]*ifactor);
xface_j.push_back(j[ntr]*ifactor);
iface++;
}
}
}
nxface=iface;
yface_i.clear();
yface_j.clear();
yface_level.clear();
iymin_level.clear();
iymax_level.clear();
jymin_level.clear();
jymax_level.clear();
iymin_level.resize(levmx+1, 9999999);
iymax_level.resize(levmx+1, -9999999);
jymin_level.resize(levmx+1, 9999999);
jymax_level.resize(levmx+1, -9999999);
iyadjust.clear();
iyadjust.resize(levmx+1);
jyadjust.clear();
jyadjust.resize(levmx+1);
iface=0;
for (int nz=0; nz<(int)ncells; nz++){
int nt = ntop[nz];
if (nt == nz) continue;
int ifactor = 1;
if (level[nt] < level[nz]) ifactor = 2;
// Have top face
//printf("DEBUG yface -- iface %d lower nz %d upper nt %d\n",iface,nz,nt);
yface_level.push_back(MAX(level[nz],level[nt]));
yface_j.push_back(j[nt]*ifactor);
if (level[nt] < level[nz] && is_upper(i[nz]) ) {
yface_i.push_back(i[nt]*ifactor+1);
} else{
yface_i.push_back(i[nt]*ifactor);
}
iface++;
if (level[nt] > level[nz] && is_lower(i[nt]) ){
int nrt = nrht[nt];
if (nrt != nt) {
//printf("DEBUG yface -- iface %d lower nz %d upper nrt %d\n",iface,nz,nrt);
yface_level.push_back(MAX(level[nz],level[nrt]));
yface_j.push_back(j[nrt]*ifactor);
yface_i.push_back(i[nrt]*ifactor);
iface++;
}
}
}
nyface=iface;
for (int iface=0; iface < nxface; iface++){
int fl = xface_level[iface];
int fi = xface_i[iface];
if (fi < ixmin_level[fl]) ixmin_level[fl] = fi;
if (fi > ixmax_level[fl]) ixmax_level[fl] = fi;
int fj = xface_j[iface];
if (fj < jxmin_level[fl]) jxmin_level[fl] = fj;
if (fj > jxmax_level[fl]) jxmax_level[fl] = fj;
}
for (int iface=0; iface < nxface; iface++){
int fl = xface_level[iface];
if (ixmax_level[fl] < ixmin_level[fl]) continue;
xface_i[iface] -= ixmin_level[fl];
xface_j[iface] -= jxmin_level[fl];
}
for (int fl = 0; fl <= levmx; fl++){
ixadjust[fl] = ixmin_level[fl];
jxadjust[fl] = jxmin_level[fl];
ixmax_level[fl] -= ixmin_level[fl];;
jxmax_level[fl] -= jxmin_level[fl];
ixmin_level[fl] = 0;
jxmin_level[fl] = 0;
}
for (int iface=0; iface < nyface; iface++){
int fl = yface_level[iface];
int fi = yface_i[iface];
if (fi < iymin_level[fl]) iymin_level[fl] = fi;
if (fi > iymax_level[fl]) iymax_level[fl] = fi;
int fj = yface_j[iface];
if (fj < jymin_level[fl]) jymin_level[fl] = fj;
if (fj > jymax_level[fl]) jymax_level[fl] = fj;
}
for (int iface=0; iface < nyface; iface++){
int fl = yface_level[iface];
if (iymax_level[fl] < iymin_level[fl]) continue;
yface_i[iface] -= iymin_level[fl];
yface_j[iface] -= jymin_level[fl];
}
for (int fl = 0; fl <= levmx; fl++){
iyadjust[fl] = iymin_level[fl];
jyadjust[fl] = jymin_level[fl];
iymax_level[fl] -= iymin_level[fl];;
jymax_level[fl] -= jymin_level[fl];
iymin_level[fl] = 0;
jymin_level[fl] = 0;
}
}
void Mesh::calc_face_list_wmap(void)
{
map_xface2cell_lower.clear();
map_xface2cell_upper.clear();
xface_i.clear();
xface_j.clear();
xface_level.clear();
ixmin_level.clear();
ixmax_level.clear();
jxmin_level.clear();
jxmax_level.clear();
ixmin_level.resize(levmx+1, 9999999);
ixmax_level.resize(levmx+1, -9999999);
jxmin_level.resize(levmx+1, 9999999);
jxmax_level.resize(levmx+1, -9999999);
ixadjust.clear();
ixadjust.resize(levmx+1);
jxadjust.clear();
jxadjust.resize(levmx+1);
int iface=0;
for (int nz=0; nz<(int)ncells; nz++){
int nr = nrht[nz];
if (nr == nz) continue;
int ifactor = 1;
if (level[nr] < level[nz]) ifactor = 2;
// Have right face
map_xface2cell_lower.push_back(nz);
map_xface2cell_upper.push_back(nr);
xface_level.push_back(MAX(level[nz],level[nr]));
xface_i.push_back(i[nr]*ifactor);
if (level[nr] < level[nz] && is_upper(j[nz]) ) {
xface_j.push_back(j[nr]*ifactor+1);
} else {
xface_j.push_back(j[nr]*ifactor);
}
iface++;
if (level[nr] > level[nz] && is_lower(j[nr]) ){
int ntr = ntop[nr];
if (ntr != nr) {
map_xface2cell_lower.push_back(nz);
map_xface2cell_upper.push_back(ntr);
xface_level.push_back(MAX(level[nz],level[ntr]));
xface_i.push_back(i[ntr]*ifactor);
xface_j.push_back(j[ntr]*ifactor);
iface++;
}
}
}
nxface=iface;
map_yface2cell_lower.clear();
map_yface2cell_upper.clear();
yface_i.clear();
yface_j.clear();
yface_level.clear();
iymin_level.clear();
iymax_level.clear();
jymin_level.clear();
jymax_level.clear();
iymin_level.resize(levmx+1, 9999999);
iymax_level.resize(levmx+1, -9999999);
jymin_level.resize(levmx+1, 9999999);
jymax_level.resize(levmx+1, -9999999);
iyadjust.clear();
iyadjust.resize(levmx+1);
jyadjust.clear();
jyadjust.resize(levmx+1);
iface=0;
for (int nz=0; nz<(int)ncells; nz++){
int nt = ntop[nz];
if (nt == nz) continue;
int ifactor = 1;
if (level[nt] < level[nz]) ifactor = 2;
// Have top face
// printf("DEBUG -- iface %d lower nz %d upper nr %d\n",iface,nz,nt);
map_yface2cell_lower.push_back(nz);
map_yface2cell_upper.push_back(nt);
yface_level.push_back(MAX(level[nz],level[nt]));
yface_j.push_back(j[nt]*ifactor);
if (level[nt] < level[nz] && is_upper(i[nz]) ) {
yface_i.push_back(i[nt]*ifactor+1);
} else{
yface_i.push_back(i[nt]*ifactor);
}
iface++;
if (level[nt] > level[nz] && is_lower(i[nt]) ){
int nrt = nrht[nt];
if (nrt != nt) {
map_yface2cell_lower.push_back(nz);
map_yface2cell_upper.push_back(nrt);
yface_level.push_back(MAX(level[nz],level[nrt]));
yface_j.push_back(j[nrt]*ifactor);
yface_i.push_back(i[nrt]*ifactor);
iface++;
}
}
}
nyface=iface;
for (int iface=0; iface < nxface; iface++){
int fl = xface_level[iface];
int fi = xface_i[iface];
if (fi < ixmin_level[fl]) ixmin_level[fl] = fi;
if (fi > ixmax_level[fl]) ixmax_level[fl] = fi;
int fj = xface_j[iface];
if (fj < jxmin_level[fl]) jxmin_level[fl] = fj;
if (fj > jxmax_level[fl]) jxmax_level[fl] = fj;
}
for (int iface=0; iface < nxface; iface++){
int fl = xface_level[iface];
if (ixmax_level[fl] < ixmin_level[fl]) continue;
xface_i[iface] -= ixmin_level[fl];
xface_j[iface] -= jxmin_level[fl];
}
for (int fl = 0; fl <= levmx; fl++){
ixadjust[fl] = ixmin_level[fl];
jxadjust[fl] = jxmin_level[fl];
ixmax_level[fl] -= ixmin_level[fl];;
jxmax_level[fl] -= jxmin_level[fl];
ixmin_level[fl] = 0;
jxmin_level[fl] = 0;
}
for (int iface=0; iface < nyface; iface++){
int fl = yface_level[iface];
int fi = yface_i[iface];
if (fi < iymin_level[fl]) iymin_level[fl] = fi;
if (fi > iymax_level[fl]) iymax_level[fl] = fi;
int fj = yface_j[iface];
if (fj < jymin_level[fl]) jymin_level[fl] = fj;
if (fj > jymax_level[fl]) jymax_level[fl] = fj;
}
for (int iface=0; iface < nyface; iface++){
int fl = yface_level[iface];
if (iymax_level[fl] < iymin_level[fl]) continue;
yface_i[iface] -= iymin_level[fl];
yface_j[iface] -= jymin_level[fl];
}
for (int fl = 0; fl <= levmx; fl++){
iyadjust[fl] = iymin_level[fl];
jyadjust[fl] = jymin_level[fl];
iymax_level[fl] -= iymin_level[fl];;
jymax_level[fl] -= jymin_level[fl];
iymin_level[fl] = 0;
jymin_level[fl] = 0;
}
}
// Build the x-face and y-face lists together with maps in both directions:
//   face -> cell : map_?face2cell_lower / map_?face2cell_upper
//   cell -> face : map_xcell2face_{left,right}{1,2}, map_ycell2face_{bot,top}{1,2}
// The cell->face entries start at -1 and are only overwritten when a face is
// found, so -1 is what remains for a missing face. The "2" slots are filled
// only when a second face is generated on that side of the cell.
// After construction the face indices are shifted per level so the minimum
// index on each level is zero; the offsets are kept in the *adjust arrays.
void Mesh::calc_face_list_wbidirmap(void)
{
// ---- x-faces: reset face lists, maps and per-level bookkeeping ----
map_xface2cell_lower.clear();
map_xface2cell_upper.clear();
map_xcell2face_left1.clear();
map_xcell2face_left2.clear();
map_xcell2face_right1.clear();
map_xcell2face_right2.clear();
map_xcell2face_left1.resize(ncells, -1);
map_xcell2face_left2.resize(ncells, -1);
map_xcell2face_right1.resize(ncells, -1);
map_xcell2face_right2.resize(ncells, -1);
xface_i.clear();
xface_j.clear();
xface_level.clear();
ixmin_level.clear();
ixmax_level.clear();
jxmin_level.clear();
jxmax_level.clear();
// Sentinel extents so the first face seen on a level sets both min and max.
ixmin_level.resize(levmx+1, 9999999);
ixmax_level.resize(levmx+1, -9999999);
jxmin_level.resize(levmx+1, 9999999);
jxmax_level.resize(levmx+1, -9999999);
ixadjust.clear();
ixadjust.resize(levmx+1);
jxadjust.clear();
jxadjust.resize(levmx+1);
// ---- x-faces: one face per (cell, right neighbor) pair ----
int iface=0;
for (int nz=0; nz<(int)ncells; nz++){
int nr = nrht[nz];
// A cell that is its own right neighbor has no right face.
if (nr == nz) continue;
// Neighbor indices are doubled when the neighbor is at a lower level,
// so the face is indexed at the higher of the two levels.
int ifactor = 1;
if (level[nr] < level[nz]) ifactor = 2;
// Have right face
map_xface2cell_lower.push_back(nz);
map_xface2cell_upper.push_back(nr);
xface_level.push_back(MAX(level[nz],level[nr]));
xface_i.push_back(i[nr]*ifactor);
if (level[nr] < level[nz] && is_upper(j[nz]) ) {
xface_j.push_back(j[nr]*ifactor+1);
} else {
xface_j.push_back(j[nr]*ifactor);
}
map_xcell2face_right1[nz] = iface;
iface++;
// Right neighbor is at a higher level: add a second face through the
// cell on top of it and record it as the second right face of nz.
if (level[nr] > level[nz] && is_lower(j[nr]) ){
int ntr = ntop[nr];
if (ntr != nr) {
map_xface2cell_lower.push_back(nz);
map_xface2cell_upper.push_back(ntr);
xface_level.push_back(MAX(level[nz],level[ntr]));
xface_i.push_back(i[ntr]*ifactor);
xface_j.push_back(j[ntr]*ifactor);
map_xcell2face_right2[nz] = iface;
iface++;
}
}
}
nxface=iface;
// ---- derive the left-face entries from the right-face entries ----
// Must run after the loop above: it reads the right1/right2 maps of the
// left neighbor.
for (int nz=0; nz<(int)ncells; nz++){
int nl = nlft[nz];
if (nl == nz) continue;
if (level[nl] < level[nz] && is_upper(j[nz])){
// Lower-level left neighbor and is_upper(j[nz]): take its second right face.
map_xcell2face_left1[nz] = map_xcell2face_right2[nl];
} else {
map_xcell2face_left1[nz] = map_xcell2face_right1[nl];
if (level[nl] > level[nz]){
// Higher-level left neighbor: the second left face comes from the
// cell on top of that neighbor.
map_xcell2face_left2[nz] = map_xcell2face_right1[ntop[nl]];
}
}
}
// ---- y-faces: reset face lists, maps and per-level bookkeeping ----
map_yface2cell_lower.clear();
map_yface2cell_upper.clear();
map_ycell2face_bot1.clear();
map_ycell2face_bot2.clear();
map_ycell2face_top1.clear();
map_ycell2face_top2.clear();
map_ycell2face_bot1.resize(ncells, -1);
map_ycell2face_bot2.resize(ncells, -1);
map_ycell2face_top1.resize(ncells, -1);
map_ycell2face_top2.resize(ncells, -1);
yface_i.clear();
yface_j.clear();
yface_level.clear();
iymin_level.clear();
iymax_level.clear();
jymin_level.clear();
jymax_level.clear();
iymin_level.resize(levmx+1, 9999999);
iymax_level.resize(levmx+1, -9999999);
jymin_level.resize(levmx+1, 9999999);
jymax_level.resize(levmx+1, -9999999);
iyadjust.clear();
iyadjust.resize(levmx+1);
jyadjust.clear();
jyadjust.resize(levmx+1);
// ---- y-faces: one face per (cell, top neighbor) pair ----
iface=0;
for (int nz=0; nz<(int)ncells; nz++){
int nt = ntop[nz];
// A cell that is its own top neighbor has no top face.
if (nt == nz) continue;
int ifactor = 1;
if (level[nt] < level[nz]) ifactor = 2;
// Have top face
// printf("DEBUG -- iface %d lower nz %d upper nr %d\n",iface,nz,nt);
map_yface2cell_lower.push_back(nz);
map_yface2cell_upper.push_back(nt);
yface_level.push_back(MAX(level[nz],level[nt]));
yface_j.push_back(j[nt]*ifactor);
if (level[nt] < level[nz] && is_upper(i[nz]) ) {
yface_i.push_back(i[nt]*ifactor+1);
} else{
yface_i.push_back(i[nt]*ifactor);
}
map_ycell2face_top1[nz] = iface;
iface++;
// Top neighbor is at a higher level: add a second face through the cell
// to its right and record it as the second top face of nz.
if (level[nt] > level[nz] &&is_lower(i[nt]) ){
int nrt = nrht[nt];
if (nrt != nt) {
map_yface2cell_lower.push_back(nz);
map_yface2cell_upper.push_back(nrt);
yface_level.push_back(MAX(level[nz],level[nrt]));
yface_j.push_back(j[nrt]*ifactor);
yface_i.push_back(i[nrt]*ifactor);
map_ycell2face_top2[nz] = iface;
iface++;
}
}
}
nyface=iface;
// ---- derive the bottom-face entries from the top-face entries ----
// Must run after the loop above: it reads the top1/top2 maps of the
// bottom neighbor.
for (int nz=0; nz<(int)ncells; nz++){
int nb = nbot[nz];
if (nb == nz) continue;
if (level[nb] < level[nz] && is_upper(i[nz])){
map_ycell2face_bot1[nz] = map_ycell2face_top2[nb];
} else {
map_ycell2face_bot1[nz] = map_ycell2face_top1[nb];
if (level[nb] > level[nz]){
map_ycell2face_bot2[nz] = map_ycell2face_top1[nrht[nb]];
}
}
}
// ---- x-faces: per-level min/max of the face indices ----
for (int iface=0; iface < nxface; iface++){
int fl = xface_level[iface];
int fi = xface_i[iface];
if (fi < ixmin_level[fl]) ixmin_level[fl] = fi;
if (fi > ixmax_level[fl]) ixmax_level[fl] = fi;
int fj = xface_j[iface];
if (fj < jxmin_level[fl]) jxmin_level[fl] = fj;
if (fj > jxmax_level[fl]) jxmax_level[fl] = fj;
}
// Shift the x-face indices so each level starts at zero.
for (int iface=0; iface < nxface; iface++){
int fl = xface_level[iface];
if (ixmax_level[fl] < ixmin_level[fl]) continue;
xface_i[iface] -= ixmin_level[fl];
xface_j[iface] -= jxmin_level[fl];
}
// Save the removed offsets and rebase the extents to start at zero.
// NOTE(review): a level with no faces still holds the sentinel values here,
// so its adjust/max entries end up as large sentinel-derived numbers.
for (int fl = 0; fl <= levmx; fl++){
ixadjust[fl] = ixmin_level[fl];
jxadjust[fl] = jxmin_level[fl];
ixmax_level[fl] -= ixmin_level[fl];;
jxmax_level[fl] -= jxmin_level[fl];
ixmin_level[fl] = 0;
jxmin_level[fl] = 0;
}
// ---- y-faces: per-level min/max of the face indices ----
for (int iface=0; iface < nyface; iface++){
int fl = yface_level[iface];
int fi = yface_i[iface];
if (fi < iymin_level[fl]) iymin_level[fl] = fi;
if (fi > iymax_level[fl]) iymax_level[fl] = fi;
int fj = yface_j[iface];
if (fj < jymin_level[fl]) jymin_level[fl] = fj;
if (fj > jymax_level[fl]) jymax_level[fl] = fj;
}
// Shift the y-face indices so each level starts at zero.
for (int iface=0; iface < nyface; iface++){
int fl = yface_level[iface];
if (iymax_level[fl] < iymin_level[fl]) continue;
yface_i[iface] -= iymin_level[fl];
yface_j[iface] -= jymin_level[fl];
}
// Save the removed offsets and rebase the extents to start at zero.
for (int fl = 0; fl <= levmx; fl++){
iyadjust[fl] = iymin_level[fl];
jyadjust[fl] = jymin_level[fl];
iymax_level[fl] -= iymin_level[fl];;
jymax_level[fl] -= jymin_level[fl];
iymin_level[fl] = 0;
jymin_level[fl] = 0;
}
}
int **Mesh::get_xface_flag(int lev, bool print_output)
{
int **xface_flag = (int **)genmatrix(jxmax_level[lev]+1,
ixmax_level[lev]+1, sizeof(int));
for (int jj=0; jj<jxmax_level[lev]+1; jj++){
for (int ii=0; ii<ixmax_level[lev]+1; ii++){
xface_flag[jj][ii] = -1;
}
}
for (int iface=0; iface < nxface; iface++){
if (xface_level[iface] == lev){
int ii = xface_i[iface];
int jj = xface_j[iface];
xface_flag[jj][ii] = 1;
}
}
if (DEBUG || print_output) {
printf("DEBUG -- x face_flag for level %d\n",lev);
printf("DEBUG -- sizes isize+1 %d jsize+1 %d\n",ixmax_level[lev]+1,jxmax_level[lev]+1);
printf(" ");
for (int ii=0; ii<ixmax_level[lev]+1; ii++){
printf(" %4d ",ii);
}
printf("\n");
for (int jj=jxmax_level[lev]; jj>=0; jj--){
printf("DEBUG -- j %4d: ",jj);
for (int ii=0; ii<ixmax_level[lev]+1; ii++){
if (xface_flag[jj][ii] >= 0){
//printf(" xface_flag_check[%d][%d] = 1;\n",jj,ii);
printf(" %4d ", xface_flag[jj][ii]);
} else {
printf(" ");
}
}
printf("\n");
}
}
return(xface_flag);
}
int **Mesh::get_yface_flag(int lev, bool print_output)
{
int **yface_flag = (int **)genmatrix(jymax_level[lev]+1,
iymax_level[lev]+1, sizeof(int));
for (int jj=0; jj<jymax_level[lev]+1; jj++){
for (int ii=0; ii<iymax_level[lev]+1; ii++){
yface_flag[jj][ii] = -1;
}
}
for (int iface=0; iface < nyface; iface++){
if (yface_level[iface] == lev){
int ii = yface_i[iface];
int jj = yface_j[iface];
yface_flag[jj][ii] = 1;
}
}
if (DEBUG || print_output) {
printf("DEBUG -- y face_flag for level %d\n",lev);
printf("DEBUG -- sizes isize+1 %d jsize+1 %d\n",iymax_level[lev]+1,jymax_level[lev]+1);
printf(" ");
for (int ii=0; ii<iymax_level[lev]+1; ii++){
printf(" %4d ",ii);
}
printf("\n");
for (int jj=jymax_level[lev]; jj>=0; jj--){
printf("DEBUG -- j %4d: ",jj);
for (int ii=0; ii<iymax_level[lev]+1; ii++){
if (yface_flag[jj][ii] >= 0){
//printf(" yface_flag_check[%d][%d] = 1;\n",jj,ii);
printf(" %4d ", yface_flag[jj][ii]);
} else {
printf(" ");
}
}
printf("\n");
}
}
return(yface_flag);
}
// Build two flat 2D arrays describing the cells that touch faces of level
// `lev`:
//   *zone_flag_base : 1 where a cell adjacent to a level-`lev` face sits, else -1
//   *zone_cell_base : the cell index occupying that position, else -1
// Both arrays are allocated here with genmatrix and handed back through the
// output pointers; freeing them is presumably the caller's job -- confirm.
// Reads map_xface2cell_* / map_yface2cell_*, so one of the face-list builders
// that fills those maps must have been called first.
void Mesh::get_flat_grid(int lev, int ***zone_flag_base, int ***zone_cell_base)
{
// Grid extent: shifted face maximum plus 4. Width comes from the x-face
// extents and height from the y-face extents.
// NOTE(review): the mixed use of ix*/jy* arrays looks deliberate -- confirm.
int isize = ixmax_level[lev]+4;
int jsize = jymax_level[lev]+4;
// Offset subtracted from cell indices; the -2 moves every index two
// positions up inside the padded grid.
int iadjust = ixadjust[lev]-2;
int jadjust = jyadjust[lev]-2;
//printf("DEBUG -- sizes isize %d jsize %d\n",isize,jsize);
//printf("DEBUG -- adjust ixadjust %d jxadjust %d\n",ixadjust[lev],jxadjust[lev]);
//printf("DEBUG -- adjust iyadjust %d jyadjust %d\n",iyadjust[lev],jyadjust[lev]);
(*zone_flag_base) = (int **)genmatrix(jsize, isize, sizeof(int));
int **zone_flag = *zone_flag_base;
// -1 marks "nothing here" in both arrays.
for (int jj=0; jj<jsize; jj++){
for (int ii=0; ii<isize; ii++){
zone_flag[jj][ii] = -1;
}
}
(*zone_cell_base) = (int **)genmatrix(jsize, isize, sizeof(int));
int **zone_cell = *zone_cell_base;
for (int jj=0; jj<jsize; jj++){
for (int ii=0; ii<isize; ii++){
zone_cell[jj][ii] = -1;
}
}
// ---- cells on either side of each x-face of this level ----
for (int iface=0; iface < nxface; iface++){
if (xface_level[iface] == lev){
int nz1 = map_xface2cell_lower[iface];
int nz2 = map_xface2cell_upper[iface];
if (lev == level[nz1]) {
// Lower-side cell is on this level: place it directly.
int iii = i[nz1]-iadjust;
int jjj = j[nz1]-jadjust;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz1;
// NOTE(review): this compares a neighbor cell index against REAL_CELL;
// confirm whether a cell-type lookup was intended.
if (nlft[nz1] != REAL_CELL) {
zone_cell[jjj][iii-1] = nlft[nz1];
}
} else {
// Lower-side cell is on a different level: its indices are doubled,
// and it fills the face-adjacent position plus the one to its left.
int iii = i[nz1]*2-iadjust+1;
int jjj = j[nz1]*2-jadjust;
if (is_upper(j[nz2])) jjj += 1;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz1;
zone_cell[jjj][iii-1] = nz1;
}
if (lev == level[nz2]) {
// Upper-side cell is on this level: place it directly.
int iii = i[nz2]-iadjust;
int jjj = j[nz2]-jadjust;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz2;
// NOTE(review): same index-vs-REAL_CELL comparison as above.
if (nrht[nz2] != REAL_CELL) {
zone_cell[jjj][iii+1] = nrht[nz2];
}
} else {
// Upper-side cell is on a different level: doubled indices, fills the
// face-adjacent position plus the one to its right.
int iii = i[nz2]*2-iadjust;
int jjj = j[nz2]*2-jadjust;
if (is_upper(j[nz1])) jjj += 1;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz2;
zone_cell[jjj][iii+1] = nz2;
}
}
}
// ---- cells on either side of each y-face of this level ----
for (int iface=0; iface < nyface; iface++){
if (yface_level[iface] == lev){
int nz1 = map_yface2cell_lower[iface];
int nz2 = map_yface2cell_upper[iface];
if (lev == level[nz1]) {
int iii = i[nz1]-iadjust;
int jjj = j[nz1]-jadjust;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz1;
// NOTE(review): neighbor index compared against REAL_CELL -- confirm.
if (nbot[nz1] != REAL_CELL) {
zone_cell[jjj-1][iii] = nbot[nz1];
}
} else {
// Lower-side cell on a different level: doubled indices, fills the
// face-adjacent position plus the one below it.
int iii = i[nz1]*2-iadjust;
int jjj = j[nz1]*2-jadjust+1;
if (is_upper(i[nz2])) iii += 1;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz1;
zone_cell[jjj-1][iii] = nz1;
}
if (lev == level[nz2]) {
int iii = i[nz2]-iadjust;
int jjj = j[nz2]-jadjust;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz2;
// NOTE(review): neighbor index compared against REAL_CELL -- confirm.
if (ntop[nz2] != REAL_CELL) {
zone_cell[jjj+1][iii] = ntop[nz2];
}
} else {
// Upper-side cell on a different level: doubled indices, fills the
// face-adjacent position plus the one above it.
int iii = i[nz2]*2-iadjust;
int jjj = j[nz2]*2-jadjust;
if (is_upper(i[nz1])) iii += 1;
zone_flag[jjj][iii] = 1;
zone_cell[jjj][iii] = nz2;
zone_cell[jjj+1][iii] = nz2;
}
}
}
// Debug dump: first as "check" assignment lines, then as two tables
// printed with the highest j row first.
// The loop variables i and j below shadow the member index arrays.
if (DEBUG) {
printf("DEBUG -- zone_flag for level %d\n",lev);
printf("DEBUG -- sizes isize %d jsize %d\n",isize,jsize);
for (int j=jsize-1; j>=0; j--){
for (int i=0; i<isize; i++){
if (zone_flag[j][i] >= 0){
printf(" zone_flag_check[%d][%d] = 1;\n",j,i);
}
}
}
for (int j=jsize-1; j>=0; j--){
for (int i=0; i<isize; i++){
if (zone_cell[j][i] >= 0){
printf(" zone_cell_check[%d][%d] = %d;\n",j,i,zone_cell[j][i]);
}
}
}
printf(" ");
for (int i=0; i<isize; i++){
printf(" %4d ",i);
}
printf("\n");
for (int j=jsize-1; j>=0; j--){
printf("DEBUG -- j %4d: ",j);
for (int i=0; i<isize; i++){
if (zone_flag[j][i] >= 0){
printf(" %4d ", zone_flag[j][i]);
} else {
printf(" ");
}
}
printf("\n");
}
printf("DEBUG -- zone_cell for level %d\n",lev);
printf(" ");
for (int i=0; i<isize; i++){
printf(" %4d ",i);
}
printf("\n");
for (int j=jsize-1; j>=0; j--){
printf("DEBUG -- j %4d: ",j);
for (int i=0; i<isize; i++){
if (zone_cell[j][i] >= 0){
printf(" %4d ", zone_cell[j][i]);
} else {
printf(" ");
}
}
printf("\n");
}
}
}
/*
 * Empty every face<->cell map filled by the face-list builders.
 * Only the maps are cleared; the face index lists and per-level extents
 * are left untouched.
 */
void Mesh::calc_face_list_clearmaps()
{
   // face -> cell
   map_xface2cell_lower.clear();
   map_xface2cell_upper.clear();
   map_yface2cell_lower.clear();
   map_yface2cell_upper.clear();

   // cell -> x-face
   map_xcell2face_left1.clear();
   map_xcell2face_left2.clear();
   map_xcell2face_right1.clear();
   map_xcell2face_right2.clear();

   // cell -> y-face
   map_ycell2face_bot1.clear();
   map_ycell2face_bot2.clear();
   map_ycell2face_top1.clear();
   map_ycell2face_top2.clear();
}
/*
 * Report one timer across all ranks.
 *
 * category    -- which timer to report; also indexes mesh_timer_descriptor
 * device_type -- MESH_DEVICE_CPU selects the CPU timer, anything else the GPU timer
 * timer_level -- nesting depth; the label is indented by 2*timer_level
 *                characters taken from the blank string
 *
 * Only rank 0 builds the label; the other ranks pass an empty string, which
 * parallel_output does not print on those ranks.
 */
void Mesh::timer_output(mesh_timer_category category, mesh_device_types device_type, int timer_level)
{
   const bool on_cpu = (device_type == MESH_DEVICE_CPU);

   double local_time = 0.0;
   if (on_cpu){
      local_time = get_cpu_timer(category);
   } else {
      local_time = get_gpu_timer(category);
   }

   // Start from an empty label. The previous initializer "/0" was a
   // mistyped "\0" and left the two characters '/' and '0' in the buffer.
   char string[80] = "";
   if (mype == 0) {
      const char *blank=" ";
      // snprintf keeps the write inside the buffer whatever timer_level is.
      snprintf(string, sizeof(string), "%s: %.*s%-30.30s\t",
               on_cpu ? "CPU" : "GPU",
               2*timer_level, blank, mesh_timer_descriptor[category]);
   }

   parallel_output(string, local_time, timer_level, "s");
}
/*
 * Gather one double per rank onto rank 0 and print it after `string`.
 *
 * string       -- label printed first (rank 0 only)
 * local_value  -- this rank's value
 * output_level -- each value is preceded by up to 2*output_level characters
 *                 taken from the blank string
 * units        -- unit suffix printed after the values
 *
 * With at most 4 ranks every value is printed; with more, the values are
 * sorted and only min / median / max are printed.
 */
void Mesh::parallel_output(const char *string, double local_value, int output_level, const char *units)
{
   vector<double> global_values(numpe);
   global_values[0] = local_value;
#ifdef HAVE_MPI
   if (numpe > 1) {
      MPI_Gather(&local_value, 1, MPI_DOUBLE, &global_values[0], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
   }
#endif
   if (mype != 0) return;   // only rank 0 holds the gathered data and prints

   const char *blank=" ";
   printf("%s\t",string);

   if (numpe <= 4) {
      for(int ip = 0; ip < numpe; ip++){
         printf("%.*s%8.4f\t", 2*output_level, blank, global_values[ip]);
      }
      printf("%s\n",units);
      return;
   }

   sort(global_values.begin(),global_values.end());

   const int half_value = numpe/2;
   double median_value;
   if (numpe%2 == 0) {
      // Even count: average the two middle elements.
      median_value = (global_values[half_value-1]+global_values[half_value])/2.0;
   } else {
      // Odd count: the middle element is at index numpe/2. The previous
      // index half_value+1 reported the next-larger value.
      median_value = global_values[half_value];
   }

   printf("%.*s%8.4f\t%.*s%8.4f\t%.*s%8.4f %s min/median/max\n",
          2*output_level, blank, global_values[0],
          2*output_level, blank, median_value,
          2*output_level, blank, global_values[numpe-1],
          units);
}
/*
 * Gather one long long per rank onto rank 0 and print it after `string`.
 *
 * string       -- label printed first (rank 0 only)
 * local_value  -- this rank's value
 * output_level -- each value is preceded by up to 2*output_level characters
 *                 taken from the blank string
 * units        -- unit suffix printed after the values
 *
 * With at most 4 ranks every value is printed; with more, the values are
 * sorted and only min / median / max are printed. For an even rank count
 * the median is the integer average of the two middle values.
 */
void Mesh::parallel_output(const char *string, long long local_value, int output_level, const char *units)
{
   vector<long long> global_values(numpe);
   global_values[0] = local_value;
#ifdef HAVE_MPI
   if (numpe > 1) {
      MPI_Gather(&local_value, 1, MPI_LONG_LONG, &global_values[0], 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
   }
#endif
   if (mype != 0) return;   // only rank 0 holds the gathered data and prints

   const char *blank=" ";
   printf("%s\t",string);

   if (numpe <= 4) {
      for(int ip = 0; ip < numpe; ip++){
         printf("%.*s%10lld\t", 2*output_level, blank, global_values[ip]);
      }
      printf("%s\n",units);
      return;
   }

   sort(global_values.begin(),global_values.end());

   const int half_value = numpe/2;
   long long median_value;
   if (numpe%2 == 0) {
      // Even count: average the two middle elements.
      median_value = (global_values[half_value-1]+global_values[half_value])/2;
   } else {
      // Odd count: the middle element is at index numpe/2. The previous
      // index half_value+1 reported the next-larger value.
      median_value = global_values[half_value];
   }

   printf("%.*s%10lld\t%.*s%10lld\t%.*s%10lld %s min/median/max\n",
          2*output_level, blank, global_values[0],
          2*output_level, blank, median_value,
          2*output_level, blank, global_values[numpe-1],
          units);
}
/*
 * Gather one int per rank onto rank 0 and print it after `string`.
 *
 * string       -- label printed first (rank 0 only)
 * local_value  -- this rank's value
 * output_level -- each value is preceded by up to 2*output_level characters
 *                 taken from the blank string
 * units        -- unit suffix printed after the values
 *
 * With at most 4 ranks every value is printed; with more, the values are
 * sorted and only min / median / max are printed. For an even rank count
 * the median is the integer average of the two middle values.
 */
void Mesh::parallel_output(const char *string, int local_value, int output_level, const char *units)
{
   vector<int> global_values(numpe);
   global_values[0] = local_value;
#ifdef HAVE_MPI
   if (numpe > 1) {
      MPI_Gather(&local_value, 1, MPI_INT, &global_values[0], 1, MPI_INT, 0, MPI_COMM_WORLD);
   }
#endif
   if (mype != 0) return;   // only rank 0 holds the gathered data and prints

   const char *blank=" ";
   printf("%s\t",string);

   if (numpe <= 4) {
      for(int ip = 0; ip < numpe; ip++){
         printf("%.*s%10d\t", 2*output_level, blank, global_values[ip]);
      }
      printf("%s\n",units);
      return;
   }

   sort(global_values.begin(),global_values.end());

   const int half_value = numpe/2;
   int median_value;
   if (numpe%2 == 0) {
      // Even count: average the two middle elements.
      median_value = (global_values[half_value-1]+global_values[half_value])/2;
   } else {
      // Odd count: the middle element is at index numpe/2. The previous
      // index half_value+1 reported the next-larger value.
      median_value = global_values[half_value];
   }

   printf("%.*s%10d\t%.*s%10d\t%.*s%10d %s min/median/max\n",
          2*output_level, blank, global_values[0],
          2*output_level, blank, median_value,
          2*output_level, blank, global_values[numpe-1],
          units);
}
// Version tag written as int_vals[0] of the mesh checkpoint; on restore it is
// compared with the value read back and a mismatch aborts the run.
const int CRUX_MESH_VERSION = 103;
// Number of per-rank integer scalars in a checkpoint
// (ncells, ncells_ghost, offtile_local_count).
const int num_int_dist_vals = 3;
// Number of integer scalars stored with the REPLICATED_DATA flag
// (version, ndim, levmx).
const int num_int_vals = 3;
// Number of double scalars in a checkpoint (offtile_ratio_local).
const int num_double_vals = 1;
/*
 * Return the number of bytes accounted for the mesh part of a checkpoint:
 * the scalar value arrays, two counter arrays, two timer arrays and three
 * int arrays of ncells entries each.
 *
 * NOTE(review): the second timer array is sized with sizeof(long), while
 * store_checkpoint registers both timer arrays with an element size of 8.
 * These differ where long is 4 bytes -- confirm which is intended.
 */
size_t Mesh::get_checkpoint_size(void)
{
   // Scalar value arrays
   const size_t scalar_bytes = num_int_dist_vals*sizeof(int)
                             + num_int_vals*sizeof(int)
                             + num_double_vals*sizeof(double);

   // Counter and timer arrays
   const size_t stats_bytes = 2*MESH_COUNTER_SIZE*sizeof(int)
                            + MESH_TIMER_SIZE*sizeof(double)
                            + MESH_TIMER_SIZE*sizeof(long);

   // Three int-sized values per cell
   const size_t cell_bytes = ncells*3*sizeof(int);

   return(scalar_bytes + stats_bytes + cell_bytes);
}
// Write the mesh state to a checkpoint through `crux`.
// The scalar values are packed into small local arrays, registered in
// mesh_memory together with the counter and timer arrays, written in one
// store_MallocPlus call, and then unregistered again because the local
// arrays go out of scope when this function returns.
void Mesh::store_checkpoint(Crux *crux)
{
// Need ncells for memory allocation
// The capacity of the level array is stored first, under the name "storage".
// NOTE(review): the 7 is presumably the length of the name -- confirm
// against the Crux API.
int storage = mesh_memory.get_memory_capacity(level);
crux->store_named_ints("storage", 7, &storage, 1);
// Write scalars to arrays for storing in checkpoint
int int_vals[num_int_vals];
int_vals[ 0] = CRUX_MESH_VERSION;
int_vals[ 1] = ndim;
int_vals[ 2] = levmx;
// These are for values that will be different on every processor
int int_dist_vals[num_int_dist_vals];
int_dist_vals[ 0] = (int)ncells;
int_dist_vals[ 1] = (int)ncells_ghost;
int_dist_vals[ 2] = offtile_local_count;
double double_vals[num_double_vals];
double_vals[0] = offtile_ratio_local;
int flags = RESTART_DATA;
// Now add memory entries to database for storing checkpoint
// The third argument (4 or 8) is presumably the element size in bytes -- confirm.
mesh_memory.memory_add(int_dist_vals, (size_t)num_int_dist_vals, 4, "mesh_int_dist_vals", flags);
// Only int_vals is additionally flagged REPLICATED_DATA.
flags = RESTART_DATA | REPLICATED_DATA;
mesh_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "mesh_int_vals", flags);
flags = RESTART_DATA;
mesh_memory.memory_add(double_vals, (size_t)num_double_vals, 8, "mesh_double_vals", flags);
mesh_memory.memory_add(cpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_cpu_counters", flags);
mesh_memory.memory_add(gpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_gpu_counters", flags);
mesh_memory.memory_add(cpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_cpu_timers", flags);
mesh_memory.memory_add(gpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_gpu_timers", flags);
// Store MallocPlus memory database
crux->store_MallocPlus(mesh_memory);
// Remove memory entries from database now that data is stored
mesh_memory.memory_remove(int_dist_vals);
mesh_memory.memory_remove(int_vals);
mesh_memory.memory_remove(double_vals);
mesh_memory.memory_remove(cpu_counters);
mesh_memory.memory_remove(gpu_counters);
mesh_memory.memory_remove(cpu_timers);
mesh_memory.memory_remove(gpu_timers);
}
/**
 * Restore the mesh state from a checkpoint through the given Crux object.
 *
 * Counterpart of store_checkpoint(): reads the stored capacity, releases the
 * current neighbor/celltype arrays and calls allocate() with that capacity,
 * registers the scalar buffers, counters and timers with mesh_memory under
 * the same names and flags used when storing, and lets Crux fill them in.
 * The scalar values are then copied back into the mesh members.
 *
 * Terminates the program with a non-zero exit status if the mesh version on
 * file differs from CRUX_MESH_VERSION.
 *
 * @param crux  checkpoint reader that supplies the stored data
 */
void Mesh::restore_checkpoint(Crux *crux)
{
   // Capacity written by store_checkpoint() under the name "storage"
   int storage;
   crux->restore_named_ints("storage", 7, &storage, 1);

   // Create memory for reading data into
   int int_dist_vals[num_int_dist_vals];
   int int_vals[num_int_vals];
   double double_vals[num_double_vals];

   // Release the existing arrays through the memory database and clear the
   // pointers before allocating with the stored capacity
   mesh_memory.memory_delete(nlft);
   mesh_memory.memory_delete(nrht);
   mesh_memory.memory_delete(nbot);
   mesh_memory.memory_delete(ntop);
   mesh_memory.memory_delete(celltype);
   nlft = NULL;
   nrht = NULL;
   ntop = NULL;
   nbot = NULL;
   celltype = NULL;

   allocate(storage);

   // Now add memory entries to database for restoring checkpoint.
   // Names and flags must match what store_checkpoint() registered.
   int flags = RESTART_DATA;
   mesh_memory.memory_add(int_dist_vals, (size_t)num_int_dist_vals, 4, "mesh_int_dist_vals", flags);
   flags = RESTART_DATA | REPLICATED_DATA;
   mesh_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "mesh_int_vals", flags);
   // double_vals carries offtile_ratio_local and is written by
   // store_checkpoint() with RESTART_DATA only, so it is registered the
   // same way here rather than as replicated data.
   flags = RESTART_DATA;
   mesh_memory.memory_add(double_vals, (size_t)num_double_vals, 8, "mesh_double_vals", flags);
   mesh_memory.memory_add(cpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_cpu_counters", flags);
   mesh_memory.memory_add(gpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_gpu_counters", flags);
   mesh_memory.memory_add(cpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_cpu_timers", flags);
   mesh_memory.memory_add(gpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_gpu_timers", flags);

   // Restore MallocPlus memory database
   crux->restore_MallocPlus(mesh_memory);

   // Remove memory entries from database now that data is restored; the
   // scalar arrays are locals and must not stay registered past this call
   mesh_memory.memory_remove(int_dist_vals);
   mesh_memory.memory_remove(int_vals);
   mesh_memory.memory_remove(double_vals);
   mesh_memory.memory_remove(cpu_counters);
   mesh_memory.memory_remove(gpu_counters);
   mesh_memory.memory_remove(cpu_timers);
   mesh_memory.memory_remove(gpu_timers);

   // Check version number
   if (int_vals[0] != CRUX_MESH_VERSION) {
      printf("CRUX version mismatch for mesh data, version on file is %d, version in code is %d\n",
             int_vals[0], CRUX_MESH_VERSION);
      // A version mismatch is fatal; report failure rather than success
      exit(1);
   }

   // Copy out the per-processor scalar values
   ncells              = int_dist_vals[0];
   ncells_ghost        = int_dist_vals[1];
   offtile_local_count = int_dist_vals[2];

   // Copy out the scalar values registered as replicated
   ndim  = int_vals[1];
   levmx = int_vals[2];

   offtile_ratio_local = double_vals[0];

#ifdef DEBUG_RESTORE_VALS
   // Optional dump of everything that was just restored, on rank 0 only
   if (DEBUG_RESTORE_VALS && mype == 0) {
      const char *int_dist_vals_descriptor[num_int_dist_vals] = {
         "ncells",
         "ncells_ghost",
         "offtile_local_count"
      };
      const char *int_vals_descriptor[num_int_vals] = {
         "CRUX_MESH_VERSION",
         "ndim",
         "levmx",
      };
      const char *double_vals_descriptor[num_double_vals] = {
         "offtile_ratio_local"
      };

      printf("\n");
      printf(" === Restored mesh int_dist_vals ===\n");
      for (int i = 0; i < num_int_dist_vals; i++){
         printf(" %-30s %d\n",int_dist_vals_descriptor[i], int_dist_vals[i]);
      }
      printf(" === Restored mesh int_vals ===\n");
      for (int i = 0; i < num_int_vals; i++){
         printf(" %-30s %d\n",int_vals_descriptor[i], int_vals[i]);
      }
      printf(" === Restored mesh int_vals ===\n");
      printf("\n");

      printf("\n");
      printf(" === Restored mesh double_vals ===\n");
      for (int i = 0; i < num_double_vals; i++){
         printf(" %-30s %lf\n",double_vals_descriptor[i], double_vals[i]);
      }
      printf(" === Restored mesh double_vals ===\n");
      printf("\n");

      printf(" === Restored mesh cpu counters ===\n");
      for (int i = 0; i < MESH_COUNTER_SIZE; i++){
         printf(" %-30s %d\n",mesh_counter_descriptor[i], cpu_counters[i]);
      }
      printf(" === Restored mesh cpu counters ===\n");
      printf(" === Restored mesh gpu counters ===\n");
      for (int i = 0; i < MESH_COUNTER_SIZE; i++){
         printf(" %-30s %d\n",mesh_counter_descriptor[i], gpu_counters[i]);
      }
      printf(" === Restored mesh gpu counters ===\n");
      printf("\n");

      printf(" === Restored mesh cpu timers ===\n");
      for (int i = 0; i < MESH_TIMER_SIZE; i++){
         printf(" %-30s %lf\n",mesh_timer_descriptor[i], cpu_timers[i]);
      }
      printf(" === Restored mesh cpu timers ===\n");
      printf("\n");

      printf("\n");
      printf(" === Restored mesh gpu timers ===\n");
      for (int i = 0; i < MESH_TIMER_SIZE; i++){
         printf(" %-30s %lld\n",mesh_timer_descriptor[i], gpu_timers[i]);
      }
      printf(" === Restored mesh gpu timers ===\n");
      printf("\n");
   }
#endif
}
// This code due to Matt Calef
/**
 * Exclusive prefix sum: on return output[0] is 0 and output[i] is
 * input[0] + ... + input[i-1].
 *
 * With OpenMP this must be called by every thread of an enclosing parallel
 * region (it contains a barrier and a single construct). Each thread scans
 * its own contiguous slice, one thread then computes the starting offset of
 * every slice, and finally each thread adds its offset to its slice.
 *
 * NOTE(review): the serial build also writes output[length] (the grand
 * total), whereas the OpenMP build only writes output[0..length-1]. Confirm
 * which size callers allocate for output.
 *
 * @param input   values to be summed
 * @param output  receives the running sums
 * @param length  number of entries in input
 */
void scan ( scanInt *input , scanInt *output , scanInt length)
{
#ifdef _OPENMP
   // This already assumes it is in a parallel region

   // Get the total number of threads
   scanInt numThreads = omp_get_num_threads ( );

   // Compute the range for which this thread is responsible.
   scanInt threadID = omp_get_thread_num ( );
   scanInt start = length * ( threadID ) / numThreads;
   scanInt end = length * ( threadID + 1 ) / numThreads;

   // In the case that there are fewer entries than threads, some
   // threads will have no entries. Only perform this operation if
   // there is a positive number of entries.
   if ( start < end ) {
      // Do a scan over the region for this thread, with an initial
      // value of zero.
      output[start] = 0;
      for ( scanInt i = start + 1 ; i < end ; i++ )
         output[i] = output[i-1] + input[i-1];
   }

   // Wait until all threads get here.
#pragma omp barrier

   // At this point each thread has done an independent scan of its
   // region. All scans, except the first, are off by an
   // offset. Here we have a single thread compute that offset with a
   // serial scan that strides over the regions assigned to each
   // thread.
#pragma omp single
   for ( scanInt i = 1 ; i < numThreads ; i ++ ) {
      scanInt s0 = length * ( i - 1 ) / numThreads;
      scanInt s1 = length * ( i ) / numThreads;

      // Only a non-empty preceding region contributes to the offset.
      if ( s0 < s1 ) {
         // Offset of the preceding region plus its last input value ...
         output[s1] = output[s0] + input[s1-1];
         // ... plus the local partial sum held in output[s1-1]. When the
         // region has a single entry, output[s1-1] is output[s0] itself
         // and was already counted. Nesting this test and writing it as
         // s0 + 1 < s1 keeps s1 - 1 from being formed when s1 is zero.
         if ( s0 + 1 < s1 )
            output[s1] += output[s1-1];
      }
   }

   // Barrier is implicit from omp single Wait until all threads get here.

   // Apply the offset to the range for this thread.
   for ( scanInt i = start + 1 ; i < end ; i++ )
      output[i] += output[start];

#else
   output[0] = 0;
   // Index uses the same type as length so the comparison is not mixed-type
   for (scanInt ic = 0; ic < length; ic++){
      output[ic+1] = output[ic] + input[ic];
   }
#endif
}
/**
 * Return the index range assigned to the calling thread.
 *
 * In an OpenMP build the values are read from the per-thread tables
 * lowerBound_Global / upperBound_Global at the caller's thread number
 * (the tables are filled in by set_bounds()). Without OpenMP the range is
 * always 0 to ncells.
 *
 * @param lowerBound  receives the first index of the range
 * @param upperBound  receives the end of the range
 */
void Mesh::get_bounds(int& lowerBound, int& upperBound){
#ifdef _OPENMP
   const int tid = omp_get_thread_num();

   lowerBound = lowerBound_Global[tid];
   upperBound = upperBound_Global[tid];
#else
   // Single-threaded build: the whole mesh belongs to the caller
   lowerBound = 0;
   upperBound = ncells;
#endif
}
/**
 * Record the slice of n items that belongs to the calling thread.
 *
 * In an OpenMP build the n items are divided as evenly as possible over the
 * threads of the current team: every thread gets n / nthreads items and the
 * first n % nthreads threads get one more. The resulting start index and
 * end index (start + count) are stored in lowerBound_Global and
 * upperBound_Global at the caller's thread number. The tables are allocated
 * by the master thread the first time they are found to be NULL.
 *
 * Without OpenMP the single entry is set to 0 .. ncells; the n argument is
 * not consulted in that build.
 *
 * @param n  number of items to divide among the threads
 */
void Mesh::set_bounds(int n){
#ifdef _OPENMP
   // No parallel region is opened here; the team size and thread number are
   // those of the region the caller is currently executing in.
   const int team_size = omp_get_num_threads();
   const int tid       = omp_get_thread_num();

   // Only the master thread creates the shared tables, and only once.
#pragma omp master
   {
      if (lowerBound_Global == NULL) {
         lowerBound_Global = (int *)malloc(team_size*sizeof(int));
      }
      if (upperBound_Global == NULL) {
         upperBound_Global = (int *)malloc(team_size*sizeof(int));
      }
   }

   // Every thread waits here so the tables exist before anyone writes to them.
#pragma omp barrier

   const int base  = n / team_size;   // items every thread receives
   const int extra = n % team_size;   // leftover items, one each to the first threads

   // Threads ahead of this one contribute base items each, plus one extra
   // item for each of them that is among the first `extra` threads.
   const int first = (base*tid) + ((extra < tid) ? extra : tid);
   const int count = (tid < extra) ? base + 1 : base;

   lowerBound_Global[tid] = first;
   upperBound_Global[tid] = first + count;
#else
   if (lowerBound_Global == NULL) {
      lowerBound_Global = (int *)malloc(1*sizeof(int));
   }
   if (upperBound_Global == NULL) {
      upperBound_Global = (int *)malloc(1*sizeof(int));
   }

   // Single-threaded build: one entry covering the whole mesh
   lowerBound_Global[0] = 0;
   upperBound_Global[0] = ncells;
#endif
}