blob: 6861e3b9841f81f6f274fd826158ddf26bf67349 [file] [log] [blame]
//------------------------------------------------------------------------------------------------------------------------------
// Samuel Williams
// SWWilliams@lbl.gov
// Lawrence Berkeley National Lab
//------------------------------------------------------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
//------------------------------------------------------------------------------------------------------------------------------
#include "defines.h"
#include "box.h"
//------------------------------------------------------------------------------------------------------------------------------
int RandomPadding=-1;
//------------------------------------------------------------------------------------------------------------------------------
int create_box(box_type *box, int numGrids, int low_i, int low_j, int low_k, int dim_i, int dim_j, int dim_k, int ghosts){
uint64_t memory_allocated = 0;
box->numGrids = numGrids;
box->low.i = low_i;
box->low.j = low_j;
box->low.k = low_k;
box->dim.i = dim_i;
box->dim.j = dim_j;
box->dim.k = dim_k;
box->dim_with_ghosts.i = dim_i+2*ghosts;
box->dim_with_ghosts.j = dim_j+2*ghosts;
box->dim_with_ghosts.k = dim_k+2*ghosts;
box->ghosts = ghosts;
box->pencil = (dim_i+2*ghosts);
box->plane = (dim_i+2*ghosts)*(dim_j+2*ghosts);
// nominally the stencil assumes VL is a perfect multiple of $U and unrolls without cleanup. However, this means you can walk $U-1 beyond the last point you should update and start updating the ghost zone of the next plane.
//int MaxUnrolling = 8; // 2-way SIMD x unroll by 4 = 8/thread
int MaxUnrolling = 16; // 4-way SIMD x unroll by 4 = 16/thread
//int MaxUnrolling = 32; // 8-way SIMD x unroll by 4 = 32/thread
int paddingToAvoidStencilCleanup = 0;
if(box->pencil+1 < (MaxUnrolling-1)){paddingToAvoidStencilCleanup = (MaxUnrolling-1)-(box->pencil+1);}
// round each plane up to ensure SIMD alignment within each plane (not pencil). Thus if ijk is aligned, ijkk+/-plane is aligned
//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0xF) & ~0xF; // multiple of 128 bytes
box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x7) & ~0x7; // multiple of 64 bytes (required for MIC)
//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x3) & ~0x3; // multiple of 32 bytes (required for AVX/QPX)
//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x1) & ~0x1; // multiple of 16 bytes (required for SSE)
//printf("%2d^2 = %5d -> %5d\n",box->dim_with_ghosts.i,(dim_j+2*ghosts)*box->pencil,box->plane);
box->volume = (dim_k+2*ghosts)*box->plane;
//if(dim_i>=32){while( ((box->volume % 2048) != 288) )box->volume+=8;}
//if(dim_i>=32){while( ((box->volume % 512) != 72) )box->volume+=8;} // 32KB / 8way / 8bytes / 7 arrays
if(dim_i>=32){while( ((box->volume % 512) != 64) )box->volume+=8;} // 32KB / 8way / 8bytes / 8 arrays
//if(dim_i>=32){while( ((box->volume % 512) != 40) )box->volume+=8;} // 32KB / 8way / 8bytes / 13 arrays
//if(dim_i>=32){while( ((box->volume % 256) != 40) )box->volume+=8;} // 16KB / 8way / 8bytes / 7 arrays
//if(dim_i>=32){while( ((box->volume % 256) != 32) )box->volume+=8;} // 16KB / 8way / 8bytes / 8 arrays
//if(RandomPadding<0){srand(time(NULL));RandomPadding = 8*((int)rand()%128);}
//if(dim_i>=32){box->volume+=RandomPadding;}
// allocate pointers to grids and grids themselves
posix_memalign((void**)&(box->grids),64,box->numGrids*sizeof(double*));
memory_allocated += box->numGrids*sizeof(double*);
#if 0
int g;for(g=0;g<box->numGrids;g++){
posix_memalign((void**)&(box->grids[g]),64,box->volume*sizeof(double));
memset(box->grids[g],0,box->volume*sizeof(double));
memory_allocated += box->volume*sizeof(double);
}
#else
double * tmpbuf;
posix_memalign((void**)&tmpbuf,64,box->volume*box->numGrids*sizeof(double));
memset( tmpbuf,0,box->volume*box->numGrids*sizeof(double));
memory_allocated += box->volume*box->numGrids*sizeof(double);
int g;for(g=0;g<box->numGrids;g++){
box->grids[g] = tmpbuf + g*box->volume;
//printf("box->grids[%2d] = 0x%016llx\n",g,(uint64_t)box->grids[g] & (0x3<<3));
}
#endif
// allocate RedBlackMask array for a plane...
posix_memalign((void**)&(box->RedBlack_64bMask),64,box->plane*sizeof(uint64_t));
memset(box->RedBlack_64bMask, 0,box->plane*sizeof(uint64_t));
memory_allocated += box->plane*sizeof(uint64_t);
posix_memalign((void**)&(box->RedBlack_FP[0] ),64,box->plane*sizeof(double ));
memset(box->RedBlack_FP[0] , 0,box->plane*sizeof(double ));
memory_allocated += box->plane*sizeof(double );
posix_memalign((void**)&(box->RedBlack_FP[1] ),64,box->plane*sizeof(double ));
memset(box->RedBlack_FP[1] , 0,box->plane*sizeof(double ));
memory_allocated += box->plane*sizeof(double );
// initialize red/black... could do ij loop with ((i%pencil)^(j/pencil)&0x1)
int i,j;
for(j=0-ghosts;j<box->dim.j+ghosts;j++){
for(i=0-ghosts;i<box->dim.i+ghosts;i++){
int ij = (i+ghosts) + (j+ghosts)*box->pencil;
if((i^j)&0x1)box->RedBlack_64bMask[ij]=~0;else box->RedBlack_64bMask[ij]= 0;
if((i^j)&0x1)box->RedBlack_FP[0][ij]=1.0;else box->RedBlack_FP[0][ij]=0.0;
if((i^j)&0x1)box->RedBlack_FP[1][ij]=0.0;else box->RedBlack_FP[1][ij]=1.0;
}}
// done...
return(memory_allocated);
}
void destroy_box(box_type *box){
#if 0
int g;for(g=0;g<box->numGrids;g++){
free(box->grids[g]);
}
#else
free(box->grids[0]);
#endif
free(box->grids);
}