| //------------------------------------------------------------------------------------------------------------------------------ |
| // Samuel Williams |
| // SWWilliams@lbl.gov |
| // Lawrence Berkeley National Lab |
| //------------------------------------------------------------------------------------------------------------------------------ |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include <string.h> |
| #include <math.h> |
| //------------------------------------------------------------------------------------------------------------------------------ |
| #ifdef __MPI |
| #include <mpi.h> |
| #endif |
| //------------------------------------------------------------------------------------------------------------------------------ |
| #include "defines.h" |
| #include "box.h" |
| #include "mg.h" |
| #include "operators.h" |
| #include "timer.h" |
| //------------------------------------------------------------------------------------------------------------------------------ |
| #warning Using Diagonally Preconditioned Iterative Solvers |
| #define DiagonallyPrecondition |
| //------------------------------------------------------------------------------------------------------------------------------ |
| #ifndef __CA_KRYLOV_S |
| #define __CA_KRYLOV_S 4 |
| #endif |
| |
| //#define __VERBOSE |
| //#define __DEBUG |
| //------------------------------------------------------------------------------------------------------------------------------ |
| // z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] |
| // z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] |
| #define __gemv(z,alpha,A,x,beta,y,rows,cols) {int r,c;double sum;for(r=0;r<(rows);r++){sum=0.0;for(c=0;c<(cols);c++){sum+=(A)[r][c]*(x)[c];}(z)[r]=(alpha)*sum+(beta)*(y)[r];}} |
| static inline void __axpy(double * z, double alpha, double * x, double beta, double * y, int n){ // z[n] = alpha*x[n]+beta*y[n] |
| int nn; |
| for(nn=0;nn<n;nn++){ |
| z[nn] = alpha*x[nn] + beta*y[nn]; |
| } |
| } |
| static inline double __dot(double * x, double * y, int n){ // x[n].y[n] |
| int nn; |
| double sum = 0.0; |
| for(nn=0;nn<n;nn++){ |
| sum += x[nn]*y[nn]; |
| } |
| return(sum); |
| } |
| static inline void __zero(double * z, int n){ // z[n] = 0.0 |
| int nn; |
| for(nn=0;nn<n;nn++){ |
| z[nn] = 0.0; |
| } |
| } |
| |
| |
| //------------------------------------------------------------------------------------------------------------------------------ |
| void TelescopingCABiCGStab(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ |
| // based on Erin Carson/Jim Demmel/Nick Knight's s-Step BiCGStab Algorithm 3.4 |
| // However, the formation of [P,R] is expensive ~ 4S+1 exchanges. Moreover, formation of G[][] requires (4S+2)(4S+1) grid operations. |
| // When the required number of iterations is small, this overhead is large and can make the s-step version slower than vanilla BiCGStab |
| // Thus, this version is a telescoping s-step method that will start out with s=1, then do s=2, then s=4 |
| |
| |
| // note: __CA_KRYLOV_S should be tiny (2-8?). As such, 4*__CA_KRYLOV_S+1 is also tiny (9-33). Just allocate on the stack... |
| double temp1[4*__CA_KRYLOV_S+1]; // |
| double temp2[4*__CA_KRYLOV_S+1]; // |
| double temp3[4*__CA_KRYLOV_S+1]; // |
| double Tp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T' indexed as [row][col] |
| double Tpp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T'' indexed as [row][col] |
| double aj[4*__CA_KRYLOV_S+1]; // |
| double cj[4*__CA_KRYLOV_S+1]; // |
| double ej[4*__CA_KRYLOV_S+1]; // |
| double Tpaj[4*__CA_KRYLOV_S+1]; // |
| double Tpcj[4*__CA_KRYLOV_S+1]; // |
| double Tppaj[4*__CA_KRYLOV_S+1]; // |
| double G[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // extracted from first 4*__CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] |
| double g[4*__CA_KRYLOV_S+1]; // extracted from last [4*__CA_KRYLOV_S+1] column of Gg[][]. |
| double Gg[(4*__CA_KRYLOV_S+1)*(4*__CA_KRYLOV_S+2)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(4*__CA_KRYLOV_S+2) + col] |
| int PRrt[4*__CA_KRYLOV_S+2]; // grid_id's of the concatenation of the 2S+1 matrix powers of P, 2S matrix powers of R, and rt |
| |
| int mMax=200; |
| int m=0,n; |
| int i,j,k; |
| int BiCGStabFailed = 0; |
| int BiCGStabConverged = 0; |
| double g_dot_Tpaj,alpha,omega_numerator,omega_denominator,omega,delta,delta_next,beta; |
| double L2_norm_of_rt,L2_norm_of_residual,cj_dot_Gcj,L2_norm_of_s; |
| |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| residual(domain,level,__rt,e_id,R_id,a,b); // rt[] = R_id[] - A(e_id)... note, if DPC, then rt = R-AD^-1De |
| scale_grid(domain,level,__r,1.0,__rt); // r[] = rt[] |
| scale_grid(domain,level,__p,1.0,__rt); // p[] = rt[] |
| double norm_of_rt = norm(domain,level,__rt); // the norm of the initial residual... |
| #ifdef __VERBOSE |
| if(domain->rank==0)printf("m=%8d, norm =%0.20f\n",m,norm_of_rt); |
| #endif |
| if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution |
| delta = dot(domain,level,__r,__rt); // delta = dot(r,rt) |
| if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of __r) |
| L2_norm_of_rt = sqrt(delta); |
| |
| int __ca_krylov_s = 1; |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| while( (m<mMax) && (!BiCGStabFailed) && (!BiCGStabConverged) ){ // while(not done){ |
| __zero( aj,4*__ca_krylov_s+1); // |
| __zero( cj,4*__ca_krylov_s+1); // |
| __zero( ej,4*__ca_krylov_s+1); // |
| __zero( Tpaj,4*__ca_krylov_s+1); // |
| __zero( Tpcj,4*__ca_krylov_s+1); // |
| __zero(Tppaj,4*__ca_krylov_s+1); // |
| __zero(temp1,4*__ca_krylov_s+1); // |
| __zero(temp2,4*__ca_krylov_s+1); // |
| __zero(temp3,4*__ca_krylov_s+1); // |
| |
| for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++) Tp[i][j]=0;// initialize Tp[][] and Tpp[][] ... |
| for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++)Tpp[i][j]=0;// |
| for(i= 0;i<2*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // monomial basis... Fixed (typo in SIAM paper) |
| for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // |
| for(i= 0;i<2*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // |
| for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // |
| |
| for(i=0;i<4*__ca_krylov_s+1;i++){PRrt[ i] = __PRrt+i;} // columns of PRrt map to the consecutive spare grid indices starting at __PRrt |
| PRrt[4*__ca_krylov_s+1] = __rt; // last column or PRrt (r tilde) maps to rt |
| int *P = PRrt+ 0; // grid_id's of the 2S+1 Matrix Powers of P. P[i] is the grid_id of A^i(p) |
| int *R = PRrt+2*__ca_krylov_s+1; // grid_id's of the 2S Matrix Powers of R. R[i] is the grid_id of A^i(r) |
| |
| // Using the monomial basis, compute 2s+1 matrix powers on p[] and 2s matrix powers on r[] one power at a time |
| // (conventional approach applicable to CHOMBO and BoxLib) |
| scale_grid(domain,level,P[0],1.0,__p); // P[0] = A^0p = __p |
| for(n=1;n<2*__ca_krylov_s+1;n++){ // naive way of calculating the monomial basis. |
| #ifdef DiagonallyPrecondition // |
| mul_grids(domain,level,__temp,1.0,__lambda,P[n-1]); // temp[] = lambda[]*P[n-1] |
| apply_op(domain,level,P[n],__temp,a,b); // P[n] = AD^{-1}__temp = AD^{-1}P[n-1] = ((AD^{-1})^n)p |
| #else // |
| apply_op(domain,level,P[n],P[n-1],a,b); // P[n] = A(P[n-1]) = (A^n)p |
| #endif // |
| } |
| scale_grid(domain,level,R[0],1.0,__r); // R[0] = A^0r = __r |
| for(n=1;n<2*__ca_krylov_s;n++){ // naive way of calculating the monomial basis. |
| #ifdef DiagonallyPrecondition // |
| mul_grids(domain,level,__temp,1.0,__lambda,R[n-1]); // temp[] = lambda[]*R[n-1] |
| apply_op(domain,level,R[n],__temp,a,b); // R[n] = AD^{-1}__temp = AD^{-1}R[n-1] |
| #else // |
| apply_op(domain,level,R[n],R[n-1],a,b); // R[n] = A(R[n-1]) = (A^n)r |
| #endif // |
| } |
| |
| // Compute Gg[][] = [P,R]^T * [P,R,rt] (Matmul with grids with ghost zones but only one MPI_AllReduce) |
| domain->CAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] |
| matmul_grids(domain,level,Gg,PRrt,PRrt,4*__ca_krylov_s+1,4*__ca_krylov_s+2,1); |
| for(i=0,k=0;i<4*__ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] |
| for(j=0 ;j<4*__ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*__ca_krylov_s+1 elements in each row go to G[][]. |
| g[i] = Gg[k++]; // last element in row goes to g[]. |
| } |
| |
| for(i=0;i<4*__ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) |
| for(i=0;i<4*__ca_krylov_s+1;i++)cj[i]=0.0;cj[2*__ca_krylov_s+1]=1.0; // initialized based on (3.26) |
| for(i=0;i<4*__ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) |
| |
| for(n=0;n<__ca_krylov_s;n++){ // for(n=0;n<__ca_krylov_s;n++){ |
| domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison |
| __gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'aj |
| __gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'cj |
| __gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T''aj |
| g_dot_Tpaj = __dot(g,Tpaj,4*__ca_krylov_s+1); // (g,T'aj) |
| if(g_dot_Tpaj == 0.0){ // pivot breakdown ??? |
| #ifdef __VERBOSE // |
| if(domain->rank==0){printf("g_dot_Tpaj == 0.0\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) |
| if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown |
| #ifdef __VERBOSE // |
| if(domain->rank==0){printf("alpha == inf\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| #if 0 // seems to have accuracy problems in finite precision... |
| __gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = - alpha*GT'aj |
| __gemv(temp1, 1.0, G, cj, 1.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj |
| __gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = − alpha*GT′′aj |
| __gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj |
| __axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj |
| omega_numerator = __dot(temp3,temp1,4*__ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) |
| omega_denominator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) |
| #else // better to change the order of operations Gx-Gy -> G(x-y) ... (note, G is symmetric) |
| __axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) |
| __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) |
| __axpy(temp3, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp3[] = cj - alpha*T'aj |
| omega_numerator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) |
| omega_denominator = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) |
| #endif // |
| // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 |
| // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj |
| // If omega_numerator==0, and ||s||!=0, then stabilization breakdown |
| |
| // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! |
| __axpy( ej,1.0,ej, alpha, aj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] |
| |
| // calculate the norm of Saad's vector 's' to check intra s-step convergence... |
| __axpy(temp1, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp1[] = cj - alpha*T'aj |
| __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) |
| L2_norm_of_s = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic |
| if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) |
| #ifdef __VERBOSE // |
| if(domain->rank==0){printf("m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // |
| #endif // |
| if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop |
| |
| |
| if(omega_denominator == 0.0){ // ??? breakdown |
| #ifdef __VERBOSE // |
| if(domain->rank==0){if(omega_denominator == 0.0)printf("omega_denominator == 0.0\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| omega = omega_numerator / omega_denominator; // |
| if(isinf(omega)){ // omega = big/tiny(oveflow) = inf |
| #ifdef __VERBOSE // |
| if(domain->rank==0){if(isinf(omega))printf("omega == inf\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| // !!! COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // |
| __axpy( ej,1.0,ej, omega, cj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] |
| __axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] |
| __axpy( cj,1.0,cj, -omega, Tpcj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] |
| __axpy( cj,1.0,cj, -alpha, Tpaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] |
| __axpy( cj,1.0,cj, omega*alpha,Tppaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] |
| |
| |
| // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... |
| __gemv(temp1, 1.0, G, cj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj |
| cj_dot_Gcj = __dot(cj,temp1,4*__ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic |
| L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) |
| #ifdef __VERBOSE |
| if(domain->rank==0){printf("m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} |
| #endif |
| if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop |
| |
| |
| delta_next = __dot( g,cj,4*__ca_krylov_s+1); // (g,cj) |
| #ifdef __VERBOSE // |
| if(domain->rank==0){ // |
| if(isinf(delta_next) ){printf("delta == inf\n");} // delta = big/tiny(overflow) = inf |
| if(delta_next == 0.0){printf("delta == 0.0\n");} // Lanczos breakdown |
| if(omega_numerator == 0.0){printf("omega_numerator == 0.0\n");} // stabilization breakdown |
| if(omega == 0.0){printf("omega == 0.0\n");} // stabilization breakdown |
| } // |
| #endif // |
| if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? |
| if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... |
| if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown |
| beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) |
| #ifdef __VERBOSE // |
| if(domain->rank==0){ // |
| if(isinf(beta) ){printf("beta == inf\n");} // beta = inf? |
| if(beta == 0.0){printf("beta == 0.0\n");} // beta = 0? can't make further progress(?) |
| } // |
| #endif // |
| if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? |
| if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) |
| __axpy( aj,1.0,cj, beta, aj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] |
| __axpy( aj,1.0,aj, -omega*beta, Tpaj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj |
| delta = delta_next; // delta = delta_next |
| |
| } // inner n (j) loop |
| |
| // update iterates... |
| for(i=0;i<4*__ca_krylov_s+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] |
| if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these |
| add_grids(domain,level, __p,0.0, __p,aj[0],PRrt[0]); // p[] = [P,R]aj |
| for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PRrt[i]);} // ... |
| add_grids(domain,level, __r,0.0, __r,cj[0],PRrt[0]); // r[] = [P,R]cj |
| for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PRrt[i]);} // ... |
| } // |
| m+=__ca_krylov_s; // m+=ca_krylov_s; |
| __ca_krylov_s*=2;if(__ca_krylov_s>__CA_KRYLOV_S)__ca_krylov_s=__CA_KRYLOV_S; |
| } // } // outer m loop |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| #ifdef DiagonallyPrecondition |
| mul_grids(domain,level,e_id,1.0,__lambda,e_id); // e_id[] = lambda[]*e_id[] // i.e. e = D^{-1}e' |
| #endif |
| |
| } |
| |
| |
| //------------------------------------------------------------------------------------------------------------------------------ |
| void CABiCGStab(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ |
| // based on Erin Carson/Jim Demmel/Nick Knight's s-Step BiCGStab Algorithm 3.4 |
| |
| // note: __CA_KRYLOV_S should be tiny (2-8?). As such, 4*__CA_KRYLOV_S+1 is also tiny (9-33). Just allocate on the stack... |
| double temp1[4*__CA_KRYLOV_S+1]; // |
| double temp2[4*__CA_KRYLOV_S+1]; // |
| double temp3[4*__CA_KRYLOV_S+1]; // |
| double Tp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T' indexed as [row][col] |
| double Tpp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T'' indexed as [row][col] |
| double aj[4*__CA_KRYLOV_S+1]; // |
| double cj[4*__CA_KRYLOV_S+1]; // |
| double ej[4*__CA_KRYLOV_S+1]; // |
| double Tpaj[4*__CA_KRYLOV_S+1]; // |
| double Tpcj[4*__CA_KRYLOV_S+1]; // |
| double Tppaj[4*__CA_KRYLOV_S+1]; // |
| double G[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // extracted from first 4*__CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] |
| double g[4*__CA_KRYLOV_S+1]; // extracted from last [4*__CA_KRYLOV_S+1] column of Gg[][]. |
| double Gg[(4*__CA_KRYLOV_S+1)*(4*__CA_KRYLOV_S+2)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(4*__CA_KRYLOV_S+2) + col] |
| int PRrt[4*__CA_KRYLOV_S+2]; // grid_id's of the concatenation of the 2S+1 matrix powers of P, 2S matrix powers of R, and rt |
| int *P = PRrt+ 0; // grid_id's of the 2S+1 Matrix Powers of P. P[i] is the grid_id of A^i(p) |
| int *R = PRrt+2*__CA_KRYLOV_S+1; // grid_id's of the 2S Matrix Powers of R. R[i] is the grid_id of A^i(r) |
| |
| int mMax=200; |
| int m=0,n; |
| int i,j,k; |
| int BiCGStabFailed = 0; |
| int BiCGStabConverged = 0; |
| double g_dot_Tpaj,alpha,omega_numerator,omega_denominator,omega,delta,delta_next,beta; |
| double L2_norm_of_rt,L2_norm_of_residual,cj_dot_Gcj,L2_norm_of_s; |
| |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| residual(domain,level,__rt,e_id,R_id,a,b); // rt[] = R_id[] - A(e_id)... note, if DPC, then rt = R-AD^-1De |
| scale_grid(domain,level,__r,1.0,__rt); // r[] = rt[] |
| scale_grid(domain,level,__p,1.0,__rt); // p[] = rt[] |
| double norm_of_rt = norm(domain,level,__rt); // the norm of the initial residual... |
| #ifdef __VERBOSE |
| if(domain->rank==0)printf("m=%8d, norm =%0.20f\n",m,norm_of_rt); |
| #endif |
| if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution |
| delta = dot(domain,level,__r,__rt); // delta = dot(r,rt) |
| if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of __r) |
| L2_norm_of_rt = sqrt(delta); |
| |
| int __ca_krylov_s = __CA_KRYLOV_S; // by making this a variable, I prevent the compiler from optimizing more than the telescoping version, thus preserving a bit-identcal result |
| |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++) Tp[i][j]=0; // initialize Tp[][] and Tpp[][] ... |
| for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++)Tpp[i][j]=0; // |
| for(i= 0;i<2*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // monomial basis... Fixed (typo in SIAM paper) |
| for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // |
| for(i= 0;i<2*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // |
| for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // |
| |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| for(i=0;i<4*__ca_krylov_s+1;i++){PRrt[ i] = __PRrt+i;} // columns of PRrt map to the consecutive spare grid indices starting at __PRrt |
| PRrt[4*__ca_krylov_s+1] = __rt; // last column or PRrt (r tilde) maps to rt |
| |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| while( (m<mMax) && (!BiCGStabFailed) && (!BiCGStabConverged) ){ // while(not done){ |
| __zero( aj,4*__ca_krylov_s+1); // |
| __zero( cj,4*__ca_krylov_s+1); // |
| __zero( ej,4*__ca_krylov_s+1); // |
| __zero( Tpaj,4*__ca_krylov_s+1); // |
| __zero( Tpcj,4*__ca_krylov_s+1); // |
| __zero(Tppaj,4*__ca_krylov_s+1); // |
| __zero(temp1,4*__ca_krylov_s+1); // |
| __zero(temp2,4*__ca_krylov_s+1); // |
| __zero(temp3,4*__ca_krylov_s+1); // |
| |
| // Using the monomial basis, compute 2s+1 matrix powers on p[] and 2s matrix powers on r[] one power at a time |
| // (conventional approach applicable to CHOMBO and BoxLib) |
| scale_grid(domain,level,P[0],1.0,__p); // P[0] = A^0p = __p |
| for(n=1;n<2*__ca_krylov_s+1;n++){ // naive way of calculating the monomial basis. |
| #ifdef DiagonallyPrecondition // |
| mul_grids(domain,level,__temp,1.0,__lambda,P[n-1]); // temp[] = lambda[]*P[n-1] |
| apply_op(domain,level,P[n],__temp,a,b); // P[n] = AD^{-1}__temp = AD^{-1}P[n-1] = ((AD^{-1})^n)p |
| #else // |
| apply_op(domain,level,P[n],P[n-1],a,b); // P[n] = A(P[n-1]) = (A^n)p |
| #endif // |
| } |
| scale_grid(domain,level,R[0],1.0,__r); // R[0] = A^0r = __r |
| for(n=1;n<2*__ca_krylov_s;n++){ // naive way of calculating the monomial basis. |
| #ifdef DiagonallyPrecondition // |
| mul_grids(domain,level,__temp,1.0,__lambda,R[n-1]); // temp[] = lambda[]*R[n-1] |
| apply_op(domain,level,R[n],__temp,a,b); // R[n] = AD^{-1}__temp = AD^{-1}R[n-1] |
| #else // |
| apply_op(domain,level,R[n],R[n-1],a,b); // R[n] = A(R[n-1]) = (A^n)r |
| #endif // |
| } |
| |
| // Compute Gg[][] = [P,R]^T * [P,R,rt] (Matmul with grids with ghost zones but only one MPI_AllReduce) |
| domain->CAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] |
| matmul_grids(domain,level,Gg,PRrt,PRrt,4*__ca_krylov_s+1,4*__ca_krylov_s+2,1); |
| for(i=0,k=0;i<4*__ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] |
| for(j=0 ;j<4*__ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*__ca_krylov_s+1 elements in each row go to G[][]. |
| g[i] = Gg[k++]; // last element in row goes to g[]. |
| } |
| |
| for(i=0;i<4*__ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) |
| for(i=0;i<4*__ca_krylov_s+1;i++)cj[i]=0.0;cj[2*__ca_krylov_s+1]=1.0; // initialized based on (3.26) |
| for(i=0;i<4*__ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) |
| |
| for(n=0;n<__ca_krylov_s;n++){ // for(n=0;n<__ca_krylov_s;n++){ |
| domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison |
| __gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'aj |
| __gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'cj |
| __gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T''aj |
| g_dot_Tpaj = __dot(g,Tpaj,4*__ca_krylov_s+1); // (g,T'aj) |
| if(g_dot_Tpaj == 0.0){ // pivot breakdown ??? |
| #ifdef __VERBOSE // |
| if(domain->rank==0){printf("g_dot_Tpaj == 0.0\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) |
| if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown |
| #ifdef __VERBOSE // |
| if(domain->rank==0){printf("alpha == inf\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| #if 0 // seems to have accuracy problems in finite precision... |
| __gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = - alpha*GT'aj |
| __gemv(temp1, 1.0, G, cj, 1.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj |
| __gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = − alpha*GT′′aj |
| __gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj |
| __axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj |
| omega_numerator = __dot(temp3,temp1,4*__ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) |
| omega_denominator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) |
| #else // better to change the order of operations Gx-Gy -> G(x-y) ... (note, G is symmetric) |
| __axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) |
| __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) |
| __axpy(temp3, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp3[] = cj - alpha*T'aj |
| omega_numerator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) |
| omega_denominator = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) |
| #endif // |
| // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 |
| // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj |
| // If omega_numerator==0, and ||s||!=0, then stabilization breakdown |
| |
| // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! |
| __axpy( ej,1.0,ej, alpha, aj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] |
| |
| // calculate the norm of Saad's vector 's' to check intra s-step convergence... |
| __axpy(temp1, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp1[] = cj - alpha*T'aj |
| __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) |
| L2_norm_of_s = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic |
| if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) |
| #ifdef __VERBOSE // |
| if(domain->rank==0){printf("m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // |
| #endif // |
| if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop |
| |
| |
| if(omega_denominator == 0.0){ // ??? breakdown |
| #ifdef __VERBOSE // |
| if(domain->rank==0){if(omega_denominator == 0.0)printf("omega_denominator == 0.0\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| omega = omega_numerator / omega_denominator; // |
| if(isinf(omega)){ // omega = big/tiny(oveflow) = inf |
| #ifdef __VERBOSE // |
| if(domain->rank==0){if(isinf(omega))printf("omega == inf\n");} // |
| #endif // |
| BiCGStabFailed=1;break; // |
| } // |
| // !!! COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // |
| __axpy( ej,1.0,ej, omega, cj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] |
| __axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] |
| __axpy( cj,1.0,cj, -omega, Tpcj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] |
| __axpy( cj,1.0,cj, -alpha, Tpaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] |
| __axpy( cj,1.0,cj, omega*alpha,Tppaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] |
| |
| |
| // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... |
| __gemv(temp1, 1.0, G, cj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj |
| cj_dot_Gcj = __dot(cj,temp1,4*__ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic |
| L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) |
| #ifdef __VERBOSE |
| if(domain->rank==0){printf("m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} |
| #endif |
| if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop |
| |
| |
| delta_next = __dot( g,cj,4*__ca_krylov_s+1); // (g,cj) |
| #ifdef __VERBOSE // |
| if(domain->rank==0){ // |
| if(isinf(delta_next) ){printf("delta == inf\n");} // delta = big/tiny(overflow) = inf |
| if(delta_next == 0.0){printf("delta == 0.0\n");} // Lanczos breakdown |
| if(omega_numerator == 0.0){printf("omega_numerator == 0.0\n");} // stabilization breakdown |
| if(omega == 0.0){printf("omega == 0.0\n");} // stabilization breakdown |
| } // |
| #endif // |
| if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? |
| if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... |
| if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown |
| beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) |
| #ifdef __VERBOSE // |
| if(domain->rank==0){ // |
| if(isinf(beta) ){printf("beta == inf\n");} // beta = inf? |
| if(beta == 0.0){printf("beta == 0.0\n");} // beta = 0? can't make further progress(?) |
| } // |
| #endif // |
| if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? |
| if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) |
| __axpy( aj,1.0,cj, beta, aj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] |
| __axpy( aj,1.0,aj, -omega*beta, Tpaj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj |
| delta = delta_next; // delta = delta_next |
| |
| } // inner n (j) loop |
| |
| // update iterates... |
| for(i=0;i<4*__ca_krylov_s+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] |
| if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these |
| add_grids(domain,level, __p,0.0, __p,aj[0],PRrt[0]); // p[] = [P,R]aj |
| for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PRrt[i]);} // ... |
| add_grids(domain,level, __r,0.0, __r,cj[0],PRrt[0]); // r[] = [P,R]cj |
| for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PRrt[i]);} // ... |
| } // |
| m+=__ca_krylov_s; // m+=__ca_krylov_s; |
| } // } // outer m loop |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| #ifdef DiagonallyPrecondition |
| mul_grids(domain,level,e_id,1.0,__lambda,e_id); // e_id[] = lambda[]*e_id[] // i.e. e = D^{-1}e' |
| #endif |
| } |
| |
| |
| //------------------------------------------------------------------------------------------------------------------------------ |
| void BiCGStab(domain_type * domain, int level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ |
| // Algorithm 7.7 in Iterative Methods for Sparse Linear Systems(Yousef Saad) |
| // uses "right" preconditioning... AD^{-1}(Dx) = b ... AD^{-1}y = b ... solve for y, then solve for x = D^{-1}y |
| int jMax=200; |
| int j=0; |
| int BiCGStabFailed = 0; |
| int BiCGStabConverged = 0; |
| residual(domain,level,__r0,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) |
| scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] |
| scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] |
| double r_dot_r0 = dot(domain,level,__r,__r0); // r_dot_r0 = dot(r,r0) |
| double norm_of_r0 = norm(domain,level,__r); // the norm of the initial residual... |
| if(r_dot_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution |
| if(norm_of_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution |
| while( (j<jMax) && (!BiCGStabFailed) && (!BiCGStabConverged) ){ // while(not done){ |
| j++;domain->Krylov_iterations++; // |
| #ifdef DiagonallyPrecondition // |
| mul_grids(domain,level,__temp,1.0,__lambda,__p); // temp[] = lambda[]*p[] |
| apply_op(domain,level,__Ap,__temp,a,b); // Ap = AD^{-1}(p) |
| #else // |
| apply_op(domain,level,__Ap,__p,a,b); // Ap = A(p) |
| #endif // |
| double Ap_dot_r0 = dot(domain,level,__Ap,__r0); // Ap_dot_r0 = dot(Ap,r0) |
| if(Ap_dot_r0 == 0.0){BiCGStabFailed=1;break;} // pivot breakdown ??? |
| double alpha = r_dot_r0 / Ap_dot_r0; // alpha = r_dot_r0 / Ap_dot_r0 |
| if(isinf(alpha)){BiCGStabFailed=1;break;} // pivot breakdown ??? |
| add_grids(domain,level,x_id,1.0,x_id, alpha,__p ); // x_id[] = x_id[] + alpha*p[] |
| add_grids(domain,level,__s ,1.0,__r ,-alpha,__Ap); // s[] = r[] - alpha*Ap[] (intermediate residual?) |
| double norm_of_s = norm(domain,level,__s); // FIX - redundant?? norm of intermediate residual |
| if(norm_of_s == 0.0){BiCGStabConverged=1;break;} // FIX - redundant?? if As_dot_As==0, then As must be 0 which implies s==0 |
| if(norm_of_s < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} |
| #ifdef DiagonallyPrecondition // |
| mul_grids(domain,level,__temp,1.0,__lambda,__s); // temp[] = lambda[]*s[] |
| apply_op(domain,level,__As,__temp,a,b); // As = AD^{-1}(s) |
| #else // |
| apply_op(domain,level,__As,__s,a,b); // As = A(s) |
| #endif // |
| double As_dot_As = dot(domain,level,__As,__As); // As_dot_As = dot(As,As) |
| double As_dot_s = dot(domain,level,__As, __s); // As_dot_s = dot(As, s) |
| if(As_dot_As == 0.0){BiCGStabConverged=1;break;} // converged ? |
| double omega = As_dot_s / As_dot_As; // omega = As_dot_s / As_dot_As |
| if(omega == 0.0){BiCGStabFailed=1;break;} // stabilization breakdown ??? |
| if(isinf(omega)){BiCGStabFailed=1;break;} // stabilization breakdown ??? |
| add_grids(domain,level, x_id, 1.0,x_id, omega,__s ); // x_id[] = x_id[] + omega*s[] |
| add_grids(domain,level,__r , 1.0,__s ,-omega,__As ); // r[] = s[] - omega*As[] (recursively computed / updated residual) |
| double norm_of_r = norm(domain,level,__r); // norm of recursively computed residual (good enough??) |
| if(norm_of_r == 0.0){BiCGStabConverged=1;break;} // |
| if(norm_of_r < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} |
| #ifdef __DEBUG // |
| residual(domain,level,__temp,x_id,R_id,a,b); // |
| double norm_of_residual = norm(domain,level,__temp); // |
| if(domain->rank==0)printf("j=%8d, norm=%12.6e, norm_inital=%12.6e, reduction=%e\n",j,norm_of_residual,norm_of_r0,norm_of_residual/norm_of_r0); // |
| #endif // |
| double r_dot_r0_new = dot(domain,level,__r,__r0); // r_dot_r0_new = dot(r,r0) |
| if(r_dot_r0_new == 0.0){BiCGStabFailed=1;break;} // Lanczos breakdown ??? |
| double beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega); // beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega) |
| if(isinf(beta)){BiCGStabFailed=1;break;} // ??? |
| add_grids(domain,level,__temp,1.0,__p,-omega,__Ap ); // __temp = (p[]-omega*Ap[]) |
| add_grids(domain,level,__p ,1.0,__r, beta,__temp); // p[] = r[] + beta*(p[]-omega*Ap[]) |
| r_dot_r0 = r_dot_r0_new; // r_dot_r0 = r_dot_r0_new (save old r_dot_r0) |
| } // } |
| #ifdef DiagonallyPrecondition // |
| mul_grids(domain,level,x_id,1.0,__lambda,x_id); // x_id[] = lambda[]*x_id[] // i.e. x = D^{-1}x' |
| #endif // |
| } |
| |
| //------------------------------------------------------------------------------------------------------------------------------ |
| void CACG(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ |
| // based on Lauren Goodfriend, Yinghui Huang, and David Thorman's derivation in their Spring 2013 CS267 Report |
| |
| double temp1[2*__CA_KRYLOV_S+1]; // |
| double temp2[2*__CA_KRYLOV_S+1]; // |
| double temp3[2*__CA_KRYLOV_S+1]; // |
| double aj[2*__CA_KRYLOV_S+1]; // |
| double cj[2*__CA_KRYLOV_S+1]; // |
| double ej[2*__CA_KRYLOV_S+1]; // |
| double Tpaj[2*__CA_KRYLOV_S+1]; // |
| double Tp[2*__CA_KRYLOV_S+1][2*__CA_KRYLOV_S+1]; // T' indexed as [row][col] |
| double G[2*__CA_KRYLOV_S+1][2*__CA_KRYLOV_S+1]; // extracted from first 2*__CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] |
| double Gbuf[(2*__CA_KRYLOV_S+1)*(2*__CA_KRYLOV_S+1)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(2*__CA_KRYLOV_S+1) + col] |
| int PR[2*__CA_KRYLOV_S+1]; // grid_id's of the concatenation of the S+1 matrix powers of P, and the S matrix powers of R |
| int *P = PR+ 0; // grid_id's of the S+1 Matrix Powers of P. P[i] is the grid_id of A^i(p) |
| int *R = PR+__CA_KRYLOV_S+1; // grid_id's of the S Matrix Powers of R. R[i] is the grid_id of A^i(r) |
| |
| int mMax=200; |
| int m=0,n; |
| int i,j,k; |
| int CGFailed = 0; |
| int CGConverged = 0; |
| |
| double aj_dot_GTpaj,cj_dot_Gcj,alpha,cj_dot_Gcj_new,beta,L2_norm_of_r0,L2_norm_of_residual,delta; |
| |
| residual(domain,level,__r0,e_id,R_id,a,b); // r0[] = R_id[] - A(e_id) |
| scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] |
| scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] |
| double norm_of_r0 = norm(domain,level,__r0); // the norm of the initial residual... |
| if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution |
| |
| delta = dot(domain,level,__r,__r0); // delta = dot(r,r0) |
| if(delta==0.0){CGConverged=1;} // entered CG with exact solution (square of L2 norm of __r) |
| L2_norm_of_r0 = sqrt(delta); // |
| |
| |
| |
| // initialize Tp[][] ... |
| for(i=0;i<2*__CA_KRYLOV_S+1;i++)for(j=0;j<2*__CA_KRYLOV_S+1;j++) Tp[i][j]=0; // zero Tp |
| for(i= 0;i< __CA_KRYLOV_S ;i++){ Tp[i+1][i]=1;} // monomial basis |
| for(i=__CA_KRYLOV_S+1;i<2*__CA_KRYLOV_S ;i++){ Tp[i+1][i]=1;} // |
| |
| for(i=0;i<2*__CA_KRYLOV_S+1;i++){PR[i] = __PRrt+i;} // columns of PR map to the consecutive spare grids allocated for the bottom solver starting at __PRrt |
| |
| |
| while( (m<mMax) && (!CGFailed) && (!CGConverged) ){ // while(not done){ |
| __zero( aj,2*__CA_KRYLOV_S+1); |
| __zero( cj,2*__CA_KRYLOV_S+1); |
| __zero( ej,2*__CA_KRYLOV_S+1); |
| __zero( Tpaj,2*__CA_KRYLOV_S+1); |
| __zero(temp1,2*__CA_KRYLOV_S+1); |
| __zero(temp2,2*__CA_KRYLOV_S+1); |
| __zero(temp3,2*__CA_KRYLOV_S+1); |
| |
| // Using the monomial basis, compute s+1 matrix powers on p[] and s matrix powers on r[] one power at a time |
| // (conventional approach applicable to CHOMBO and BoxLib) |
| scale_grid(domain,level,P[0],1.0,__p); // P[0] = A^0p = __p |
| for(n=1;n<__CA_KRYLOV_S+1;n++){ // naive way of calculating the monomial basis. |
| apply_op(domain,level,P[n],P[n-1],a,b); // P[n] = A(P[n-1]) = A^(n)p |
| } |
| scale_grid(domain,level,R[0],1.0,__r); // R[0] = A^0r = __r |
| for(n=1;n<__CA_KRYLOV_S;n++){ // naive way of calculating the monomial basis. |
| apply_op(domain,level,R[n],R[n-1],a,b); // R[n] = A(R[n-1]) = A^(n)r |
| } |
| |
| |
| // form G[][] and g[] |
| domain->CAKrylov_formations_of_G++; // Record the number of times CACG formed G[][] |
| matmul_grids(domain,level,Gbuf,PR,PR,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1,1); // Compute Gbuf[][] = [P,R]^T * [P,R] (Matmul with grids but only one MPI_AllReduce) |
| for(i=0,k=0;i<2*__CA_KRYLOV_S+1;i++){ // extract G[][] from Gbuf[] |
| for(j=0 ;j<2*__CA_KRYLOV_S+1;j++){G[i][j] = Gbuf[k++];} // first 2*__CA_KRYLOV_S+1 elements in each row go to G[][]. |
| } |
| |
| |
| for(i=0;i<2*__CA_KRYLOV_S+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (???) |
| for(i=0;i<2*__CA_KRYLOV_S+1;i++)cj[i]=0.0;cj[__CA_KRYLOV_S+1]=1.0; // initialized based on (???) |
| for(i=0;i<2*__CA_KRYLOV_S+1;i++)ej[i]=0.0; // initialized based on (???) |
| |
| for(n=0;n<__CA_KRYLOV_S;n++){ // for(n=0;n<__CA_KRYLOV_S;n++){ |
| domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison |
| __gemv( Tpaj,1.0,Tp, aj,0.0, Tpaj,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // T'aj |
| __gemv(temp1,1.0, G,Tpaj,0.0,temp1,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp1[] = GT'aj |
| __gemv(temp2,1.0, G, cj,0.0,temp2,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp2[] = Gcj |
| aj_dot_GTpaj = __dot(aj,temp1,2*__CA_KRYLOV_S+1); // (aj,GT'aj) |
| cj_dot_Gcj = __dot(cj,temp2,2*__CA_KRYLOV_S+1); // (cj, Gcj) |
| // FIX, can cj_dot_Gcj ever be zero ? |
| if(aj_dot_GTpaj == 0.0){ // pivot breakdown ??? |
| CGFailed=1;break; // |
| } // |
| alpha = cj_dot_Gcj / aj_dot_GTpaj; // alpha = (cj,Gcj) / (aj,GT'aj) |
| if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown |
| CGFailed=1;break; // |
| } // |
| __axpy( ej,1.0,ej, alpha, aj,2*__CA_KRYLOV_S+1); // ej[] = ej[] + alpha*aj[] |
| __axpy( cj,1.0,cj, -alpha, Tpaj,2*__CA_KRYLOV_S+1); // cj[] = cj[] - alpha*T'*aj[] |
| __gemv(temp2,1.0, G, cj,0.0,temp2,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp2[] = Gcj |
| cj_dot_Gcj_new = __dot(cj,temp2,2*__CA_KRYLOV_S+1); // (cj, Gcj) |
| // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... == cj_dot_Gcj_new?? |
| L2_norm_of_residual = 0.0;if(cj_dot_Gcj_new>0)L2_norm_of_residual=sqrt(cj_dot_Gcj_new); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) |
| if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_r0){CGConverged=1;break;} // terminate the inner n-loop |
| if(cj_dot_Gcj_new == 0.0){ // Lanczos breakdown ??? |
| CGFailed=1;break; // |
| } // |
| beta = cj_dot_Gcj_new / cj_dot_Gcj; // |
| if(isinf(beta)){CGFailed=1;break;} // beta = inf? |
| if(beta == 0.0){CGFailed=1;break;} // beta = 0? can't make further progress(?) |
| __axpy( aj,1.0,cj, beta, aj,2*__CA_KRYLOV_S+1); // cj[] = cj[] + beta*aj[] |
| |
| } // inner n (j) loop |
| |
| // update iterates... |
| for(i=0;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PR[i]);} // e_id[] = [P,R]ej + e_id[] |
| if(!CGFailed && !CGConverged){ // if we're done, then there is no point in updating these |
| add_grids(domain,level, __p,0.0, __p,aj[0],PR[0]); // p[] = [P,R]aj |
| for(i=1;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PR[i]);} // ... |
| add_grids(domain,level, __r,0.0, __r,cj[0],PR[0]); // r[] = [P,R]cj |
| for(i=1;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PR[i]);} // ... |
| } |
| m+=__CA_KRYLOV_S; // m+=__CA_KRYLOV_S; |
| // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| } // } // outer m loop |
| |
| } |
| |
| |
| //------------------------------------------------------------------------------------------------------------------------------ |
| void CG(domain_type * domain, int level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ |
| // Algorithm 6.18 in Iterative Methods for Sparse Linear Systems(Yousef Saad) |
| int jMax=200; |
| int j=0; |
| int CGFailed = 0; |
| int CGConverged = 0; |
| residual(domain,level,__r0,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) |
| scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] |
| scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] |
| double norm_of_r0 = norm(domain,level,__r); // the norm of the initial residual... |
| if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution |
| double r_dot_r = dot(domain,level,__r,__r); // r_dot_r = dot(r,r) |
| while( (j<jMax) && (!CGFailed) && (!CGConverged) ){ // while(not done){ |
| j++;domain->Krylov_iterations++; // |
| apply_op(domain,level,__Ap,__p,a,b); // Ap = A(p) |
| double Ap_dot_p = dot(domain,level,__Ap,__p); // Ap_dot_p = dot(Ap,p) |
| if(Ap_dot_p == 0.0){CGFailed=1;break;} // pivot breakdown ??? |
| double alpha = r_dot_r / Ap_dot_p; // alpha = r_dot_r / Ap_dot_p |
| if(isinf(alpha)){CGFailed=1;break;} // ??? |
| add_grids(domain,level,x_id,1.0,x_id, alpha,__p ); // x_id[] = x_id[] + alpha*p[] |
| add_grids(domain,level,__r ,1.0,__r ,-alpha,__Ap); // r[] = r[] - alpha*Ap[] (intermediate residual?) |
| double norm_of_r = norm(domain,level,__r); // norm of intermediate residual |
| if(norm_of_r == 0.0){CGConverged=1;break;} // |
| if(norm_of_r < desired_reduction_in_norm*norm_of_r0){CGConverged=1;break;} // |
| double r_dot_r_new = dot(domain,level,__r,__r); // r_dot_r_new = dot(r_{j+1},r_{j+1}) |
| if(r_dot_r_new == 0.0){CGFailed=1;break;} // Lanczos breakdown ??? |
| double beta = (r_dot_r_new/r_dot_r); // beta = (r_dot_r_new/r_dot_r) |
| if(isinf(beta)){CGFailed=1;break;} // ??? |
| add_grids(domain,level,__p,1.0,__r,beta,__p ); // p[] = r[] + beta*p[] |
| r_dot_r = r_dot_r_new; // r_dot_r = r_dot_r_new (save old r_dot_r) |
| } // } |
| } |
| |
| //------------------------------------------------------------------------------------------------------------------------------ |
| void IterativeSolver(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ |
| // code presumes e_id was an externally initialized with the initial guess |
| #ifdef __USE_BICGSTAB |
| #warning Using BiCGStab bottom solver... |
| BiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); |
| #elif __USE_CG |
| #warning Using CG bottom solver... |
| CG(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); |
| #elif __USE_CABICGSTAB |
| #warning Using Communication-Avoiding BiCGStab bottom solver... |
| // CABiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); |
| TelescopingCABiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); |
| #elif __USE_CACG |
| #warning Using Communication-Avoiding CG bottom solver... |
| CACG(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); |
| #else // just multiple smooth()'s |
| #warning Defaulting to simple bottom solver with fixed number of smooths... |
| int s,numSmoothsBottom=48; |
| for(s=0;s<numSmoothsBottom;s+=numSmooths)smooth(domain,level,e_id,R_id,a,b); |
| #endif |
| } |
| |
| int IterativeSolver_NumGrids(){ |
| // additionally number of grids required by an iterative solver... |
| #ifdef __USE_BICGSTAB |
| return(6); // BiCGStab requires additional grids r0,r,p,s,Ap,As |
| #elif __USE_CG |
| return(6); // CG requires extra grids r0,r,p,Ap |
| #elif __USE_CABICGSTAB |
| return(4+4*__CA_KRYLOV_S); // CABiCGStab requires additional grids rt,p,r,P[2s+1],R[2s]. |
| #elif __USE_CACG |
| return(4+2*__CA_KRYLOV_S); // CACG requires additional grids r0,p,r,P[s+1],R[s]. |
| #endif |
| return(0); // simply doing multiple smooths requires no extra grids |
| } |
| //------------------------------------------------------------------------------------------------------------------------------ |