Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt
@@ -1,4 +1,6 @@
 set(PROG miniGMG)
+list(APPEND CXXFLAGS -D__PRINT_NORM)
 list(APPEND LDFLAGS -lm)
-set(RUN_OPTIONS 6 2 2 2 1 1 1)
+set(FP_TOLERANCE 0.00001)
+set(RUN_OPTIONS 5 2 2 2 1 1 1)
 llvm_multisource()
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc
@@ -24,6 +24,7 @@
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k,s;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -41,7 +42,9 @@
   double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[ __beta_k] + ghosts*(1+pencil+plane);
   double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[ __lambda] + ghosts*(1+pencil+plane);
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.apply_op[level] += (uint64_t)(CycleTime()-_timeStart);
 }
 //------------------------------------------------------------------------------------------------------------------------------
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc
@@ -141,7 +141,34 @@
     if( (domain->bufferCopies[level][buffer].isFace   && exchange_faces  ) ||
         (domain->bufferCopies[level][buffer].isEdge   && exchange_edges  ) ||
         (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){
-      DoBufferCopy(domain,level,grid_id,buffer);
+      DoBufferCopy(domain,level,grid_id,buffer);
+  }}
+  _timeEnd = CycleTime();
+  domain->cycles.pack[level] += (_timeEnd-_timeStart);
+
+  // loop through MPI send buffers and post Isend's...
+  _timeStart = CycleTime();
+  #ifdef __MPI_THREAD_MULTIPLE
+  #pragma omp parallel for schedule(dynamic,1)
+  #endif
+  for(n=nMessages/2;n<nMessages;n++){
[...]
+  domain->cycles.send[level] += (_timeEnd-_timeStart);
+  #endif
+
+
+  // exchange locally... try and hide within Isend latency...
+  _timeStart = CycleTime();
+#ifdef OMP
+  #pragma omp parallel for schedule(static,1)
+#endif
+  for(buffer=domain->bufferCopy_Local_Start;buffer<domain->bufferCopy_Local_End;buffer++){
+    if( (domain->bufferCopies[level][buffer].isFace   && exchange_faces  ) ||
+        (domain->bufferCopies[level][buffer].isEdge   && exchange_edges  ) ||
+        (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){
+      DoBufferCopy(domain,level,grid_id,buffer);
   }}
   _timeEnd = CycleTime();
   domain->cycles.grid2grid[level] += (_timeEnd-_timeStart);
@@ -153,7 +180,7 @@
   MPI_Waitall(nMessages,requests_packed,status_packed);
   _timeEnd = CycleTime();
   domain->cycles.wait[level] += (_timeEnd-_timeStart);
-  #endif
+
   // unpack MPI receive buffers
   _timeStart = CycleTime();
   #pragma omp parallel for schedule(static,1)
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h
@@ -125,5 +125,6 @@
 void MGBuild(domain_type * domain, double a, double b, double h0);
 void MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm);
 void print_timing(domain_type *domain);
+void DoBufferCopy(domain_type *domain, int level, int grid_id, int buffer);
 //------------------------------------------------------------------------------------------------------------------------------
 #endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.reference_output
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.reference_output
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.reference_output
@@ -1,25 +1,46 @@
 1 MPI Tasks of 1 threads
 truncating the v-cycle at 2^3 subdomains
 creating domain... done
-  64 x 64 x 64 (per subdomain)
-  128 x 128 x 128 (per process)
-  128 x 128 x 128 (overall)
+  32 x 32 x 32 (per subdomain)
+  64 x 64 x 64 (per process)
+  64 x 64 x 64 (overall)
   1-deep ghost zones
-  allocated 246 MB
+  allocated 34 MB
 initializing alpha, beta, RHS for the ``hard problem''...
-  average value of f = 0.000000000000e+00
+  average value of f = -1.110223024625e-15
 done
 MGBuild...
-  level= 0, eigenvalue_max ~= 1.000000e+00
-  level= 1, eigenvalue_max ~= -1.000000e+00
-  level= 2, eigenvalue_max ~= -1.000000e+00
-  level= 3, eigenvalue_max ~= -1.000000e+00
-  level= 4, eigenvalue_max ~= -1.000000e+00
-  level= 5, eigenvalue_max ~= -1.000000e+00
+  level= 0, eigenvalue_max ~= 1.999996e+00
+  level= 1, eigenvalue_max ~= 1.999984e+00
+  level= 2, eigenvalue_max ~= 1.999935e+00
+  level= 3, eigenvalue_max ~= 1.999740e+00
+  level= 4, eigenvalue_max ~= 1.998958e+00
 done
-MGSolve... done
-MGSolve... done
-Error test: h = 7.812500e-03, max = 0.000000e+00
-Error test: h = 7.812500e-03, L2 = 0.000000e+00
+MGSolve...
+v-cycle= 1, norm=0.00104306838695688285 (1.043068e-03)
+v-cycle= 2, norm=0.00008508164981969001 (8.508165e-05)
+v-cycle= 3, norm=0.00001867173484947213 (1.867173e-05)
+v-cycle= 4, norm=0.00000204150498158241 (2.041505e-06)
+v-cycle= 5, norm=0.00000036859010963302 (3.685901e-07)
+v-cycle= 6, norm=0.00000004500376124219 (4.500376e-08)
+v-cycle= 7, norm=0.00000000777275418452 (7.772754e-09)
+v-cycle= 8, norm=0.00000000096441180332 (9.644118e-10)
+v-cycle= 9, norm=0.00000000016688265714 (1.668827e-10)
+v-cycle=10, norm=0.00000000002055567632 (2.055568e-11)
+done
+MGSolve...
+v-cycle= 1, norm=0.00104306838695688285 (1.043068e-03)
+v-cycle= 2, norm=0.00008508164981969001 (8.508165e-05)
+v-cycle= 3, norm=0.00001867173484947213 (1.867173e-05)
+v-cycle= 4, norm=0.00000204150498158241 (2.041505e-06)
+v-cycle= 5, norm=0.00000036859010963302 (3.685901e-07)
+v-cycle= 6, norm=0.00000004500376124219 (4.500376e-08)
+v-cycle= 7, norm=0.00000000777275418452 (7.772754e-09)
+v-cycle= 8, norm=0.00000000096441180332 (9.644118e-10)
+v-cycle= 9, norm=0.00000000016688265714 (1.668827e-10)
+v-cycle=10, norm=0.00000000002055567632 (2.055568e-11)
+done
+Error test: h = 1.562500e-02, max = 3.529710e-04
+Error test: h = 1.562500e-02, L2 = 5.559935e-05
 deallocating domain... done
 exit 0
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/misc.inc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/misc.inc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/misc.inc
@@ -20,6 +20,7 @@
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -29,7 +30,9 @@
   int dim_j = domain->subdomains[box].levels[level].dim.j;
   int dim_i = domain->subdomains[box].levels[level].dim.i;
   double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane);
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=-ghosts;k<dim_k+ghosts;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -56,6 +58,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -65,7 +68,9 @@
   int dim_j = domain->subdomains[box].levels[level].dim.j;
   int dim_i = domain->subdomains[box].levels[level].dim.i;
   double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane);
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=-ghosts;k<dim_k+ghosts;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -92,6 +96,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -103,7 +108,9 @@
   double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane);
   double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane);
   double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane);
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -129,6 +135,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -140,7 +147,9 @@
   double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane);
   double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane);
   double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane);
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -167,6 +175,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -177,7 +186,9 @@
   int dim_i = domain->subdomains[box].levels[level].dim.i;
   double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane);
   double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane);
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -206,6 +216,7 @@
 #ifdef OMP
 // FIX, schedule(static) is a stand in to guarantee reproducibility...
 #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:a_dot_b_domain) schedule(static)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -217,7 +228,9 @@
   double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
   double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane);
   double a_dot_b_box = 0.0;
+#ifdef OMP
 #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:a_dot_b_box) schedule(static)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 #ifdef __MPI
@@ -256,6 +268,7 @@
 #ifdef OMP
 // FIX, schedule(static) is a stand in to guarantee reproducibility...
 #pragma omp parallel for private(box) if(omp_across_boxes) reduction(max:max_norm) schedule(static)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -266,7 +279,9 @@
   int dim_i = domain->subdomains[box].levels[level].dim.i;
   double * __restrict__ grid = domain->subdomains[box].levels[level].grids[ grid_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
   double box_norm = 0.0;
+#ifdef OMP
 #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(max:box_norm) schedule(static)
+#endif
   for(k=0;k<dim_k;k++){
[...]
   if(box_norm>max_norm){max_norm = box_norm;}
 } // box list
-#endif
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 #ifdef __MPI
@@ -306,6 +320,7 @@
 double sum_domain = 0.0;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:sum_domain)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -316,7 +331,9 @@
   int dim_i = domain->subdomains[box].levels[level].dim.i;
   double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
   double sum_box = 0.0;
+#ifdef OMP
 #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:sum_box)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 double ncells_domain = (double)domain->dim.i*(double)domain->dim.j*(double)domain->dim.k;
@@ -356,6 +372,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -367,7 +384,9 @@
   double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
   double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point
+#ifdef OMP
 #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -391,6 +409,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -407,7 +426,9 @@
     case 1: stride = pencil;break;//j-direction
     case 2: stride =  plane;break;//k-direction
   }
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<=dim_k;k++){ // <= to ensure you do low and high faces
   for(j=0;j<=dim_j;j++){
   for(i=0;i<=dim_i;i++){
@@ -415,6 +436,5 @@
     grid_face[ijk] = 0.5*(grid_cell[ijk-stride] + grid_cell[ijk]); // simple linear interpolation
   }}}
 }
-#endif
 domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart);
 }
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc
@@ -23,6 +23,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int pencil = domain->subdomains[box].levels[level].pencil;
@@ -40,7 +41,9 @@
   double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane);
   double * __restrict__ res    = domain->subdomains[box].levels[level].grids[ res_id] + ghosts*(1+pencil+plane);
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k;k++){
[...]
 domain->cycles.residual[level] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -85,6 +87,7 @@
   int box;
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int kk,jj;
   int pencil_c = domain->subdomains[box].levels[level_c].pencil;
@@ -110,7 +113,9 @@
   double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f);
   double * __restrict__ res    = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c);
+#ifdef OMP
 #pragma omp parallel for private(kk,jj) if(omp_within_a_box) collapse(2)
+#endif
   for(kk=0;kk<dim_k_c;kk++){
[...]
 domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart);
 }
 #else
@@ -158,7 +162,9 @@
   int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize);
   int box;
+#ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int pencil_c = domain->subdomains[box].levels[level_c].pencil;
   int plane_c  = domain->subdomains[box].levels[level_c].plane;
@@ -184,7 +190,10 @@
   double * __restrict__ res = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c);
   int kk;
+#ifdef OMP
 #pragma omp parallel for private(kk) if(omp_within_a_box)
+#endif
+  for(kk=0;kk<dim_k_c;kk++){
[...]
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc
[...]
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int ghosts_c = domain->subdomains[box].levels[level_c].ghosts;
@@ -37,8 +38,10 @@
   double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[id_f] + ghosts_f*(1+pencil_f+plane_f);
   double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[id_c] + ghosts_c*(1+pencil_c+plane_c);
-
+
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k_c;k++){
[...]
 domain->cycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart);
 }
@@ -71,6 +73,7 @@
 #ifdef OMP
 #pragma omp parallel for private(box) if(omp_across_boxes)
+#endif
 for(box=0;box<domain->subdomains_per_rank;box++){
   int i,j,k;
   int ghosts_c = domain->subdomains[box].levels[level_c].ghosts;
@@ -103,7 +106,9 @@
   // restrict beta_j (== face in ik)
   beta_f = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f;
   beta_c = domain->subdomains[box].levels[level_c].grids[__beta_j] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c;
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k_c;k++){
[...]
   beta_f = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f;
   beta_c = domain->subdomains[box].levels[level_c].grids[__beta_k] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c;
+#ifdef OMP
 #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2)
+#endif
   for(k=0;k<dim_k_c;k++){
[...]
 domain->cycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart);
 }