Index: test-suite/trunk/LICENSE.TXT =================================================================== --- test-suite/trunk/LICENSE.TXT +++ test-suite/trunk/LICENSE.TXT @@ -86,6 +86,7 @@ miniAMR: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR Pathfinder: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/Pathfinder XSBench: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench +miniGMG: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG CLAMR: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR HPCCG: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG PENNANT: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(XSBench) add_subdirectory(Pathfinder) add_subdirectory(miniAMR) +add_subdirectory(miniGMG) Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile @@ -3,6 +3,7 @@ LEVEL = ../../.. PARALLEL_DIRS = XSBench \ miniAMR \ - Pathfinder + Pathfinder \ + miniGMG include $(LEVEL)/Makefile.programs Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeLists.txt @@ -0,0 +1,4 @@ +set(PROG miniGMG) +list(APPEND LDFLAGS -lm) +set(RUN_OPTIONS 6 2 2 2 1 1 1) +llvm_multisource() Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/LICENSE =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/LICENSE +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/LICENSE @@ -0,0 +1,16 @@ +*** License Agreement *** +miniGMG, Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved." + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +(1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +(2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +(3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the features, functionality or performance of the source code ("Enhancements") to anyone; however, if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley National Laboratory, without imposing a separate written license agreement for such Enhancements, then you hereby grant the following license: a non-exclusive, royalty-free perpetual license to install, use, modify, prepare derivative works, incorporate into other computer software, distribute, and sublicense such enhancements or derivative works thereof, in binary and source code form. +********* + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/Makefile =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/Makefile +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/Makefile @@ -0,0 +1,6 @@ +LEVEL = ../../../.. + +PROG = miniGMG +LDFLAGS = -lm +RUN_OPTIONS = 6 2 2 2 1 1 1 +include $(LEVEL)/MultiSource/Makefile.multisrc Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/apply_op.inc @@ -0,0 +1,63 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include "timer.h" +#include "mg.h" +#include "box.h" + +void apply_op(domain_type * domain, int level, int Ax_id, int x_id, double a, double b){ // y=Ax or y=D^{-1}Ax = lambda[]Ax + // exchange the boundary of x in preparation for Ax + // note, Ax (one power) with a 7-point stencil requires only the faces + exchange_boundary(domain,level,x_id,1,0,0); + + // now do Ax proper... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; + +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k,s; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ x = domain->subdomains[box].levels[level].grids[ x_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ Ax = domain->subdomains[box].levels[level].grids[ Ax_id] + ghosts*(1+pencil+plane); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[ __alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[ __beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[ __beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[ __beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[ __lambda] + ghosts*(1+pencil+plane); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.apply_op[level] += (uint64_t)(CycleTime()-_timeStart); +} +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/box.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/box.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/box.h @@ -0,0 +1,29 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef BOX_H +#define BOX_H + +typedef struct { + double h; // h at this level (==hLevel == h0 * (double)(1< +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#include "defines.h" +#include "box.h" +//------------------------------------------------------------------------------------------------------------------------------ +int RandomPadding=-1; +//------------------------------------------------------------------------------------------------------------------------------ +int create_box(box_type *box, int numGrids, int low_i, int low_j, int low_k, int dim_i, int dim_j, int dim_k, int ghosts){ + uint64_t memory_allocated = 0; + box->numGrids = numGrids; + box->low.i = low_i; + box->low.j = low_j; + box->low.k = low_k; + box->dim.i = dim_i; + box->dim.j = dim_j; + box->dim.k = 
dim_k; + box->dim_with_ghosts.i = dim_i+2*ghosts; + box->dim_with_ghosts.j = dim_j+2*ghosts; + box->dim_with_ghosts.k = dim_k+2*ghosts; + box->ghosts = ghosts; + box->pencil = (dim_i+2*ghosts); + box->plane = (dim_i+2*ghosts)*(dim_j+2*ghosts); + + // nominally the stencil assumes VL is a perfect multiple of $U and unrolls without cleanup. However, this means you can walk $U-1 beyond the last point you should update and start updating the ghost zone of the next plane. +//int MaxUnrolling = 8; // 2-way SIMD x unroll by 4 = 8/thread + int MaxUnrolling = 16; // 4-way SIMD x unroll by 4 = 16/thread +//int MaxUnrolling = 32; // 8-way SIMD x unroll by 4 = 32/thread + int paddingToAvoidStencilCleanup = 0; + if(box->pencil+1 < (MaxUnrolling-1)){paddingToAvoidStencilCleanup = (MaxUnrolling-1)-(box->pencil+1);} + +// round each plane up to ensure SIMD alignment within each plane (not pencil). Thus if ijk is aligned, ijkk+/-plane is aligned +//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0xF) & ~0xF; // multiple of 128 bytes + box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x7) & ~0x7; // multiple of 64 bytes (required for MIC) +//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x3) & ~0x3; // multiple of 32 bytes (required for AVX/QPX) +//box->plane =( ((dim_j+2*ghosts)*box->pencil)+paddingToAvoidStencilCleanup+0x1) & ~0x1; // multiple of 16 bytes (required for SSE) +//printf("%2d^2 = %5d -> %5d\n",box->dim_with_ghosts.i,(dim_j+2*ghosts)*box->pencil,box->plane); + + box->volume = (dim_k+2*ghosts)*box->plane; + //if(dim_i>=32){while( ((box->volume % 2048) != 288) )box->volume+=8;} + //if(dim_i>=32){while( ((box->volume % 512) != 72) )box->volume+=8;} // 32KB / 8way / 8bytes / 7 arrays + if(dim_i>=32){while( ((box->volume % 512) != 64) )box->volume+=8;} // 32KB / 8way / 8bytes / 8 arrays + //if(dim_i>=32){while( ((box->volume % 512) != 40) )box->volume+=8;} // 32KB / 8way / 8bytes / 13 arrays + //if(dim_i>=32){while( ((box->volume % 256) != 40) )box->volume+=8;} // 16KB / 8way / 8bytes / 7 arrays + //if(dim_i>=32){while( ((box->volume % 256) != 32) )box->volume+=8;} // 16KB / 8way / 8bytes / 8 arrays + + //if(RandomPadding<0){srand(time(NULL));RandomPadding = 8*((int)rand()%128);} + //if(dim_i>=32){box->volume+=RandomPadding;} + + + // allocate pointers to grids and grids themselves + posix_memalign((void**)&(box->grids),64,box->numGrids*sizeof(double*)); + memory_allocated += box->numGrids*sizeof(double*); +#if 0 + int g;for(g=0;gnumGrids;g++){ + posix_memalign((void**)&(box->grids[g]),64,box->volume*sizeof(double)); + memset(box->grids[g],0,box->volume*sizeof(double)); + memory_allocated += box->volume*sizeof(double); + } +#else + double * tmpbuf; + posix_memalign((void**)&tmpbuf,64,box->volume*box->numGrids*sizeof(double)); + memset( tmpbuf,0,box->volume*box->numGrids*sizeof(double)); + memory_allocated += box->volume*box->numGrids*sizeof(double); + int g;for(g=0;gnumGrids;g++){ + box->grids[g] = tmpbuf + g*box->volume; + //printf("box->grids[%2d] = 0x%016llx\n",g,(uint64_t)box->grids[g] & (0x3<<3)); + } +#endif + + // allocate RedBlackMask array for a plane... 
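
As a rough illustration of the plane rounding above (a worked check with assumed numbers, not part of the patch): for a hypothetical cubic box with dim = 64 and ghosts = 1, and the same MaxUnrolling = 16,

    pencil = 64 + 2*1                      = 66 doubles
    raw    = (64 + 2*1) * 66               = 4356 doubles
    pad    = 0                               (pencil+1 = 67 is not < MaxUnrolling-1 = 15)
    plane  = (4356 + 0 + 0x7) & ~0x7       = 4360 doubles  (a multiple of 8 doubles = 64 bytes)

so every plane starts on a 64-byte boundary. The RedBlack_64bMask and RedBlack_FP arrays allocated next each hold exactly one such plane and are reused for every k.
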
+ posix_memalign((void**)&(box->RedBlack_64bMask),64,box->plane*sizeof(uint64_t)); + memset(box->RedBlack_64bMask, 0,box->plane*sizeof(uint64_t)); + memory_allocated += box->plane*sizeof(uint64_t); + posix_memalign((void**)&(box->RedBlack_FP[0] ),64,box->plane*sizeof(double )); + memset(box->RedBlack_FP[0] , 0,box->plane*sizeof(double )); + memory_allocated += box->plane*sizeof(double ); + posix_memalign((void**)&(box->RedBlack_FP[1] ),64,box->plane*sizeof(double )); + memset(box->RedBlack_FP[1] , 0,box->plane*sizeof(double )); + memory_allocated += box->plane*sizeof(double ); + + // initialize red/black... could do ij loop with ((i%pencil)^(j/pencil)&0x1) + int i,j; + for(j=0-ghosts;jdim.j+ghosts;j++){ + for(i=0-ghosts;idim.i+ghosts;i++){ + int ij = (i+ghosts) + (j+ghosts)*box->pencil; + if((i^j)&0x1)box->RedBlack_64bMask[ij]=~0;else box->RedBlack_64bMask[ij]= 0; + if((i^j)&0x1)box->RedBlack_FP[0][ij]=1.0;else box->RedBlack_FP[0][ij]=0.0; + if((i^j)&0x1)box->RedBlack_FP[1][ij]=0.0;else box->RedBlack_FP[1][ij]=1.0; + }} + // done... + return(memory_allocated); +} + +void destroy_box(box_type *box){ +#if 0 + int g;for(g=0;gnumGrids;g++){ + free(box->grids[g]); + } +#else + free(box->grids[0]); +#endif + free(box->grids); +} + + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/defines.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/defines.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/defines.h @@ -0,0 +1,52 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +// Helmholtz ~ Laplacian() = a*alpha*Identity - b*Divergence*beta*Gradient +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef DEFINES_H +#define DEFINES_H +#define __u 0 // = what we're eventually solving for (u), cell centered +#define __f 1 // = original right-hand side (Au=f), cell centered +#define __alpha 2 // cell centered constant +#define __beta 3 // cell centered constant +//------------------------------------------------------------------------------------------------------------------ +#define __lambda 4 // cell centered constant (inverse of the diagonal) +#define __beta_i 5 // face constant (n.b. element 0 is the left face of the ghost zone element) +#define __beta_j 6 // face constant (n.b. element 0 is the back face of the ghost zone element) +#define __beta_k 7 // face constant (n.b. 
element 0 is the bottom face of the ghost zone element) +#define __ee 8 // = used for correction (ee) in residual correction form, cell centered +#define __f_minus_Av 9 // = used for initial right-hand side (f-Av) in residual correction form, cell centered +#define __temp 10 // = used for unrestricted residual (r), cell centered +#define __u_exact 11 +//------------------------------------------------------------------------------------------------------------------ +#define __NumGrids 12 // total number of grids and the starting location for bottom solver grids +//------------------------------------------------------------------------------------------------------------------ + +// FIX, these should be adhoc allocated starting with __NumGrids +// For CG bottom solver +#define __r 13 +#define __p 14 +#define __Ap 16 + +// For BiCGStab bottom solver +#define __r0 12 +#define __r 13 +#define __p 14 +#define __s 15 +#define __Ap 16 +#define __As 17 + +// For CABiCGStab bottom solver +#define __rt 12 +#define __r 13 +#define __p 14 +#define __PRrt 15 // starting location for the 2S+1 extra p[]'s, 2S extra r[]'s, and rt + +// For CACG bottom solver +#define __r0 12 +#define __r 13 +#define __p 14 +#define __PRrt 15 // starting location for the S+1 extra p[]'s, S extra r[]'s, and rt +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/exchange_boundary.inc @@ -0,0 +1,172 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +inline void DoBufferCopy(domain_type *domain, int level, int grid_id, int buffer){ + // copy 3D array from read_i,j,k of read[] to write_i,j,k in write[] + int dim_i = domain->bufferCopies[level][buffer].dim.i; + int dim_j = domain->bufferCopies[level][buffer].dim.j; + int dim_k = domain->bufferCopies[level][buffer].dim.k; + + int read_i = domain->bufferCopies[level][buffer].read.i; + int read_j = domain->bufferCopies[level][buffer].read.j; + int read_k = domain->bufferCopies[level][buffer].read.k; + int read_pencil = domain->bufferCopies[level][buffer].read.pencil; + int read_plane = domain->bufferCopies[level][buffer].read.plane; + + int write_i = domain->bufferCopies[level][buffer].write.i; + int write_j = domain->bufferCopies[level][buffer].write.j; + int write_k = domain->bufferCopies[level][buffer].write.k; + int write_pencil = domain->bufferCopies[level][buffer].write.pencil; + int write_plane = domain->bufferCopies[level][buffer].write.plane; + + double * __restrict__ read = domain->bufferCopies[level][buffer].read.ptr; + double * __restrict__ write = domain->bufferCopies[level][buffer].write.ptr; + if(domain->bufferCopies[level][buffer].read.box >=0) read = domain->subdomains[ domain->bufferCopies[level][buffer].read.box].levels[level].grids[grid_id]; + if(domain->bufferCopies[level][buffer].write.box>=0)write = 
domain->subdomains[domain->bufferCopies[level][buffer].write.box].levels[level].grids[grid_id]; + + int i,j,k,read_ijk,write_ijk; + if(dim_i==1){ // be smart and don't have an inner loop from 0 to 1 + for(k=0;ksubdomains_per_rank_in.i*domain->subdomains[0].levels[level].dim.i;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts; + if(dj==0)sizes_all[n]*=domain->subdomains_per_rank_in.j*domain->subdomains[0].levels[level].dim.j;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts; + if(dk==0)sizes_all[n]*=domain->subdomains_per_rank_in.k*domain->subdomains[0].levels[level].dim.k;else sizes_all[n]*=domain->subdomains[0].levels[level].ghosts; + }}} + + // enumerate a packed list of messages... starting with receives... + for(n=0;n<27;n++)if(exchange[26-n] && (domain->rank_of_neighbor[26-n] != domain->rank) ){ + buffers_packed[nMessages] = domain->recv_buffer[26-n]; + sizes_packed[nMessages] = sizes_all[26-n]; + ranks_packed[nMessages] = domain->rank_of_neighbor[26-n]; + tags_packed[nMessages] = n; + nMessages++; + } + // enumerate a packed list of messages... continuing with sends... + for(n=0;n<27;n++)if(exchange[n] && (domain->rank_of_neighbor[n] != domain->rank) ){ + buffers_packed[nMessages] = domain->send_buffer[n]; + sizes_packed[nMessages] = sizes_all[n]; + ranks_packed[nMessages] = domain->rank_of_neighbor[n]; + tags_packed[nMessages] = n; + nMessages++; + } + + // loop through packed list of MPI receives and prepost Irecv's... + _timeStart = CycleTime(); + #ifdef __MPI_THREAD_MULTIPLE + #pragma omp parallel for schedule(dynamic,1) + #endif + for(n=0;ncycles.recv[level] += (_timeEnd-_timeStart); + + // pack MPI send buffers... + _timeStart = CycleTime(); + #pragma omp parallel for schedule(static,1) + for(buffer=domain->bufferCopy_Pack_Start;bufferbufferCopy_Pack_End;buffer++){ + if( (domain->bufferCopies[level][buffer].isFace && exchange_faces ) || + (domain->bufferCopies[level][buffer].isEdge && exchange_edges ) || + (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){ + DoBufferCopy(domain,level,grid_id,buffer); + }} + _timeEnd = CycleTime(); + domain->cycles.grid2grid[level] += (_timeEnd-_timeStart); + + + // wait for MPI to finish... 
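
For orientation, exchange_boundary follows the usual communication/computation overlap pattern; a condensed sketch (the send and local-copy steps are inferred from the buffer and index machinery declared in mg.h rather than taken verbatim from the listing):

    1. compute per-direction message sizes; enumerate receives over the mirrored
       direction (26-n) so each posted tag matches the tag of the neighbor's send
    2. prepost MPI_Irecv for every off-node neighbor
    3. DoBufferCopy over [bufferCopy_Pack_Start,  bufferCopy_Pack_End)   -- pack send buffers
    4. issue the sends and perform the same-rank copies in
       [bufferCopy_Local_Start, bufferCopy_Local_End) while messages are in flight
    5. MPI_Waitall on all posted requests
    6. DoBufferCopy over [bufferCopy_Unpack_Start, bufferCopy_Unpack_End) -- unpack receives

Each phase is accumulated into its own per-level cycle counter in domain->cycles.
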
+ #ifdef __MPI + _timeStart = CycleTime(); + MPI_Waitall(nMessages,requests_packed,status_packed); + _timeEnd = CycleTime(); + domain->cycles.wait[level] += (_timeEnd-_timeStart); + #endif + // unpack MPI receive buffers + _timeStart = CycleTime(); + #pragma omp parallel for schedule(static,1) + for(buffer=domain->bufferCopy_Unpack_Start;bufferbufferCopy_Unpack_End;buffer++){ + if( (domain->bufferCopies[level][buffer].isFace && exchange_faces ) || + (domain->bufferCopies[level][buffer].isEdge && exchange_edges ) || + (domain->bufferCopies[level][buffer].isCorner && exchange_corners) ){ + DoBufferCopy(domain,level,grid_id,buffer); + }} + _timeEnd = CycleTime(); + domain->cycles.unpack[level] += (_timeEnd-_timeStart); + #endif + + + domain->cycles.communication[level] += (uint64_t)(CycleTime()-_timeCommunicationStart); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/interpolation.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/interpolation.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/interpolation.inc @@ -0,0 +1,217 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +// piecewise constant interpolation... +// +// +-------+ +---+---+ +// | | | x | x | +// | x | -> +---+---+ +// | | | x | x | +// +-------+ +---+---+ +// +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_constant(domain_type * domain, int level_f, double prescale_f, int id_f, int id_c){ // id_f = prescale*id_f + I_{2h}^{h}(id_c) + int level_c = level_f+1; + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + + double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[id_f] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[id_c] + ghosts_c*(1+pencil_c+plane_c); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + grid_f[ijk_f] = prescale_f*grid_f[ijk_f] + grid_c[ijk_c]; + }}} + } + domain->cycles.interpolation[level_f] += (uint64_t)(CycleTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +// piecewise linear interpolation... +// +//------------------------------------------------------------------------------------------------------------------------------ +void interpolation_linear(domain_type * domain, int level_f, double prescale_f, int id_f, int id_c){ // id_f = prescale*id_f + I_{2h}^{h}(id_c) + int level_c = level_f+1; + exchange_boundary(domain,level_c,id_c,1,1,1); // linear needs corners/edges in the coarse grid. + + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + + double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[ id_f] + ghosts_f*(1 + pencil_f + plane_f); // [0] is first non-ghost zone element + double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[ id_c] + ghosts_c*(1 + pencil_c + plane_c); + + // FIX what about dirichlet boundary conditions ??? + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + + // ----------------------------------------------------------------------------------------------------------------------- + // Piecewise Quadratic Interpolation + // ----------------------------------------------------------------------------------------------------------------------- + // define parabola f(x) = ax^2 + bx + c through three coarse grid cells in i-direction... x=-1, x=0, and x=1 + // interpolate to (-0.25,j,k) and (+0.25,j,k) + // combine into 3 dimensions + // + // +-------+-------+-------+ + // | o | o | o | + // +-------+-------+-------+ + // . + // . + // interpolation + // . + // . 
+ // +---+---+---+---+---+---+ + // | | | x | y | | | + // +---+---+---+---+---+---+ + // + #warning using 27pt stencil for piecewise-quadratic interpolation + double xm= 0.156250,x0=0.937500,xp=-0.093750; + double ym= 0.156250,y0=0.937500,yp=-0.093750; + double zm= 0.156250,z0=0.937500,zp=-0.093750; + if(i&0x1){xm=-0.093750;x0=0.937500;xp= 0.156250;} + if(j&0x1){ym=-0.093750;y0=0.937500;yp= 0.156250;} + if(k&0x1){zm=-0.093750;z0=0.937500;zp= 0.156250;} + grid_f[ijk_f] = + prescale_f*grid_f[ijk_f ] + + + zm*ym*xm*grid_c[ijk_c-1-pencil_c-plane_c] + + zm*ym*x0*grid_c[ijk_c -pencil_c-plane_c] + + zm*ym*xp*grid_c[ijk_c+1-pencil_c-plane_c] + + zm*y0*xm*grid_c[ijk_c-1 -plane_c] + + zm*y0*x0*grid_c[ijk_c -plane_c] + + zm*y0*xp*grid_c[ijk_c+1 -plane_c] + + zm*yp*xm*grid_c[ijk_c-1+pencil_c-plane_c] + + zm*yp*x0*grid_c[ijk_c +pencil_c-plane_c] + + zm*yp*xp*grid_c[ijk_c+1+pencil_c-plane_c] + + + z0*ym*xm*grid_c[ijk_c-1-pencil_c ] + + z0*ym*x0*grid_c[ijk_c -pencil_c ] + + z0*ym*xp*grid_c[ijk_c+1-pencil_c ] + + z0*y0*xm*grid_c[ijk_c-1 ] + + z0*y0*x0*grid_c[ijk_c ] + + z0*y0*xp*grid_c[ijk_c+1 ] + + z0*yp*xm*grid_c[ijk_c-1+pencil_c ] + + z0*yp*x0*grid_c[ijk_c +pencil_c ] + + z0*yp*xp*grid_c[ijk_c+1+pencil_c ] + + + zp*ym*xm*grid_c[ijk_c-1-pencil_c+plane_c] + + zp*ym*x0*grid_c[ijk_c -pencil_c+plane_c] + + zp*ym*xp*grid_c[ijk_c+1-pencil_c+plane_c] + + zp*y0*xm*grid_c[ijk_c-1 +plane_c] + + zp*y0*x0*grid_c[ijk_c +plane_c] + + zp*y0*xp*grid_c[ijk_c+1 +plane_c] + + zp*yp*xm*grid_c[ijk_c-1+pencil_c+plane_c] + + zp*yp*x0*grid_c[ijk_c +pencil_c+plane_c] + + zp*yp*xp*grid_c[ijk_c+1+pencil_c+plane_c] ; + /* + // ----------------------------------------------------------------------------------------------------------------------- + // Piecewise-Linear Interpolation + // ----------------------------------------------------------------------------------------------------------------------- + // define line f(x) = bx + c through two coarse grid cells in i-direction... x=+/-1 and x=0 + // interpolate to either (-0.25) or (+0.25) + // combine into 3 dimensions + // + // +-------+-------+ + // | o | o | + // +-------+-------+ + // . + // . + // interpolation + // . + // . + // +---+---+---+---+ + // | | x | y | | + // +---+---+---+---+ + // + #warning using 8pt stencil for piecewise-linear interpolation + int delta_i= -1;if(i&0x1)delta_i= 1; // i.e. even points look backwards while odd points look forward + int delta_j=-pencil_c;if(j&0x1)delta_j=pencil_c; + int delta_k= -plane_c;if(k&0x1)delta_k= plane_c; + grid_f[ijk_f] = + prescale_f*grid_f[ijk_f ] + + 0.421875*grid_c[ijk_c ] + + 0.140625*grid_c[ijk_c +delta_k] + + 0.140625*grid_c[ijk_c +delta_j ] + + 0.046875*grid_c[ijk_c +delta_j+delta_k] + + 0.140625*grid_c[ijk_c+delta_i ] + + 0.046875*grid_c[ijk_c+delta_i +delta_k] + + 0.046875*grid_c[ijk_c+delta_i+delta_j ] + + 0.015625*grid_c[ijk_c+delta_i+delta_j+delta_k] ; + */ + /* + // ----------------------------------------------------------------------------------------------------------------------- + // Piecewise-Linear Interpolation + // ----------------------------------------------------------------------------------------------------------------------- + #warning using 7pt stencil for piecewise-linear interpolation... doesn't given 2nd order FMG?? 
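
For the active 27-point quadratic path above, the weights 0.156250 / 0.937500 / -0.093750 can be checked by fitting a parabola through three coarse cells and evaluating it a quarter cell off the center (a worked derivation, not part of the patch):

    f(x) = a x^2 + b x + c   with  f(-1) = v_m,  f(0) = v_0,  f(+1) = v_p
      =>  a = (v_m + v_p)/2 - v_0,   b = (v_p - v_m)/2,   c = v_0
    f(-1/4) = a/16 - b/4 + c = (5/32) v_m + (15/16) v_0 - (3/32) v_p
            = 0.15625 v_m + 0.9375 v_0 - 0.09375 v_p

The mirrored weights apply at x = +1/4 (the i&0x1 branch swaps xm and xp), and the 3-D weights are the tensor product of the x, y, and z factors, which is exactly the 27-term sum used above.
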
+ double coefi = -0.125;if(i&0x1)coefi = 0.125; + double coefj = -0.125;if(j&0x1)coefj = 0.125; + double coefk = -0.125;if(k&0x1)coefk = 0.125; + grid_f[ijk_f] = prescale_f*grid_f[ijk_f ] + + grid_c[ijk_c ] + + coefi*(grid_c[ijk_c+ 1]-grid_c[ijk_c- 1]) + + coefj*(grid_c[ijk_c+pencil_c]-grid_c[ijk_c-pencil_c]) + + coefk*(grid_c[ijk_c+ plane_c]-grid_c[ijk_c- plane_c]); + */ + }}} + } + domain->cycles.interpolation[level_f] += (uint64_t)(CycleTime()-_timeStart); +} +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/jacobi.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/jacobi.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/jacobi.inc @@ -0,0 +1,86 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +#include "defines.h" +//------------------------------------------------------------------------------------------------------------------------------ +void smooth(domain_type * domain, int level, int phi_id, int rhs_id, double a, double b){ + if(numSmooths&1){ + printf("error - numSmooths must be even...\n"); + exit(0); + } + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box,s; + int ghosts = domain->ghosts; + double TwoThirds = 2.0/3.0; + + // if communication-avoiding, need RHS for stencils in ghost zones + if(ghosts>1)exchange_boundary(domain,level,rhs_id,1,1,1); + + for(s=0;s1,ghosts>1); // corners/edges if doing communication-avoiding... + else exchange_boundary(domain,level,__temp,1,ghosts>1,ghosts>1); // corners/edges if doing communication-avoiding... + + // now do ghosts communication-avoiding smooths on each box... 
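
For reference, the relaxation applied in the loop below is weighted Jacobi: with lambda holding the inverse of the diagonal D^{-1} (see defines.h) and TwoThirds = 2/3, each sub-sweep computes, per cell,

    u_new = u + (2/3) * D^{-1} * (f - A u)

ping-ponging between grids[phi_id] and grids[__temp] (which is why numSmooths must be even), and when ghosts > 1 it also relaxes ghostsToOperateOn cells deep into the ghost region so that several sub-sweeps can be taken per boundary exchange (the communication-avoiding variant).
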
+ uint64_t _timeStart = CycleTime(); + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k,ss; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ rhs = domain->subdomains[box].levels[level].grids[ rhs_id] + ghosts*(1+pencil+plane); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[__lambda] + ghosts*(1+pencil+plane); + + int ghostsToOperateOn=ghosts-1; + for(ss=s;sssubdomains[box].levels[level].grids[ phi_id] + ghosts*(1+pencil+plane); + phi_new= domain->subdomains[box].levels[level].grids[ __temp] + ghosts*(1+pencil+plane);} + else{phi = domain->subdomains[box].levels[level].grids[ __temp] + ghosts*(1+pencil+plane); + phi_new= domain->subdomains[box].levels[level].grids[ phi_id] + ghosts*(1+pencil+plane);} + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0-ghostsToOperateOn;kcycles.smooth[level] += (uint64_t)(CycleTime()-_timeStart); + } // s-loop +} + +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/lambda.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/lambda.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/lambda.inc @@ -0,0 +1,73 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include "timer.h" +#include "mg.h" +#include "defines.h" +//------------------------------------------------------------------------------------------------------------------------------ +void rebuild_lambda(domain_type * domain, int level, double a, double b){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; + + double dominant_eigenvalue = -1.0; + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(max:dominant_eigenvalue) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ lambda = domain->subdomains[box].levels[level].grids[__lambda] + ghosts*(1+pencil+plane); + double box_eigenvalue = -1.0; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) reduction(max:box_eigenvalue) + for(k=0;kbox_eigenvalue)box_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue + }}} + if(box_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = box_eigenvalue;} + } + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = dominant_eigenvalue; + MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + if(domain->rank==0){if(level==0)printf("\n");printf(" level=%2d, eigenvalue_max ~= %e\n",level,dominant_eigenvalue);fflush(stdout);} + domain->dominant_eigenvalue_of_DinvA[level] = dominant_eigenvalue; +} + + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/matmul.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/matmul.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/matmul.inc @@ -0,0 +1,71 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void matmul_grids(domain_type * domain, int level, double *C, int * id_A, int * id_B, int rows, int cols, int A_equals_B_transpose){ + // *id_A = m grid_id's 
(conceptually pointers to the rows of a m x domain->subdomains_per_rank*volume matrix) + // *id_B = n grid_id's (conceptually pointers to the columns of a domain->subdomains_per_rank*volume matrix x n) + // *C is a mxn matrix where C[rows][cols] = dot(id_A[rows],id_B[cols]) + + // FIX, id_A and id_B are likely the same and thus C[][] will be symmetric (modulo missing row?) + // if(A_equals_B_transpose && (cols>=rows)) then use id_B and only run for nn>=mm // common case for s-step Krylov methods + // C_is_symmetric && cols< rows (use id_A) + int omp_across_matrix = 1; + int omp_within_a_box = 0; + int mm,nn; + + + uint64_t _timeStart = CycleTime(); + #pragma omp parallel for private(mm,nn) if(omp_across_matrix) collapse(2) schedule(static,1) + for(mm=0;mm=mm){ // upper triangular + int box; + double a_dot_b_domain = 0.0; + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_A[mm]] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_B[nn]] + ghosts*(1+pencil+plane); + double a_dot_b_box = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:a_dot_b_box) + for(k=0;kcycles.blas3[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + double *send_buffer = (double*)malloc(rows*cols*sizeof(double)); + for(mm=0;mmcycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + free(send_buffer); + #endif + +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.h @@ -0,0 +1,129 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef MG_H +#define MG_H +#ifdef __MPI +#include +#endif +#include "box.h" +//------------------------------------------------------------------------------------------------------------------------------ +#define MaxLevels 10 +//------------------------------------------------------------------------------------------------------------------------------ +#define __DEFAULT_BOTTOM_NORM 1e-3 +//------------------------------------------------------------------------------------------------------------------------------ +#define numSmooths 4 +//------------------------------------------------------------------------------------------------------------------------------ +#define __BOUNDARY_PERIODIC 0 +#define __BOUNDARY_DIRICHLET 1 +#define __BOUNDARY_NEUMANN 2 
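
Referring back to matmul_grids above: it builds the small Gram-like matrix needed by the s-step (communication-avoiding) Krylov bottom solvers listed in defines.h, roughly

    C[mm][nn] = sum over local boxes and interior cells of  grid_idA[mm][ijk] * grid_idB[nn][ijk]

computed only for the upper triangle when A_equals_B_transpose and cols >= rows, and then summed across ranks with a single element-wise all-reduce over the rows*cols entries so every rank ends up holding the same matrix.
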
+//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { +//double scale; // +1.0 for ghost zone exchanges +// // -1.0 for dirichlet boundary conditions in finite volume +// // 0.0 for dirichlet boundary conditions in finite difference +// // +1.0 for neumann boundary conditions in finite volume or difference + int isFace,isEdge,isCorner; // used to bypass certain computations + struct {int i, j, k;}dim; // dimensions of the buffer to copy + struct {int box, i, j, k, pencil, plane;double * __restrict__ ptr;}read,write; // coordinates in the read grid to extract the buffer, + // and coordinates in the write grid to insert the buffer + // if read/write.box<0, then use write/read.ptr, + // otherwise domain->subdomains[.box].levels[level].grids[grid_id] +} bufferCopy_type; +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + int rank; // MPI rank of remote process + int local_index; // index in subdomains[] on remote process + #ifdef __MPI + struct{int buf;struct{int faces,edges,corners;}offset;}send; // i.e. calculate offset as faceSize*faces + edgeSize*edges + cornerSize*corners + struct{int buf;struct{int faces,edges,corners;}offset;}recv; // i.e. calculate offset as faceSize*faces + edgeSize*edges + cornerSize*corners + #endif +} neighbor_type; + +//------------------------------------------------------------------------------------------------------------------------------ +typedef struct { + struct {int i, j, k;}low; // global coordinates of the first (non-ghost) element of subdomain at the finest resolution + struct {int i, j, k;}dim; // subdomain dimensions at finest resolution + int numLevels; // number of levels in MG v-cycle. 1=no restrictions + int ghosts; // ghost zone depth + neighbor_type neighbors[27]; // MPI rank and local index (on remote process) of each subdomain neighboring this subdomain + box_type * levels; // pointer to an array of all coarsenings of this box +} subdomain_type; + + +//------------------------------------------------------------------------------------------------------------------------------ + +typedef struct { + // timing information... + struct { + uint64_t smooth[MaxLevels]; + uint64_t apply_op[MaxLevels]; + uint64_t residual[MaxLevels]; + uint64_t restriction[MaxLevels]; + uint64_t interpolation[MaxLevels]; + uint64_t communication[MaxLevels]; + uint64_t pack[MaxLevels]; + uint64_t grid2grid[MaxLevels]; + uint64_t unpack[MaxLevels]; + uint64_t recv[MaxLevels]; + uint64_t send[MaxLevels]; + uint64_t wait[MaxLevels]; + uint64_t blas1[MaxLevels]; + uint64_t blas3[MaxLevels]; + uint64_t collectives[MaxLevels]; + uint64_t Total[MaxLevels]; + uint64_t MGBuild; // total time spent building the coefficients... 
+ uint64_t vcycles; // total time spent in all vcycles (all CycleMG) + uint64_t MGSolve; // total time spent in MGSolve + }cycles; + int vcycles_performed, MGSolves_performed, Krylov_iterations, CAKrylov_formations_of_G; + + int rank_of_neighbor[27]; // = MPI rank of the neighbors of this process's subdomains (presumes rectahedral packing) + #ifdef __MPI + double * __restrict__ send_buffer[27]; // = MPI send buffers (one per neighbor) + double * __restrict__ recv_buffer[27]; // = MPI recieve buffer (one per neighbor) + struct{int faces,edges,corners;} buffer_size[27]; // = MPI buffer size (one per neighbor) in the units of faces/edges/corners +//int buffer_size[MaxLevels][27]; // = MPI buffer size (one per neighbor) in doubles + #endif + + bufferCopy_type * bufferCopies[MaxLevels]; // array of pointers to list of bufferCopy's that must be performed to realize a boundary exchange + int bufferCopy_Pack_Start, bufferCopy_Pack_End; // index into above list to pack the MPI buffers + int bufferCopy_Local_Start, bufferCopy_Local_End; // index into above list to perform a local exchange (hide within waitall) + int bufferCopy_Unpack_Start,bufferCopy_Unpack_End; // index into above list to unpack the MPI buffers + + // n.b. i=unit stride + struct {int i, j, k;}dim; // global dimensions at finest resolution + struct {int i, j, k;}ranks_in; // number of MPI ranks in i,j,k + struct {int i, j, k;}subdomains_per_rank_in; // number of subdomains in i,j,k + struct {int i, j, k;}subdomains_in; // total number of subdomains in i,j,k across the entire domain + struct {int i, j, k;}boundary_condition; // domain boundary condition in i,j,k directions (e.g. __BOUNDARY_PERIODIC) + int rank; // MPI rank of this process + int subdomains_per_rank; // number of subdomains owned by this process + int numLevels; // number of levels in MG v-cycle. 
1=no restrictions + int numGrids; // number of grids (variables) + int ghosts; // ghost zone depth + double h[MaxLevels]; // h at each level + double dominant_eigenvalue_of_DinvA[MaxLevels]; // (estimate) for the dominant(largest) eigenvalue at each level of the operator D^{-1}A = lambda*helmholtz + subdomain_type * subdomains; // pointer to a list of all subdomains owned by this process +} domain_type; + +//------------------------------------------------------------------------------------------------------------------------------ + int create_subdomain(subdomain_type * box, + int subdomain_low_i, int subdomain_low_j, int subdomain_low_k, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int numGrids, int ghosts, int numLevels); +void destroy_domain(domain_type * domain); + int create_domain(domain_type * domain, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int subdomains_per_rank_in_i, int subdomains_per_rank_in_j, int subdomains_per_rank_in_k, + int ranks_in_i, int ranks_in_j, int ranks_in_k, + int rank, + int *boundary_conditions, + int numGrids, int ghosts, int numLevels); +void MGBuild(domain_type * domain, double a, double b, double h0); +void MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm); +void print_timing(domain_type *domain); +//------------------------------------------------------------------------------------------------------------------------------ +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/mg.c @@ -0,0 +1,798 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef __MPI +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "timer.h" +#include "defines.h" +#include "box.h" +#include "mg.h" +#include "operators.h" +#include "solver.h" +//------------------------------------------------------------------------------------------------------------------------------ +int create_subdomain(subdomain_type * box, int subdomain_low_i, int subdomain_low_j, int subdomain_low_k, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int numGrids, int ghosts, int numLevels){ + int level; + uint64_t memory_allocated=0; + box->numLevels=numLevels; + box->ghosts=ghosts; + box->low.i = subdomain_low_i; + box->low.j = subdomain_low_j; + box->low.k = subdomain_low_k; + box->dim.i = subdomain_dim_i; + box->dim.j = subdomain_dim_j; + box->dim.k = subdomain_dim_k; + posix_memalign((void**)&(box->levels),64,numLevels*sizeof(box_type)); + memory_allocated += numLevels*sizeof(box_type); + for(level=0;levellevels[level],numGridsActual,subdomain_low_i>>level,subdomain_low_j>>level,subdomain_low_k>>level, + subdomain_dim_i>>level,subdomain_dim_j>>level,subdomain_dim_k>>level,ghosts); + } + 
return(memory_allocated); +} + + +void destroy_subdomain(subdomain_type * box){ + int level; + for(level=0;levelnumLevels;level++){ + destroy_box(&box->levels[level]); + } + free(box->levels); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +int calculate_neighboring_subdomain_index(domain_type *domain, int bi, int bj, int bk, int di, int dj, int dk){ + int ni,nj,nk; + ni=(bi+domain->subdomains_per_rank_in.i+di)%domain->subdomains_per_rank_in.i; + nj=(bj+domain->subdomains_per_rank_in.j+dj)%domain->subdomains_per_rank_in.j; + nk=(bk+domain->subdomains_per_rank_in.k+dk)%domain->subdomains_per_rank_in.k; + int index = ni + nj*domain->subdomains_per_rank_in.i + nk*domain->subdomains_per_rank_in.i*domain->subdomains_per_rank_in.j; + return(index); +} + +int calculate_neighboring_subdomain_rank(domain_type * domain, int bi, int bj, int bk, int di, int dj, int dk, int ri, int rj, int rk){ + if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){ + if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) < 0 )return(-1); + if( ((domain->subdomains_per_rank_in.i*ri)+bi+di) >= domain->subdomains_in.i )return(-1); + } + if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){ + if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) < 0 )return(-1); + if( ((domain->subdomains_per_rank_in.j*rj)+bj+dj) >= domain->subdomains_in.j )return(-1); + } + if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){ + if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) < 0 )return(-1); + if( ((domain->subdomains_per_rank_in.k*rk)+bk+dk) >= domain->subdomains_in.k )return(-1); + } + // if didn't walk off domain, calculate new rank for periodic boundaries... + if((bi+di)<0)ri--;if((bi+di)>=domain->subdomains_per_rank_in.i)ri++;ri=(ri+domain->ranks_in.i)%domain->ranks_in.i; + if((bj+dj)<0)rj--;if((bj+dj)>=domain->subdomains_per_rank_in.j)rj++;rj=(rj+domain->ranks_in.j)%domain->ranks_in.j; + if((bk+dk)<0)rk--;if((bk+dk)>=domain->subdomains_per_rank_in.k)rk++;rk=(rk+domain->ranks_in.k)%domain->ranks_in.k; + int rank = ri + rj*domain->ranks_in.i + rk*domain->ranks_in.i*domain->ranks_in.j; + return(rank); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +int create_domain(domain_type * domain, + int subdomain_dim_i, int subdomain_dim_j, int subdomain_dim_k, + int subdomains_per_rank_in_i, int subdomains_per_rank_in_j, int subdomains_per_rank_in_k, + int ranks_in_i, int ranks_in_j, int ranks_in_k, + int rank, + int *boundary_conditions, + int numGrids, int ghosts, int numLevels + ){ + int level; + int i, j, k; + int di,dj,dk; + // FIX ghosts = array (varies with level) + domain->rank = rank; + if(domain->rank==0){printf("creating domain... 
");fflush(stdout);} + //FIX, limit ghosts based on coarsest problem size + if(ghosts>(subdomain_dim_i>>(numLevels-1))){if(domain->rank==0)printf("#ghosts(%d)>bottom grid size(%d)\n",ghosts,subdomain_dim_i>>(numLevels-1));exit(0);} + + if( (subdomain_dim_i!=subdomain_dim_j)||(subdomain_dim_j!=subdomain_dim_k)||(subdomain_dim_i!=subdomain_dim_k) ){ + if(domain->rank==0)printf("subdomain_dim's must be equal\n");exit(0); + } + uint64_t memory_allocated =0; + // processes are laid out in x, then y, then z + //int ranks = ranks_in_i*ranks_in_j*ranks_in_k; + int my_k_rank = (rank ) / (ranks_in_i*ranks_in_j); + int my_j_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank) ) / (ranks_in_i ); + int my_i_rank = (rank - (ranks_in_i*ranks_in_j*my_k_rank) - (ranks_in_i*my_j_rank) ); + //printf("%2d: (%2d,%2d,%2d)\n",domain->rank,my_k_rank,my_j_rank,my_i_rank); + + + domain->ranks_in.i = ranks_in_i; + domain->ranks_in.j = ranks_in_j; + domain->ranks_in.k = ranks_in_k; + domain->subdomains_per_rank_in.i = subdomains_per_rank_in_i; + domain->subdomains_per_rank_in.j = subdomains_per_rank_in_j; + domain->subdomains_per_rank_in.k = subdomains_per_rank_in_k; + domain->subdomains_in.i = subdomains_per_rank_in_i*ranks_in_i; + domain->subdomains_in.j = subdomains_per_rank_in_j*ranks_in_j; + domain->subdomains_in.k = subdomains_per_rank_in_k*ranks_in_k; + domain->subdomains_per_rank = subdomains_per_rank_in_i*subdomains_per_rank_in_j*subdomains_per_rank_in_k; + posix_memalign((void**)&(domain->subdomains),64,domain->subdomains_per_rank*sizeof(subdomain_type)); + memory_allocated+=domain->subdomains_per_rank*sizeof(subdomain_type); + + domain->dim.i = domain->subdomains_in.i * subdomain_dim_i; + domain->dim.j = domain->subdomains_in.j * subdomain_dim_j; + domain->dim.k = domain->subdomains_in.k * subdomain_dim_k; + domain->boundary_condition.i = boundary_conditions[0]; + domain->boundary_condition.j = boundary_conditions[1]; + domain->boundary_condition.k = boundary_conditions[2]; + domain->numLevels = numLevels; + domain->numGrids = numGrids; + domain->ghosts = ghosts; + for(level=0;levelnumLevels;level++){ + domain->h[level] = -1; + domain->dominant_eigenvalue_of_DinvA[level] = -1; + } + + // calculate the neighbors of this process + for(dk=-1;dk<=1;dk++){ + for(dj=-1;dj<=1;dj++){ + for(di=-1;di<=1;di++){ + int n = 13+di+3*dj+9*dk; + int neighbor_rank_in_i = (my_i_rank+di+ranks_in_i)%ranks_in_i; + int neighbor_rank_in_j = (my_j_rank+dj+ranks_in_j)%ranks_in_j; + int neighbor_rank_in_k = (my_k_rank+dk+ranks_in_k)%ranks_in_k; + domain->rank_of_neighbor[n] = neighbor_rank_in_i + ranks_in_i*neighbor_rank_in_j + ranks_in_i*ranks_in_j*neighbor_rank_in_k; + if( domain->boundary_condition.i != __BOUNDARY_PERIODIC){ + if( (my_i_rank+di) < 0 )domain->rank_of_neighbor[n] = -1; + if( (my_i_rank+di) >= ranks_in_i )domain->rank_of_neighbor[n] = -1; + } + if( domain->boundary_condition.j != __BOUNDARY_PERIODIC){ + if( (my_j_rank+dj) < 0 )domain->rank_of_neighbor[n] = -1; + if( (my_j_rank+dj) >= ranks_in_j )domain->rank_of_neighbor[n] = -1; + } + if( domain->boundary_condition.k != __BOUNDARY_PERIODIC){ + if( (my_k_rank+dk) < 0 )domain->rank_of_neighbor[n] = -1; + if( (my_k_rank+dk) >= ranks_in_k )domain->rank_of_neighbor[n] = -1; + } + }}} + + // subdomains within a process are laid out in i, then j, then k + for(k=0;ksubdomains[box],low_i,low_j,low_k,subdomain_dim_i,subdomain_dim_j,subdomain_dim_k,numGrids,ghosts,numLevels); + for(dk=-1;dk<=1;dk++){ + for(dj=-1;dj<=1;dj++){ + for(di=-1;di<=1;di++){ + int n = 13+di+3*dj+9*dk; + 
+          domain->subdomains[box].neighbors[n].rank        = calculate_neighboring_subdomain_rank(domain,i,j,k,di,dj,dk,my_i_rank,my_j_rank,my_k_rank);
+          domain->subdomains[box].neighbors[n].local_index = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk);
+          // FIX boundary conditions??
+          //printf("rank=%2d, box[%2d].neighbors[%3d]=(%3d,%3d)\n",domain->rank,box,n,domain->subdomains[box].neighbors[n].rank,domain->subdomains[box].neighbors[n].local_index);
+  }}}
+  }}}
+
+
+  int FacesEdgesCorners[27] = {4,10,12,14,16,22,  1,3,5,7,9,11,15,17,19,21,23,25,  0,2,6,8,18,20,24,26,  13};
+  int   faces[27] = {0,0,0,0,1,0,0,0,0,  0,1,0,1,0,1,0,1,0,  0,0,0,0,1,0,0,0,0};
+  int   edges[27] = {0,1,0,1,0,1,0,1,0,  1,0,1,0,0,0,1,0,1,  0,1,0,1,0,1,0,1,0};
+  int corners[27] = {1,0,1,0,0,0,1,0,1,  0,0,0,0,0,0,0,0,0,  1,0,1,0,0,0,1,0,1};
+
+  #ifdef __MPI
+  // allocate MPI send/recv buffers for the 26 neighbors....
+  for(dk=-1;dk<=1;dk++){
+  for(dj=-1;dj<=1;dj++){
+  for(di=-1;di<=1;di++){
+    int n = 13+di+3*dj+9*dk;if(n!=13){
+      int bufSize = 1;
+      if(di==0)bufSize*=domain->subdomains_per_rank_in.i*subdomain_dim_i;else bufSize*=ghosts;
+      if(dj==0)bufSize*=domain->subdomains_per_rank_in.j*subdomain_dim_j;else bufSize*=ghosts;
+      if(dk==0)bufSize*=domain->subdomains_per_rank_in.k*subdomain_dim_k;else bufSize*=ghosts;
+      posix_memalign((void**)&(domain->send_buffer[n]),64,bufSize*sizeof(double));
+      posix_memalign((void**)&(domain->recv_buffer[n]),64,bufSize*sizeof(double));
+      memset(domain->send_buffer[n],0,bufSize*sizeof(double));
+      memset(domain->recv_buffer[n],0,bufSize*sizeof(double));
+      memory_allocated+=bufSize*sizeof(double);
+      memory_allocated+=bufSize*sizeof(double);
+  }}}}
+  #endif
+
+
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // now enumerate a list of data transfers to effect a ghost zone exchange (boundary conditions are skipped)
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  for(level=0;level<domain->numLevels;level++){
+  int initialize;for(initialize=0;initialize<2;initialize++){ // initialize=0=count/malloc.  initialize=1=initialize
+  int buffer=0;
+  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+  // Traverse list of boxes, and create MPI pack for faces, edges, and corners
+  domain->bufferCopy_Pack_Start=buffer;
+  int nn;
+  #ifdef __MPI
+  for(nn=0;nn<26;nn++){
+    int n = FacesEdgesCorners[nn];
+    int di = ((n/1)%3) - 1;
+    int dj = ((n/3)%3) - 1;
+    int dk = ((n/9)%3) - 1;
+
+    // traverse all boxes in direction di/dj/dk... (e.g. 
all subdomains on the corresponding face) + int sd_i_lo,sd_i_hi; + int sd_j_lo,sd_j_hi; + int sd_k_lo,sd_k_hi; + switch(di){ + case -1:sd_i_lo= 0;sd_i_hi= 1;break; + case 0:sd_i_lo= 0;sd_i_hi=subdomains_per_rank_in_i;break; + case 1:sd_i_lo=subdomains_per_rank_in_i-1;sd_i_hi=subdomains_per_rank_in_i;break; + }; + switch(dj){ + case -1:sd_j_lo= 0;sd_j_hi= 1;break; + case 0:sd_j_lo= 0;sd_j_hi=subdomains_per_rank_in_j;break; + case 1:sd_j_lo=subdomains_per_rank_in_j-1;sd_j_hi=subdomains_per_rank_in_j;break; + }; + switch(dk){ + case -1:sd_k_lo= 0;sd_k_hi= 1;break; + case 0:sd_k_lo= 0;sd_k_hi=subdomains_per_rank_in_k;break; + case 1:sd_k_lo=subdomains_per_rank_in_k-1;sd_k_hi=subdomains_per_rank_in_k;break; + }; + for(k=sd_k_lo;krank)off_node_exchange=0; // same process + if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == -1)off_node_exchange=0; // domain boundary + if(off_node_exchange){ + int sendBox = i + j*subdomains_per_rank_in_i + k*subdomains_per_rank_in_i*subdomains_per_rank_in_j; + int grid_i,grid_j,grid_k; // ijk in the source grid (relative to first ghost zone element) + int dim_i, dim_j, dim_k; // dimensions of face/edge/corner + switch(di){ + case -1:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;break; + case 0:grid_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; break; + case 1:grid_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;break; + }; + switch(dj){ + case -1:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;break; + case 0:grid_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; break; + case 1:grid_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;break; + }; + switch(dk){ + case -1:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;break; + case 0:grid_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; break; + case 1:grid_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;break; + }; + int buf_i = dim_i*(i-sd_i_lo); // ijk in the target buffer + int buf_j = dim_j*(j-sd_j_lo); + int buf_k = dim_k*(k-sd_k_lo); + int buf_pencil = dim_i*(sd_i_hi-sd_i_lo); + int buf_plane = dim_j*(sd_j_hi-sd_j_lo)*buf_pencil; + + // FIX, could be broken into sub buffers for more parallelism (i.e. 
pencils) + if(initialize==1){ + domain->bufferCopies[level][buffer].dim.i = dim_i; + domain->bufferCopies[level][buffer].dim.j = dim_j; + domain->bufferCopies[level][buffer].dim.k = dim_k; + domain->bufferCopies[level][buffer].read.box = sendBox; + domain->bufferCopies[level][buffer].read.ptr = NULL; + domain->bufferCopies[level][buffer].read.i = grid_i; + domain->bufferCopies[level][buffer].read.j = grid_j; + domain->bufferCopies[level][buffer].read.k = grid_k; + domain->bufferCopies[level][buffer].read.pencil = domain->subdomains[sendBox].levels[level].pencil; + domain->bufferCopies[level][buffer].read.plane = domain->subdomains[sendBox].levels[level].plane; + domain->bufferCopies[level][buffer].write.box = -1; + domain->bufferCopies[level][buffer].write.ptr = domain->send_buffer[n]; + domain->bufferCopies[level][buffer].write.i = buf_i; + domain->bufferCopies[level][buffer].write.j = buf_j; + domain->bufferCopies[level][buffer].write.k = buf_k; + domain->bufferCopies[level][buffer].write.pencil = buf_pencil; + domain->bufferCopies[level][buffer].write.plane = buf_plane; + domain->bufferCopies[level][buffer].isFace = faces[n]; + domain->bufferCopies[level][buffer].isEdge = edges[n]; + domain->bufferCopies[level][buffer].isCorner = corners[n]; + } + //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in MPI[%2d,%2d,%2d] (%3d,%4d)\n",level,buffer,dim_i,dim_j,dim_k,grid_i,grid_j,grid_k,sendBox,buf_i,buf_j,buf_k,di,dj,dk,buf_pencil,buf_plane); + buffer++; + } // off-process neighbor + }}} // subdomains + } // directions + #endif + domain->bufferCopy_Pack_End=buffer; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Traverse list of boxes, and create local exchange/domain copies for faces, edges, and corners + domain->bufferCopy_Local_Start=buffer; + for(nn=0;nn<26;nn++){ + int n = FacesEdgesCorners[nn]; + int di = ((n/1)%3) - 1; + int dj = ((n/3)%3) - 1; + int dk = ((n/9)%3) - 1; + for(k=0;krank)on_node_exchange=0; // if == -1, then domain boundary and thus not on_node + if(on_node_exchange){ + int recvBox = calculate_neighboring_subdomain_index(domain,i,j,k,di,dj,dk); + int send_i,send_j,send_k; // ijk in the source grid (relative to first ghost zone element) + int recv_i,recv_j,recv_k; // ijk in the destination grid (relative to first ghost zone element) + int dim_i, dim_j, dim_k; // dimensions of face/edge/corner + switch(di){ + case -1:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break; + case 0:send_i=domain->subdomains[sendBox].levels[level].ghosts;dim_i=domain->subdomains[sendBox].levels[level].dim.i; recv_i=domain->subdomains[recvBox].levels[level].ghosts;break; + case 1:send_i=domain->subdomains[sendBox].levels[level].dim.i; dim_i=domain->subdomains[sendBox].levels[level].ghosts;recv_i=0;break; + }; + switch(dj){ + case -1:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break; + case 0:send_j=domain->subdomains[sendBox].levels[level].ghosts;dim_j=domain->subdomains[sendBox].levels[level].dim.j; recv_j=domain->subdomains[recvBox].levels[level].ghosts;break; + case 
1:send_j=domain->subdomains[sendBox].levels[level].dim.j; dim_j=domain->subdomains[sendBox].levels[level].ghosts;recv_j=0;break; + }; + switch(dk){ + case -1:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break; + case 0:send_k=domain->subdomains[sendBox].levels[level].ghosts;dim_k=domain->subdomains[sendBox].levels[level].dim.k; recv_k=domain->subdomains[recvBox].levels[level].ghosts;break; + case 1:send_k=domain->subdomains[sendBox].levels[level].dim.k; dim_k=domain->subdomains[sendBox].levels[level].ghosts;recv_k=0;break; + }; + // FIX, could be broken into sub buffers for more parallelism (i.e. pencils) + if(initialize==1){ + domain->bufferCopies[level][buffer].dim.i = dim_i; + domain->bufferCopies[level][buffer].dim.j = dim_j; + domain->bufferCopies[level][buffer].dim.k = dim_k; + domain->bufferCopies[level][buffer].read.box = sendBox; + domain->bufferCopies[level][buffer].read.ptr = NULL; + domain->bufferCopies[level][buffer].read.i = send_i; + domain->bufferCopies[level][buffer].read.j = send_j; + domain->bufferCopies[level][buffer].read.k = send_k; + domain->bufferCopies[level][buffer].read.pencil = domain->subdomains[sendBox].levels[level].pencil; + domain->bufferCopies[level][buffer].read.plane = domain->subdomains[sendBox].levels[level].plane; + domain->bufferCopies[level][buffer].write.box = recvBox; + domain->bufferCopies[level][buffer].write.ptr = NULL; + domain->bufferCopies[level][buffer].write.i = recv_i; + domain->bufferCopies[level][buffer].write.j = recv_j; + domain->bufferCopies[level][buffer].write.k = recv_k; + domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil; + domain->bufferCopies[level][buffer].write.plane = domain->subdomains[recvBox].levels[level].plane; + domain->bufferCopies[level][buffer].isFace = faces[n]; + domain->bufferCopies[level][buffer].isEdge = edges[n]; + domain->bufferCopies[level][buffer].isCorner = corners[n]; + } + //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from box=%3d to (%3d,%3d,%3d) in box=%3d\n",level,buffer,dim_i,dim_j,dim_k,send_i,send_j,send_k,sendBox,recv_i,recv_j,recv_k,recvBox); + buffer++; + } + }}} // all local subdomains + } // directions + domain->bufferCopy_Local_End=buffer; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Traverse list of boxes and their neighbors and gather data from buffers... 
+ // faces come from face buffers + // edges can come from either edge buffers or face buffers + // corners can come from either corner buffers, edge buffers or face buffers + domain->bufferCopy_Unpack_Start=buffer; + #ifdef __MPI + for(nn=0;nn<26;nn++){ + int n = FacesEdgesCorners[nn]; + int di = ((n/1)%3) - 1; + int dj = ((n/3)%3) - 1; + int dk = ((n/9)%3) - 1; + for(k=0;krank)off_node_exchange=0; // same node + if(calculate_neighboring_subdomain_rank(domain,i,j,k, di,dj,dk, my_i_rank,my_j_rank,my_k_rank) == -1)off_node_exchange=0; // domain boundary + if(off_node_exchange){ + int _di=0; + int _dj=0; + int _dk=0; + if((i+di)<0){_di=-1;}else if((i+di)>=subdomains_per_rank_in_i){_di=1;} + if((j+dj)<0){_dj=-1;}else if((j+dj)>=subdomains_per_rank_in_j){_dj=1;} + if((k+dk)<0){_dk=-1;}else if((k+dk)>=subdomains_per_rank_in_k){_dk=1;} + int _n = 13 + _di + 3*_dj + 9*_dk; + int dim_i, dim_j, dim_k; // dimensions of face/edge/corner + int grid_i,grid_j,grid_k; // ijk in the destination grid (relative to first ghost zone element) + int buf_i, buf_j, buf_k; // ijk in the source buffer + int buf_plane =1; + int buf_pencil=1; + switch(di){ + case -1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=0; break; + case 0:dim_i=domain->subdomains[recvBox].levels[level].dim.i; grid_i=domain->subdomains[recvBox].levels[level].ghosts; break; + case 1:dim_i=domain->subdomains[recvBox].levels[level].ghosts;grid_i=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.i;break; + }; + switch(dj){ + case -1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=0; break; + case 0:dim_j=domain->subdomains[recvBox].levels[level].dim.j; grid_j=domain->subdomains[recvBox].levels[level].ghosts; break; + case 1:dim_j=domain->subdomains[recvBox].levels[level].ghosts;grid_j=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.j;break; + }; + switch(dk){ + case -1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=0; break; + case 0:dim_k=domain->subdomains[recvBox].levels[level].dim.k; grid_k=domain->subdomains[recvBox].levels[level].ghosts; break; + case 1:dim_k=domain->subdomains[recvBox].levels[level].ghosts;grid_k=domain->subdomains[recvBox].levels[level].ghosts+domain->subdomains[recvBox].levels[level].dim.k;break; + }; + switch(_di){ + case -1:buf_i=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;buf_pencil= domain->subdomains[recvBox].levels[level].ghosts;break; + case 0:buf_i=i*domain->subdomains[recvBox].levels[level].dim.i;buf_plane*=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; buf_pencil=subdomains_per_rank_in_i*domain->subdomains[recvBox].levels[level].dim.i; break; + case 1:buf_i=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;buf_pencil= domain->subdomains[recvBox].levels[level].ghosts;break; + }; + switch(_dj){ + case -1:buf_j=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;break; + case 0:buf_j=j*domain->subdomains[recvBox].levels[level].dim.j;buf_plane*=subdomains_per_rank_in_j*domain->subdomains[recvBox].levels[level].dim.j; break; + case 1:buf_j=0; buf_plane*= domain->subdomains[recvBox].levels[level].ghosts;break; + }; + switch(_dk){ + case -1:buf_k=0; break; + case 0:buf_k=k*domain->subdomains[recvBox].levels[level].dim.k;break; + case 1:buf_k=0; break; + }; + if( (i+di>=0) && (i+disubdomains[recvBox].levels[level].ghosts;else if(di>0)buf_i+=domain->subdomains[recvBox].levels[level].dim.i;}// forward by dim.i or 
back by ghosts + if( (j+dj>=0) && (j+djsubdomains[recvBox].levels[level].ghosts;else if(dj>0)buf_j+=domain->subdomains[recvBox].levels[level].dim.j;}// forward by dim.j or back by ghosts + if( (k+dk>=0) && (k+dksubdomains[recvBox].levels[level].ghosts;else if(dk>0)buf_k+=domain->subdomains[recvBox].levels[level].dim.k;}// forward by dim.k or back by ghosts + // FIX, could be broken into sub buffers for more parallelism (i.e. pencils) + if(initialize==1){ + domain->bufferCopies[level][buffer].dim.i = dim_i; + domain->bufferCopies[level][buffer].dim.j = dim_j; + domain->bufferCopies[level][buffer].dim.k = dim_k; + domain->bufferCopies[level][buffer].read.box = -1; + domain->bufferCopies[level][buffer].read.ptr = domain->recv_buffer[_n]; + domain->bufferCopies[level][buffer].read.i = buf_i; + domain->bufferCopies[level][buffer].read.j = buf_j; + domain->bufferCopies[level][buffer].read.k = buf_k; + domain->bufferCopies[level][buffer].read.pencil = buf_pencil; + domain->bufferCopies[level][buffer].read.plane = buf_plane; + domain->bufferCopies[level][buffer].write.box = recvBox; + domain->bufferCopies[level][buffer].write.ptr = NULL; + domain->bufferCopies[level][buffer].write.i = grid_i; + domain->bufferCopies[level][buffer].write.j = grid_j; + domain->bufferCopies[level][buffer].write.k = grid_k; + domain->bufferCopies[level][buffer].write.pencil = domain->subdomains[recvBox].levels[level].pencil; + domain->bufferCopies[level][buffer].write.plane = domain->subdomains[recvBox].levels[level].plane; + domain->bufferCopies[level][buffer].isFace = faces[n]; + domain->bufferCopies[level][buffer].isEdge = edges[n]; + domain->bufferCopies[level][buffer].isCorner = corners[n]; + } + //else if(domain->rank==0)printf("level=%d, buffer=%3d, copy %3d x %3d x %3d at (%3d,%3d,%3d) from MPI[%2d,%2d,%2d] (%3d,%4d) to box=%3d at (%3d,%3d,%3d)\n",level,buffer,dim_i,dim_j,dim_k,buf_i,buf_j,buf_k,_di,_dj,_dk,buf_pencil,buf_plane,recvBox,grid_i,grid_j,grid_k); + buffer++; + } + }}} + } + #endif + domain->bufferCopy_Unpack_End=buffer; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // At the end of initialize 0, malloc the buffers + if(initialize==0){ + domain->bufferCopies[level] = (bufferCopy_type*)malloc(buffer*sizeof(bufferCopy_type)); + memory_allocated+=buffer*sizeof(bufferCopy_type); + } + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + }} + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + if(domain->rank==0){ + printf("done\n");fflush(stdout); + printf(" %d x %d x %d (per subdomain)\n",subdomain_dim_i,subdomain_dim_j,subdomain_dim_k); + printf(" %d x %d x %d (per process)\n",subdomains_per_rank_in_i*subdomain_dim_i,subdomains_per_rank_in_j*subdomain_dim_j,subdomains_per_rank_in_k*subdomain_dim_k); + printf(" %d x %d x %d (overall)\n",ranks_in_i*subdomains_per_rank_in_i*subdomain_dim_i,ranks_in_j*subdomains_per_rank_in_j*subdomain_dim_j,ranks_in_k*subdomains_per_rank_in_k*subdomain_dim_k); + printf(" %d-deep ghost zones\n",ghosts); + printf(" allocated %llu MB\n",memory_allocated>>20); + fflush(stdout); + } + return(memory_allocated); +} + + +void destroy_domain(domain_type * domain){ + if(domain->rank==0){printf("deallocating domain... 
");fflush(stdout);} + int box;for(box=0;boxsubdomains_per_rank;box++){ + destroy_subdomain(&domain->subdomains[box]); + } + free(domain->subdomains); + // FIX, free buffer_copies, etc... + if(domain->rank==0){printf("done\n");fflush(stdout);} +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void print_timing(domain_type *domain){ + #ifdef TIMING + int level,numLevels = domain->numLevels; + uint64_t _timeStart=CycleTime();sleep(1);uint64_t _timeEnd=CycleTime(); + double SecondsPerCycle = (double)1.0/(double)(_timeEnd-_timeStart); + SecondsPerCycle = SecondsPerCycle/(double)domain->MGSolves_performed; // prints average performance per MGSolve + + if(domain->rank!=0)return; + + uint64_t total; + printf(" ");for(level=0;level<(numLevels );level++){printf("%12d ",level);}printf("\n"); + printf(" ");for(level=0;level<(numLevels );level++){printf("%10d^3 ",domain->subdomains[0].levels[0].dim.i>>level);}printf(" total\n"); + total=0;printf("smooth ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.smooth[level]);total+= domain->cycles.smooth[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("residual ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.residual[level]);total+= domain->cycles.residual[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("restriction ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.restriction[level]);total+= domain->cycles.restriction[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("interpolation ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.interpolation[level]);total+=domain->cycles.interpolation[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("applyOp ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.apply_op[level]);total+= domain->cycles.apply_op[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("BLAS1 ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.blas1[level]);total+= domain->cycles.blas1[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("BLAS3 ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.blas3[level]);total+= domain->cycles.blas3[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf("communication ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double)domain->cycles.communication[level]);total+=domain->cycles.communication[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" local exchange ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.grid2grid[level]);total+= domain->cycles.grid2grid[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + #ifdef __MPI + total=0;printf(" pack MPI buffers ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.pack[level]);total+= domain->cycles.pack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" unpack MPI buffers ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.unpack[level]);total+= 
domain->cycles.unpack[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_Isend ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.send[level]);total+= domain->cycles.send[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_Irecv ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.recv[level]);total+= domain->cycles.recv[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_Waitall ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.wait[level]);total+= domain->cycles.wait[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + total=0;printf(" MPI_collectives ");for(level=0;level<(numLevels );level++){printf("%12.6f ",SecondsPerCycle*(double) domain->cycles.collectives[level]);total+= domain->cycles.collectives[level];}printf("%12.6f\n",SecondsPerCycle*(double)total); + #endif + total=0;printf("-------------- ");for(level=0;level<(numLevels+1);level++){printf("------------ ");}printf("\n"); + total=0;printf("Total by level ");for(level=0;level<(numLevels );level++){printf("%12.6f " ,SecondsPerCycle*(double)domain->cycles.Total[level]);total+=domain->cycles.Total[level];} + printf("%12.6f\n",SecondsPerCycle*(double)total); + printf("\n"); + printf( " Total time in MGBuild %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGBuild); + printf( " Total time in MGSolve %12.6f\n",SecondsPerCycle*(double)domain->cycles.MGSolve); + printf(" \" v-cycles %12.6f\n",SecondsPerCycle*(double)domain->cycles.vcycles); + printf( " number of v-cycles %12d\n" ,domain->vcycles_performed/domain->MGSolves_performed); + printf( "Bottom solver iterations %12d\n" ,domain->Krylov_iterations/domain->MGSolves_performed); + #if defined(__USE_CABICGSTAB) || defined(__USE_CACG) + printf( " formations of G[][] %12d\n" ,domain->CAKrylov_formations_of_G/domain->MGSolves_performed); + #endif + printf("\n\n");fflush(stdout); + #endif +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGResetTimers(domain_type * domain){ + int level; + for(level=0;level<10;level++){ + domain->cycles.smooth[level] = 0; + domain->cycles.apply_op[level] = 0; + domain->cycles.residual[level] = 0; + domain->cycles.communication[level] = 0; + domain->cycles.restriction[level] = 0; + domain->cycles.interpolation[level] = 0; + domain->cycles.blas1[level] = 0; + domain->cycles.blas3[level] = 0; + domain->cycles.pack[level] = 0; + domain->cycles.unpack[level] = 0; + domain->cycles.grid2grid[level] = 0; + domain->cycles.recv[level] = 0; + domain->cycles.send[level] = 0; + domain->cycles.wait[level] = 0; + domain->cycles.collectives[level] = 0; + domain->cycles.Total[level] = 0; + } + domain->cycles.vcycles = 0; + domain->cycles.MGSolve = 0; + domain->vcycles_performed = 0; + domain->MGSolves_performed = 0; + domain->Krylov_iterations = 0; + domain->CAKrylov_formations_of_G = 0; +} +//------------------------------------------------------------------------------------------------------------------------------ +void MGBuild(domain_type * domain, double a, double b, double h0){ + int box,level,numLevels = domain->numLevels; + + if(domain->rank==0){printf("MGBuild... ");fflush(stdout);} + + // initialize timers... 
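+  // (MGResetTimers, defined above, clears the per-level cycle counters and the
+  //  solve/iteration tallies so MGBuild and the subsequent MGSolve calls are
+  //  timed from a clean slate)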
+ MGResetTimers(domain); + domain->cycles.MGBuild = 0; + + uint64_t _timeStartMGBuild = CycleTime(); + + + // calculate hLevel for all levels/boxes + for(level=0;levelh[level] = hLevel; + for(box=0;boxsubdomains_per_rank;box++){ + domain->subdomains[box].levels[level].h = hLevel; + }} + + // alias all red/black masks to box0's - performance tweak to avoid having a single global list of RedBlack masks as a function of box size + //for(level=0;levelsubdomains_per_rank;box++){ + domain->subdomains[box].levels[level].RedBlack_64bMask = domain->subdomains[0].levels[level].RedBlack_64bMask; + domain->subdomains[box].levels[level].RedBlack_FP[0] = domain->subdomains[0].levels[level].RedBlack_FP[0]; + domain->subdomains[box].levels[level].RedBlack_FP[1] = domain->subdomains[0].levels[level].RedBlack_FP[1]; + }} + + + // form all restrictions of alpha[] for all boxes... + for(level=0;levelrank==0){printf("\n");fflush(stdout);} + for(level=0;levelrank==0){printf(" level=%2d, eigenvalue_max ~= %13.3f\n",level,eigenvalue_max);fflush(stdout);} + domain->dominant_eigenvalue_of_DinvA[level] = eigenvalue_max; + } + // find eigenvalue_min using the power method... + if(domain->rank==0){printf("\n");fflush(stdout);} + for(level=0;leveldominant_eigenvalue_of_DinvA[level],bk); // Abk = (A-ev_max I)bk = Abk - ev_max*bk + eigenvalue_min = dot(domain,level,bk,Abk)/dot(domain,level,bk,bk); + double den = sqrt(dot(domain,level,Abk,Abk)); // normalize bk+1 by the magnitude of Abk + scale_grid(domain,level,bk,1.0/den,Abk); // i.e. bk+1 = Abk / ||Abk|| + } + eigenvalue_min += domain->dominant_eigenvalue_of_DinvA[level]; + if(domain->rank==0){printf(" level=%2d, eigenvalue_min ~= %13.3f\n",level,eigenvalue_min);fflush(stdout);} + } +*/ + + + uint64_t delta = (CycleTime()-_timeStartMGBuild); + domain->cycles.MGBuild += delta; + + if(domain->rank==0){printf("done\n");fflush(stdout);} +} + + + + +//------------------------------------------------------------------------------------------------------------------------------ +void MGSolve(domain_type * domain, int u_id, int F_id, double a, double b, double desired_mg_norm){ + domain->MGSolves_performed++; + int level; + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + int e_id = __u; + int R_id = __f_minus_Av; + int box,v; + int numLevels = domain->numLevels; + #ifdef __TEST_MG_CONVERGENCE + int maxVCycles = 20; + #else + int maxVCycles = 10; + #endif + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(domain->rank==0){printf("MGSolve... ");fflush(stdout);} + uint64_t _timeStartMGSolve = CycleTime(); + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // make initial guess for e (=0) and setup the RHS + #if 1 + zero_grid(domain,0,e_id); // ee = 0 + scale_grid(domain,0,R_id,1.0,F_id); // R_id = F_id + #else + mul_grids(domain,0,e_id,1.0,F_id,__lambda); // e_id = F_id*lambda + scale_grid(domain,0,R_id,1.0,F_id); // R_id = F_id + #endif + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // now do v-cycles to calculate the correction... + uint64_t _timeStartVCycle = CycleTime(); + for(v=0;vvcycles_performed++; + + // down the v-cycle... 
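+    // (each down-sweep step below smooths the current level, forms the residual,
+    //  restricts it to level+1, and zeroes the coarse-level correction before
+    //  descending; with __FUSION_RESIDUAL_RESTRICTION the residual and restriction
+    //  are fused into a single pass)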
+ for(level=0;level<(domain->numLevels-1);level++){ + uint64_t _LevelStart = CycleTime(); + #ifdef __FUSION_RESIDUAL_RESTRICTION + smooth(domain,level,e_id,R_id,a,b); + residual_and_restriction(domain,level,e_id,R_id,level+1,R_id,a,b); + zero_grid(domain,level+1,e_id); + #else + smooth(domain,level,e_id,R_id,a,b); + residual(domain,level,__temp,e_id,R_id,a,b); + restriction(domain,level,R_id,__temp); + zero_grid(domain,level+1,e_id); + #endif + domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart); + } // down + + + // bottom solve... + uint64_t _timeBottomStart = CycleTime(); + level = domain->numLevels-1; + IterativeSolver(domain,level,e_id,R_id,a,b,__DEFAULT_BOTTOM_NORM); + domain->cycles.Total[level] += (uint64_t)(CycleTime()-_timeBottomStart); + + + // back up the v-cycle... + for(level=(domain->numLevels-2);level>=0;level--){ + uint64_t _LevelStart = CycleTime(); + #ifdef __LINEAR_INTERPOLATION + interpolation_linear(domain,level,1.0,e_id,e_id); + #else + interpolation_constant(domain,level,1.0,e_id,e_id); + #endif + smooth(domain,level,e_id,R_id,a,b); + domain->cycles.Total[level] += (uint64_t)(CycleTime()-_LevelStart); + } // up + + + // now calculate the norm of the residual... + #if defined(__PRINT_NORM) || defined(__TEST_MG_CONVERGENCE) + uint64_t _timeStart = CycleTime(); + level = 0; + residual(domain,level,__temp,e_id,F_id,a,b); + #if 1 + #warning Using ||D^{-1}(b-Ax)||_{inf} as convergence criteria... + mul_grids(domain,level,__temp,1.0,__temp,__lambda); /// <<< precondition by D^{-1} ??? + #endif + double norm_of_residual = norm(domain,level,__temp); + uint64_t _timeNorm = CycleTime(); + domain->cycles.Total[0] += (uint64_t)(_timeNorm-_timeStart); + #if defined(__PRINT_NORM) + if(domain->rank==0){ + if(v==0)printf("\n"); + printf("v-cycle=%2d, norm=%22.20f (%12.6e)\n",v+1,norm_of_residual,norm_of_residual); + } + #endif + #if defined(__TEST_MG_CONVERGENCE) + if(norm_of_residualcycles.vcycles += (uint64_t)(CycleTime()-_timeStartVCycle); + domain->cycles.MGSolve += (uint64_t)(CycleTime()-_timeStartMGSolve); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + if(domain->rank==0){printf("done\n");fflush(stdout);} +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.c @@ -0,0 +1,181 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef OMP +#include +#endif +#ifdef __MPI +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "defines.h" +#include "box.h" +#include "mg.h" +#include "operators.h" +//------------------------------------------------------------------------------------------------------------------------------ +int main(int argc, char **argv){ + int MPI_Rank=0; + int MPI_Tasks=1; + int OMP_Threads = 1; + 
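+  // (when built with OMP, the block below queries the actual OpenMP team size
+  //  from the master thread; otherwise OMP_Threads stays at 1)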
+#ifdef OMP + #pragma omp parallel + { + #pragma omp master + { + OMP_Threads = omp_get_num_threads(); + } + } +#endif + + #ifdef __MPI + #warning Compiling for MPI... + int MPI_threadingModel = -1; +//int MPI_threadingModelRequested = MPI_THREAD_SINGLE; +//int MPI_threadingModelRequested = MPI_THREAD_SERIALIZED; + int MPI_threadingModelRequested = MPI_THREAD_FUNNELED; +//int MPI_threadingModelRequested = MPI_THREAD_MULTIPLE; + #ifdef __MPI_THREAD_MULTIPLE + MPI_threadingModelRequested = MPI_THREAD_MULTIPLE; + #endif + MPI_Init_thread(&argc, &argv, MPI_threadingModelRequested, &MPI_threadingModel); + MPI_Comm_size(MPI_COMM_WORLD, &MPI_Tasks); + MPI_Comm_rank(MPI_COMM_WORLD, &MPI_Rank); + + if(MPI_threadingModel>MPI_threadingModelRequested)MPI_threadingModel=MPI_threadingModelRequested; + if(MPI_Rank==0){ + if(MPI_threadingModelRequested == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); + else if(MPI_threadingModelRequested == MPI_THREAD_SINGLE )printf("Requested MPI_THREAD_SINGLE, "); + else if(MPI_threadingModelRequested == MPI_THREAD_FUNNELED )printf("Requested MPI_THREAD_FUNNELED, "); + else if(MPI_threadingModelRequested == MPI_THREAD_SERIALIZED)printf("Requested MPI_THREAD_SERIALIZED, "); + else if(MPI_threadingModelRequested == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); + else printf("Requested Unknown MPI Threading Model (%d), ",MPI_threadingModelRequested); + if(MPI_threadingModel == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); + else if(MPI_threadingModel == MPI_THREAD_SINGLE )printf("got MPI_THREAD_SINGLE\n"); + else if(MPI_threadingModel == MPI_THREAD_FUNNELED )printf("got MPI_THREAD_FUNNELED\n"); + else if(MPI_threadingModel == MPI_THREAD_SERIALIZED)printf("got MPI_THREAD_SERIALIZED\n"); + else if(MPI_threadingModel == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); + else printf("got Unknown MPI Threading Model (%d)\n",MPI_threadingModel); + fflush(stdout); } + #ifdef __MPI_THREAD_MULTIPLE + if( (MPI_threadingModelRequested == MPI_THREAD_MULTIPLE) && (MPI_threadingModel != MPI_THREAD_MULTIPLE) ){MPI_Finalize();exit(0);} + #endif + #endif + + + int log2_subdomain_dim = 6; + int subdomains_per_rank_in_i=256 / (1<7){ + if(MPI_Rank==0){printf("error, log2_subdomain_dim(%d)>7\n",log2_subdomain_dim);} + #ifdef __MPI + MPI_Finalize(); + #endif + exit(0); + } +*/ + + if(ranks_in_i*ranks_in_j*ranks_in_k != MPI_Tasks){ + if(MPI_Rank==0){printf("error, ranks_in_i*ranks_in_j*ranks_in_k(%d*%d*%d=%d) != MPI_Tasks(%d)\n",ranks_in_i,ranks_in_j,ranks_in_k,ranks_in_i*ranks_in_j*ranks_in_k,MPI_Tasks);} + #ifdef __MPI + MPI_Finalize(); + #endif + exit(0); + } + + if(MPI_Rank==0)printf("%d MPI Tasks of %d threads\n",MPI_Tasks,OMP_Threads); + + int subdomain_dim_i=1< +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void zero_grid(domain_type *domain, int level, int grid_id){ + // zero's the entire grid INCLUDING ghost zones... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; + +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=-ghosts;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + + +//------------------------------------------------------------------------------------------------------------------------------ +void initialize_grid_to_scalar(domain_type *domain, int level, int grid_id, double scalar){ + // initializes the grid to a scalar while zero'ing the ghost zones... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid = domain->subdomains[box].levels[level].grids[grid_id] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=-ghosts;k=dim_i) || (j>=dim_j) || (k>=dim_k); + grid[ijk] = ghostZone ? 0.0 : scalar; + }}} + } +#endif + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void add_grids(domain_type *domain, int level, int id_c, double scale_a, int id_a, double scale_b, int id_b){ // c=scale_a*id_a + scale_b*id_b + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void mul_grids(domain_type *domain, int level, int id_c, double scale, int id_a, int id_b){ // id_c=scale*id_a*id_b + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + + +//------------------------------------------------------------------------------------------------------------------------------ +void scale_grid(domain_type *domain, int level, int id_c, double scale_a, int id_a){ // c[]=scale_a*a[] + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +double dot(domain_type *domain, int level, int id_a, int id_b){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; + double a_dot_b_domain = 0.0; +#ifdef OMP + // FIX, schedule(static) is a stand in to guarantee reproducibility... + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:a_dot_b_domain) schedule(static) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. 
[0] = first non ghost zone point + double * __restrict__ grid_b = domain->subdomains[box].levels[level].grids[id_b] + ghosts*(1+pencil+plane); + double a_dot_b_box = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:a_dot_b_box) schedule(static) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = a_dot_b_domain; + MPI_Allreduce(&send,&a_dot_b_domain,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + return(a_dot_b_domain); +} + +//------------------------------------------------------------------------------------------------------------------------------ +double norm(domain_type *domain, int level, int grid_id){ // implements the max norm + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; + double max_norm = 0.0; +#ifdef OMP + // FIX, schedule(static) is a stand in to guarantee reproducibility... + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(max:max_norm) schedule(static) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid = domain->subdomains[box].levels[level].grids[ grid_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double box_norm = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(max:box_norm) schedule(static) + for(k=0;kbox_norm){box_norm=fabs_grid_ijk;} // max norm + }}} + if(box_norm>max_norm){max_norm = box_norm;} + } // box list +#endif + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = max_norm; + MPI_Allreduce(&send,&max_norm,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + return(max_norm); +} + + +//------------------------------------------------------------------------------------------------------------------------------ +double mean(domain_type *domain, int level, int id_a){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; + double sum_domain = 0.0; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) reduction(+:sum_domain) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double sum_box = 0.0; + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) reduction(+:sum_box) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); + double ncells_domain = (double)domain->dim.i*(double)domain->dim.j*(double)domain->dim.k; + + #ifdef __MPI + uint64_t _timeStartAllReduce = CycleTime(); + double send = sum_domain; + MPI_Allreduce(&send,&sum_domain,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); + uint64_t _timeEndAllReduce = CycleTime(); + domain->cycles.collectives[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + domain->cycles.communication[level] += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); + #endif + + double mean_domain = sum_domain / ncells_domain; + return(mean_domain); +} + + +void shift_grid(domain_type *domain, int level, int id_c, int id_a, double shift_a){ + uint64_t _timeStart = CycleTime(); + + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_c = domain->subdomains[box].levels[level].grids[id_c] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ grid_a = domain->subdomains[box].levels[level].grids[id_a] + ghosts*(1+pencil+plane); // i.e. 
[0] = first non ghost zone point + + #pragma omp parallel for private(i,j,k) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} + +//------------------------------------------------------------------------------------------------------------------------------ +void project_cell_to_face(domain_type *domain, int level, int id_cell, int id_face, int dir){ + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double * __restrict__ grid_cell = domain->subdomains[box].levels[level].grids[id_cell] + ghosts*(1+pencil+plane); + double * __restrict__ grid_face = domain->subdomains[box].levels[level].grids[id_face] + ghosts*(1+pencil+plane); + int stride; + switch(dir){ + case 0: stride = 1;break;//i-direction + case 1: stride = pencil;break;//j-direction + case 2: stride = plane;break;//k-direction + } + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;k<=dim_k;k++){ // <= to ensure you do low and high faces + for(j=0;j<=dim_j;j++){ + for(i=0;i<=dim_i;i++){ + int ijk = i + j*pencil + k*plane; + grid_face[ijk] = 0.5*(grid_cell[ijk-stride] + grid_cell[ijk]); // simple linear interpolation + }}} + } +#endif + domain->cycles.blas1[level] += (uint64_t)(CycleTime()-_timeStart); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.h @@ -0,0 +1,36 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ + void apply_op(domain_type *domain, int level, int Ax_id, int x_id, double a, double b); + void residual(domain_type *domain, int level, int res_id, int phi_id, int rhs_id, double a, double b); + void smooth(domain_type *domain, int level, int phi_id, int rhs_id, double a, double b); + void rebuild_lambda(domain_type *domain, int level, double a, double b); +//------------------------------------------------------------------------------------------------------------------------------ + void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b); + void restriction(domain_type *domain, int level_f, int id_c, int id_f); + void restriction_betas(domain_type *domain, int 
level_f, int level_c); + void interpolation_constant(domain_type *domain, int level_f, double prescale_f, int id_f, int id_c); + void interpolation_linear(domain_type *domain, int level_f, double prescale_f, int id_f, int id_c); +//------------------------------------------------------------------------------------------------------------------------------ + void exchange_boundary(domain_type *domain, int level, int grid_id, int exchange_faces, int exchange_edges, int exchange_corners); +//------------------------------------------------------------------------------------------------------------------------------ +double dot(domain_type *domain, int level, int id_a, int id_b); +double norm(domain_type *domain, int level, int grid_id); +double mean(domain_type *domain, int level, int id_a); + void add_grids(domain_type *domain, int level, int id_c, double scale_a, int id_a, double scale_b, int id_b); + void scale_grid(domain_type *domain, int level, int id_c, double scale_a, int id_a); + void zero_grid(domain_type *domain, int level, int grid_id); +// void shift_grid(domain_type *domain, int level, int id_c, int id_a, double shift_a); + void mul_grids(domain_type *domain, int level, int id_c, double scale, int id_a, int id_b); +// void initialize_grid_to_scalar(domain_type *domain, int level, int grid_id, double scalar); +//------------------------------------------------------------------------------------------------------------------------------ + void project_cell_to_face(domain_type *domain, int level, int id_cell, int id_face, int dir); +//------------------------------------------------------------------------------------------------------------------------------ + void matmul_grids(domain_type *domain, int level, double *C, int *id_A, int *id_B, int rows, int cols, int A_equals_B_transpose); +//------------------------------------------------------------------------------------------------------------------------------ + void initialize_problem(domain_type *domain, int level, double hLevel, double a, double b); +//------------------------------------------------------------------------------------------------------------------------------ +// void __box_smooth_GSRB_multiple(box_type *box, int phi_id, int rhs_id, double a, double b, int sweep); +// void __box_smooth_GSRB_multiple_threaded(box_type *box, int phi_id, int rhs_id, double a, double b, int sweep); Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.ompif.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.ompif.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/operators.ompif.c @@ -0,0 +1,31 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include + +//------------------------------------------------------------------------------------------------------------------------------ +#include "timer.h" +#include "defines.h" +#include "box.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +#include "exchange_boundary.inc" +#include "lambda.inc" +#include 
"jacobi.inc" +//#include "operators.ompif/gsrb.inc" +//#include "operators.ompif/chebyshev.inc" +#include "apply_op.inc" +#include "residual.inc" +#include "restriction.inc" +#include "interpolation.inc" +#include "misc.inc" +#include "matmul.inc" +#include "problem1.inc" +//#include "operators.ompif/problem2.inc" +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/problem1.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/problem1.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/problem1.inc @@ -0,0 +1,84 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include "defines.h" +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void initialize_problem(domain_type *domain, int level, double hLevel, double a, double b){ + double NPi = 2.0*M_PI; + double Bmin = 1.0; + double Bmax = 10.0; + double c2 = (Bmax-Bmin)/2; + double c1 = (Bmax+Bmin)/2; + double c3=10.0; // how sharply (B)eta transitions + double c4 = -5.0/0.25; + + int box; + for(box=0;boxsubdomains_per_rank;box++){ + memset(domain->subdomains[box].levels[level].grids[__u_exact],0,domain->subdomains[box].levels[level].volume*sizeof(double)); + memset(domain->subdomains[box].levels[level].grids[__f ],0,domain->subdomains[box].levels[level].volume*sizeof(double)); + int i,j,k; + #pragma omp parallel for private(k,j,i) collapse(2) + for(k=0;ksubdomains[box].levels[level].dim.k;k++){ + for(j=0;jsubdomains[box].levels[level].dim.j;j++){ + for(i=0;isubdomains[box].levels[level].dim.i;i++){ + double x = hLevel*((double)(i+domain->subdomains[box].levels[level].low.i)+0.5); + double y = hLevel*((double)(j+domain->subdomains[box].levels[level].low.j)+0.5); + double z = hLevel*((double)(k+domain->subdomains[box].levels[level].low.k)+0.5); + int ijk = (i+domain->subdomains[box].levels[level].ghosts)+ + domain->subdomains[box].levels[level].pencil*(j+domain->subdomains[box].levels[level].ghosts)+ + domain->subdomains[box].levels[level].plane *(k+domain->subdomains[box].levels[level].ghosts); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double r2 = pow((x-0.50),2) + pow((y-0.50),2) + pow((z-0.50),2); // distance from center squared + double r2x = 2.0*(x-0.50); + double r2y = 2.0*(y-0.50); + double r2z = 2.0*(z-0.50); + double r2xx = 2.0; + double r2yy = 2.0; + double r2zz = 2.0; + double r = pow(r2,0.5); + double rx = 0.5*r2x*pow(r2,-0.5); + double ry = 0.5*r2y*pow(r2,-0.5); + double rz = 0.5*r2z*pow(r2,-0.5); + double rxx = 0.5*r2xx*pow(r2,-0.5) - 0.25*r2x*r2x*pow(r2,-1.5); + double ryy = 0.5*r2yy*pow(r2,-0.5) - 0.25*r2y*r2y*pow(r2,-1.5); + double rzz = 0.5*r2zz*pow(r2,-0.5) - 0.25*r2z*r2z*pow(r2,-1.5); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // beta (B) is a bubble... 
+ double A = 1.0; + double B = c1+c2*tanh( c3*(r-0.25) ); + double Bx = c2*c3*rx*(1-pow(tanh( c3*(r-0.25) ),2)); + double By = c2*c3*ry*(1-pow(tanh( c3*(r-0.25) ),2)); + double Bz = c2*c3*rz*(1-pow(tanh( c3*(r-0.25) ),2)); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + double u = exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double ux = c4*r2x*u + NPi*exp(c4*r2)*cos(NPi*x)*sin(NPi*y)*sin(NPi*z); + double uy = c4*r2y*u + NPi*exp(c4*r2)*sin(NPi*x)*cos(NPi*y)*sin(NPi*z); + double uz = c4*r2z*u + NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*cos(NPi*z); + double uxx = c4*r2xx*u + c4*r2x*ux + c4*r2x*NPi*exp(c4*r2)*cos(NPi*x)*sin(NPi*y)*sin(NPi*z) - NPi*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double uyy = c4*r2yy*u + c4*r2y*uy + c4*r2y*NPi*exp(c4*r2)*sin(NPi*x)*cos(NPi*y)*sin(NPi*z) - NPi*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double uzz = c4*r2zz*u + c4*r2z*uz + c4*r2z*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*cos(NPi*z) - NPi*NPi*exp(c4*r2)*sin(NPi*x)*sin(NPi*y)*sin(NPi*z); + double f = a*A*u - b*( (Bx*ux + By*uy + Bz*uz) + B*(uxx + uyy + uzz) ); + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + domain->subdomains[box].levels[level].grids[__alpha ][ijk] = A; + domain->subdomains[box].levels[level].grids[__beta ][ijk] = B; + domain->subdomains[box].levels[level].grids[__u_exact][ijk] = u; + domain->subdomains[box].levels[level].grids[__f ][ijk] = f; + }}} + } + + double average_value_of_f = mean(domain,level,__f); + if(domain->rank==0){printf("\n average value of f = %20.12e\n",average_value_of_f);fflush(stdout);} + if(a!=0){ + shift_grid(domain,level,__f ,__f ,-average_value_of_f); + shift_grid(domain,level,__u_exact,__u_exact,-average_value_of_f/a); + } + // what if a==0 and average_value_of_f != 0 ??? +} +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/residual.inc @@ -0,0 +1,218 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +void residual(domain_type * domain, int level, int res_id, int phi_id, int rhs_id, double a, double b){ + // exchange the boundary for x in prep for Ax... + // for 7-point stencil, only needs to be a 1-deep ghost zone & faces only + exchange_boundary(domain,level,phi_id,1,0,0); + + // now do residual/restriction proper... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int pencil = domain->subdomains[box].levels[level].pencil; + int plane = domain->subdomains[box].levels[level].plane; + int ghosts = domain->subdomains[box].levels[level].ghosts; + int dim_k = domain->subdomains[box].levels[level].dim.k; + int dim_j = domain->subdomains[box].levels[level].dim.j; + int dim_i = domain->subdomains[box].levels[level].dim.i; + double h2inv = 1.0/(domain->h[level]*domain->h[level]); + double * __restrict__ phi = domain->subdomains[box].levels[level].grids[ phi_id] + ghosts*(1+pencil+plane); // i.e. [0] = first non ghost zone point + double * __restrict__ rhs = domain->subdomains[box].levels[level].grids[ rhs_id] + ghosts*(1+pencil+plane); + double * __restrict__ alpha = domain->subdomains[box].levels[level].grids[__alpha ] + ghosts*(1+pencil+plane); + double * __restrict__ beta_i = domain->subdomains[box].levels[level].grids[__beta_i] + ghosts*(1+pencil+plane); + double * __restrict__ beta_j = domain->subdomains[box].levels[level].grids[__beta_j] + ghosts*(1+pencil+plane); + double * __restrict__ beta_k = domain->subdomains[box].levels[level].grids[__beta_k] + ghosts*(1+pencil+plane); + double * __restrict__ res = domain->subdomains[box].levels[level].grids[ res_id] + ghosts*(1+pencil+plane); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.residual[level] += (uint64_t)(CycleTime()-_timeStart); +} + +#if 1 +//------------------------------------------------------------------------------------------------------------------------------ +// This version maximizes parallelism by parallelizing over the resultant coarse grid. +// Thus, +// one parallelizes over the list of 2x2 fine-grid bars, +// initializes a coarse grid pencil to zero, +// additively restricts each pencil in the 2x2 fine-grid bar to the coarse grid pencil +//------------------------------------------------------------------------------------------------------------------------------ +void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b){ + // exchange the boundary for x in prep for Ax... + // for 7-point stencil, only needs to be a 1-deep ghost zone & faces only + exchange_boundary(domain,level_f,phi_id,1,0,0); + + // now do residual/restriction proper... + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int kk,jj; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + + double h2inv = 1.0/(domain->h[level_f]*domain->h[level_f]); + double * __restrict__ phi = domain->subdomains[box].levels[level_f].grids[ phi_id] + ghosts_f*(1+pencil_f+plane_f); // i.e. [0] = first non ghost zone point + double * __restrict__ rhs = domain->subdomains[box].levels[level_f].grids[ rhs_id] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ alpha = domain->subdomains[box].levels[level_f].grids[__alpha ] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_i = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_j = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ res = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c); + + #pragma omp parallel for private(kk,jj) if(omp_within_a_box) collapse(2) + for(kk=0;kk>1)*pencil_c + (kk>>1)*plane_c; + res[ijk_c] = 0.0; + } + for(k=kk;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + double helmholtz = a*alpha[ijk_f]*phi[ijk_f] + -b*h2inv*( + beta_i[ijk_f+1 ]*( phi[ijk_f+1 ]-phi[ijk_f ] ) + -beta_i[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-1 ] ) + +beta_j[ijk_f+pencil_f]*( phi[ijk_f+pencil_f]-phi[ijk_f ] ) + -beta_j[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-pencil_f] ) + +beta_k[ijk_f+plane_f ]*( phi[ijk_f+plane_f ]-phi[ijk_f ] ) + -beta_k[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-plane_f ] ) + ); + res[ijk_c] += (rhs[ijk_f]-helmholtz)*0.125; + } + }}}} + } +#endif + domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart); +} +#else +//------------------------------------------------------------------------------------------------------------------------------ +// This version performs a 1D parallelization over the coarse-grid k-dimension (every two fine-grid planes) +// It first zeros the coarse grid plane, then increments with restrictions from the fine grid +//------------------------------------------------------------------------------------------------------------------------------ +void residual_and_restriction(domain_type *domain, int level_f, int phi_id, int rhs_id, int level_c, int res_id, double a, double b){ + // exchange the boundary for x 
in prep for Ax... + // for 7-point stencil, only needs to be a 1-deep ghost zone & faces only + exchange_boundary(domain,level_f,phi_id,1,0,0); + + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_f].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_f].dim.i >= CollaborativeThreadingBoxSize); + int box; + + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int dim_k_f = domain->subdomains[box].levels[level_f].dim.k; + int dim_j_f = domain->subdomains[box].levels[level_f].dim.j; + int dim_i_f = domain->subdomains[box].levels[level_f].dim.i; + + double h2inv = 1.0/(domain->h[level_f]*domain->h[level_f]); + double * __restrict__ phi = domain->subdomains[box].levels[level_f].grids[ phi_id] + ghosts_f*(1+pencil_f+plane_f); // i.e. [0] = first non ghost zone point + double * __restrict__ rhs = domain->subdomains[box].levels[level_f].grids[ rhs_id] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ alpha = domain->subdomains[box].levels[level_f].grids[__alpha ] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_i = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_j = domain->subdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ beta_k = domain->subdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ res = domain->subdomains[box].levels[level_c].grids[ res_id] + ghosts_c*(1+pencil_c+plane_c); + + int kk; + #pragma omp parallel for private(kk) if(omp_within_a_box) + for(kk=0;kk>1)*plane_c; + res[ijk_c] = 0.0; + }} + // restrict two fine grid planes into one coarse grid plane + for(k=kk;k>1) + (j>>1)*pencil_c + (k>>1)*plane_c; + double helmholtz = a*alpha[ijk_f]*phi[ijk_f] + -b*h2inv*( + beta_i[ijk_f+1 ]*( phi[ijk_f+1 ]-phi[ijk_f ] ) + -beta_i[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-1 ] ) + +beta_j[ijk_f+pencil_f]*( phi[ijk_f+pencil_f]-phi[ijk_f ] ) + -beta_j[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-pencil_f] ) + +beta_k[ijk_f+plane_f ]*( phi[ijk_f+plane_f ]-phi[ijk_f ] ) + -beta_k[ijk_f ]*( phi[ijk_f ]-phi[ijk_f-plane_f ] ) + ); + res[ijk_c] += (rhs[ijk_f]-helmholtz)*0.125; + }}} + } + } + domain->cycles.residual[level_f] += (uint64_t)(CycleTime()-_timeStart); +} +#endif +//------------------------------------------------------------------------------------------------------------------------------ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc +++ 
test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/restriction.inc @@ -0,0 +1,131 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#include +#include "timer.h" +#include "mg.h" +//------------------------------------------------------------------------------------------------------------------------------ +// id_c^2h = R id_f^h +// restriction of cell-centered data... +//------------------------------------------------------------------------------------------------------------------------------ +void restriction(domain_type *domain, int level_f, int id_c, int id_f){ + int level_c = level_f+1; + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_c].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_c].dim.i >= CollaborativeThreadingBoxSize); + int box; +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + + double * __restrict__ grid_f = domain->subdomains[box].levels[level_f].grids[id_f] + ghosts_f*(1+pencil_f+plane_f); + double * __restrict__ grid_c = domain->subdomains[box].levels[level_c].grids[id_c] + ghosts_c*(1+pencil_c+plane_c); + + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart); +} + + + +//------------------------------------------------------------------------------------------------------------------------------ +// restriction of face-centered data... +//------------------------------------------------------------------------------------------------------------------------------ +void restriction_betas(domain_type * domain, int level_f, int level_c){ + uint64_t _timeStart = CycleTime(); + int CollaborativeThreadingBoxSize = 100000; // i.e. 
never + #ifdef __COLLABORATIVE_THREADING + CollaborativeThreadingBoxSize = 1 << __COLLABORATIVE_THREADING; + #endif + int omp_across_boxes = (domain->subdomains[0].levels[level_c].dim.i < CollaborativeThreadingBoxSize); + int omp_within_a_box = (domain->subdomains[0].levels[level_c].dim.i >= CollaborativeThreadingBoxSize); + int box; + +#ifdef OMP + #pragma omp parallel for private(box) if(omp_across_boxes) + for(box=0;boxsubdomains_per_rank;box++){ + int i,j,k; + int ghosts_c = domain->subdomains[box].levels[level_c].ghosts; + int pencil_c = domain->subdomains[box].levels[level_c].pencil; + int plane_c = domain->subdomains[box].levels[level_c].plane; + int dim_i_c = domain->subdomains[box].levels[level_c].dim.i; + int dim_j_c = domain->subdomains[box].levels[level_c].dim.j; + int dim_k_c = domain->subdomains[box].levels[level_c].dim.k; + + int ghosts_f = domain->subdomains[box].levels[level_f].ghosts; + int pencil_f = domain->subdomains[box].levels[level_f].pencil; + int plane_f = domain->subdomains[box].levels[level_f].plane; + + double * __restrict__ beta_f; + double * __restrict__ beta_c; + + // restrict beta_i (== face in jk) + beta_f = domain->subdomains[box].levels[level_f].grids[__beta_i] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f; + beta_c = domain->subdomains[box].levels[level_c].grids[__beta_i] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;ksubdomains[box].levels[level_f].grids[__beta_j] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f; + beta_c = domain->subdomains[box].levels[level_c].grids[__beta_j] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;ksubdomains[box].levels[level_f].grids[__beta_k] + ghosts_f*plane_f + ghosts_f*pencil_f + ghosts_f; + beta_c = domain->subdomains[box].levels[level_c].grids[__beta_k] + ghosts_c*plane_c + ghosts_c*pencil_c + ghosts_c; + #pragma omp parallel for private(k,j,i) if(omp_within_a_box) collapse(2) + for(k=0;kcycles.restriction[level_f] += (uint64_t)(CycleTime()-_timeStart); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.h @@ -0,0 +1,11 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef SOLVER_H +#define SOLVER_H +#include "mg.h" +void IterativeSolver(domain_type *domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm); +int IterativeSolver_NumGrids(); +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/solver.c @@ -0,0 +1,783 @@ +//------------------------------------------------------------------------------------------------------------------------------ +// Samuel Williams +// SWWilliams@lbl.gov +// Lawrence Berkeley National Lab 
+//------------------------------------------------------------------------------------------------------------------------------ +#include +#include +#include +#include +#include +//------------------------------------------------------------------------------------------------------------------------------ +#ifdef __MPI +#include +#endif +//------------------------------------------------------------------------------------------------------------------------------ +#include "defines.h" +#include "box.h" +#include "mg.h" +#include "operators.h" +#include "timer.h" +//------------------------------------------------------------------------------------------------------------------------------ +#warning Using Diagonally Preconditioned Iterative Solvers +#define DiagonallyPrecondition +//------------------------------------------------------------------------------------------------------------------------------ +#ifndef __CA_KRYLOV_S +#define __CA_KRYLOV_S 4 +#endif + +//#define __VERBOSE +//#define __DEBUG +//------------------------------------------------------------------------------------------------------------------------------ +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +// z[r] = alpha*A[r][c]*x[c]+beta*y[r] // [row][col] +#define __gemv(z,alpha,A,x,beta,y,rows,cols) {int r,c;double sum;for(r=0;r<(rows);r++){sum=0.0;for(c=0;c<(cols);c++){sum+=(A)[r][c]*(x)[c];}(z)[r]=(alpha)*sum+(beta)*(y)[r];}} +inline void __axpy(double * z, double alpha, double * x, double beta, double * y, int n){ // z[n] = alpha*x[n]+beta*y[n] + int nn; + for(nn=0;nnrank==0)printf("m=%8d, norm =%0.20f\n",m,norm_of_rt); + #endif + if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + delta = dot(domain,level,__r,__rt); // delta = dot(r,rt) + if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of __r) + L2_norm_of_rt = sqrt(delta); + + int __ca_krylov_s = 1; + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + while( (mCAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] + matmul_grids(domain,level,Gg,PRrt,PRrt,4*__ca_krylov_s+1,4*__ca_krylov_s+2,1); + for(i=0,k=0;i<4*__ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] + for(j=0 ;j<4*__ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*__ca_krylov_s+1 elements in each row go to G[][]. + g[i] = Gg[k++]; // last element in row goes to g[]. + } + + for(i=0;i<4*__ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)cj[i]=0.0;cj[2*__ca_krylov_s+1]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) + + for(n=0;n<__ca_krylov_s;n++){ // for(n=0;n<__ca_krylov_s;n++){ + domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison + __gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'aj + __gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'cj + __gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T''aj + g_dot_Tpaj = __dot(g,Tpaj,4*__ca_krylov_s+1); // (g,T'aj) + if(g_dot_Tpaj == 0.0){ // pivot breakdown ??? 
+ #ifdef __VERBOSE // + if(domain->rank==0){printf("g_dot_Tpaj == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + #ifdef __VERBOSE // + if(domain->rank==0){printf("alpha == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + #if 0 // seems to have accuracy problems in finite precision... + __gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = - alpha*GT'aj + __gemv(temp1, 1.0, G, cj, 1.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj + __gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = − alpha*GT′′aj + __gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj + __axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj + omega_numerator = __dot(temp3,temp1,4*__ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) + omega_denominator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) + #else // better to change the order of operations Gx-Gy -> G(x-y) ... (note, G is symmetric) + __axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) + __axpy(temp3, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp3[] = cj - alpha*T'aj + omega_numerator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) + omega_denominator = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) + #endif // + // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 + // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj + // If omega_numerator==0, and ||s||!=0, then stabilization breakdown + + // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! + __axpy( ej,1.0,ej, alpha, aj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + + // calculate the norm of Saad's vector 's' to check intra s-step convergence... + __axpy(temp1, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp1[] = cj - alpha*T'aj + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) + L2_norm_of_s = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic + if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE // + if(domain->rank==0){printf("m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // + #endif // + if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + if(omega_denominator == 0.0){ // ??? breakdown + #ifdef __VERBOSE // + if(domain->rank==0){if(omega_denominator == 0.0)printf("omega_denominator == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + omega = omega_numerator / omega_denominator; // + if(isinf(omega)){ // omega = big/tiny(oveflow) = inf + #ifdef __VERBOSE // + if(domain->rank==0){if(isinf(omega))printf("omega == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + // !!! 
COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // + __axpy( ej,1.0,ej, omega, cj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] + __axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] + __axpy( cj,1.0,cj, -omega, Tpcj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] + __axpy( cj,1.0,cj, -alpha, Tpaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + __axpy( cj,1.0,cj, omega*alpha,Tppaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] + + + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... + __gemv(temp1, 1.0, G, cj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj + cj_dot_Gcj = __dot(cj,temp1,4*__ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic + L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE + if(domain->rank==0){printf("m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} + #endif + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + delta_next = __dot( g,cj,4*__ca_krylov_s+1); // (g,cj) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(delta_next) ){printf("delta == inf\n");} // delta = big/tiny(overflow) = inf + if(delta_next == 0.0){printf("delta == 0.0\n");} // Lanczos breakdown + if(omega_numerator == 0.0){printf("omega_numerator == 0.0\n");} // stabilization breakdown + if(omega == 0.0){printf("omega == 0.0\n");} // stabilization breakdown + } // + #endif // + if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? + if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... + if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown + beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(beta) ){printf("beta == inf\n");} // beta = inf? + if(beta == 0.0){printf("beta == 0.0\n");} // beta = 0? can't make further progress(?) + } // + #endif // + if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? + if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) + __axpy( aj,1.0,cj, beta, aj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] + __axpy( aj,1.0,aj, -omega*beta, Tpaj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj + delta = delta_next; // delta = delta_next + + } // inner n (j) loop + + // update iterates... + for(i=0;i<4*__ca_krylov_s+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] + if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these + add_grids(domain,level, __p,0.0, __p,aj[0],PRrt[0]); // p[] = [P,R]aj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PRrt[i]);} // ... + add_grids(domain,level, __r,0.0, __r,cj[0],PRrt[0]); // r[] = [P,R]cj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PRrt[i]);} // ... 
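+      // Telescoping note: this routine enters the outer loop with a one-step basis (__ca_krylov_s = 1)
+      // and, in the update just after this block, doubles __ca_krylov_s on every outer pass until it is
+      // capped at __CA_KRYLOV_S.  The likely intent is that early passes (which often already converge)
+      // use short, better-conditioned bases, while later passes amortize the single
+      // matmul_grids()/reduction over a longer s-step basis.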
+ } // + m+=__ca_krylov_s; // m+=ca_krylov_s; + __ca_krylov_s*=2;if(__ca_krylov_s>__CA_KRYLOV_S)__ca_krylov_s=__CA_KRYLOV_S; + } // } // outer m loop + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef DiagonallyPrecondition + mul_grids(domain,level,e_id,1.0,__lambda,e_id); // e_id[] = lambda[]*e_id[] // i.e. e = D^{-1}e' + #endif + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void CABiCGStab(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // based on Erin Carson/Jim Demmel/Nick Knight's s-Step BiCGStab Algorithm 3.4 + + // note: __CA_KRYLOV_S should be tiny (2-8?). As such, 4*__CA_KRYLOV_S+1 is also tiny (9-33). Just allocate on the stack... + double temp1[4*__CA_KRYLOV_S+1]; // + double temp2[4*__CA_KRYLOV_S+1]; // + double temp3[4*__CA_KRYLOV_S+1]; // + double Tp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T' indexed as [row][col] + double Tpp[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // T'' indexed as [row][col] + double aj[4*__CA_KRYLOV_S+1]; // + double cj[4*__CA_KRYLOV_S+1]; // + double ej[4*__CA_KRYLOV_S+1]; // + double Tpaj[4*__CA_KRYLOV_S+1]; // + double Tpcj[4*__CA_KRYLOV_S+1]; // + double Tppaj[4*__CA_KRYLOV_S+1]; // + double G[4*__CA_KRYLOV_S+1][4*__CA_KRYLOV_S+1]; // extracted from first 4*__CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] + double g[4*__CA_KRYLOV_S+1]; // extracted from last [4*__CA_KRYLOV_S+1] column of Gg[][]. + double Gg[(4*__CA_KRYLOV_S+1)*(4*__CA_KRYLOV_S+2)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(4*__CA_KRYLOV_S+2) + col] + int PRrt[4*__CA_KRYLOV_S+2]; // grid_id's of the concatenation of the 2S+1 matrix powers of P, 2S matrix powers of R, and rt + int *P = PRrt+ 0; // grid_id's of the 2S+1 Matrix Powers of P. P[i] is the grid_id of A^i(p) + int *R = PRrt+2*__CA_KRYLOV_S+1; // grid_id's of the 2S Matrix Powers of R. R[i] is the grid_id of A^i(r) + + int mMax=200; + int m=0,n; + int i,j,k; + int BiCGStabFailed = 0; + int BiCGStabConverged = 0; + double g_dot_Tpaj,alpha,omega_numerator,omega_denominator,omega,delta,delta_next,beta; + double L2_norm_of_rt,L2_norm_of_residual,cj_dot_Gcj,L2_norm_of_s; + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + residual(domain,level,__rt,e_id,R_id,a,b); // rt[] = R_id[] - A(e_id)... note, if DPC, then rt = R-AD^-1De + scale_grid(domain,level,__r,1.0,__rt); // r[] = rt[] + scale_grid(domain,level,__p,1.0,__rt); // p[] = rt[] + double norm_of_rt = norm(domain,level,__rt); // the norm of the initial residual... 
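+  // Unlike TelescopingCABiCGStab above, this routine keeps the basis length fixed at __CA_KRYLOV_S for
+  // every outer iteration.  norm_of_rt (and L2_norm_of_rt = sqrt(delta) below) sets the reference
+  // scale: convergence is declared once the monitored norm falls below
+  // desired_reduction_in_norm * L2_norm_of_rt.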
+ #ifdef __VERBOSE + if(domain->rank==0)printf("m=%8d, norm =%0.20f\n",m,norm_of_rt); + #endif + if(norm_of_rt == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + delta = dot(domain,level,__r,__rt); // delta = dot(r,rt) + if(delta==0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution (square of L2 norm of __r) + L2_norm_of_rt = sqrt(delta); + + int __ca_krylov_s = __CA_KRYLOV_S; // by making this a variable, I prevent the compiler from optimizing more than the telescoping version, thus preserving a bit-identcal result + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++) Tp[i][j]=0; // initialize Tp[][] and Tpp[][] ... + for(i=0;i<4*__ca_krylov_s+1;i++)for(j=0;j<4*__ca_krylov_s+1;j++)Tpp[i][j]=0; // + for(i= 0;i<2*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // monomial basis... Fixed (typo in SIAM paper) + for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s ;i++){ Tp[i+1][i]=1;} // + for(i= 0;i<2*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // + for(i=2*__ca_krylov_s+1;i<4*__ca_krylov_s-1;i++){Tpp[i+2][i]=1;} // + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + for(i=0;i<4*__ca_krylov_s+1;i++){PRrt[ i] = __PRrt+i;} // columns of PRrt map to the consecutive spare grid indices starting at __PRrt + PRrt[4*__ca_krylov_s+1] = __rt; // last column or PRrt (r tilde) maps to rt + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + while( (mCAKrylov_formations_of_G++; // Record the number of times CABiCGStab formed G[][] + matmul_grids(domain,level,Gg,PRrt,PRrt,4*__ca_krylov_s+1,4*__ca_krylov_s+2,1); + for(i=0,k=0;i<4*__ca_krylov_s+1;i++){ // extract G[][] and g[] from Gg[] + for(j=0 ;j<4*__ca_krylov_s+1;j++){G[i][j] = Gg[k++];} // first 4*__ca_krylov_s+1 elements in each row go to G[][]. + g[i] = Gg[k++]; // last element in row goes to g[]. + } + + for(i=0;i<4*__ca_krylov_s+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)cj[i]=0.0;cj[2*__ca_krylov_s+1]=1.0; // initialized based on (3.26) + for(i=0;i<4*__ca_krylov_s+1;i++)ej[i]=0.0; // initialized based on (3.26) + + for(n=0;n<__ca_krylov_s;n++){ // for(n=0;n<__ca_krylov_s;n++){ + domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison + __gemv( Tpaj, 1.0, Tp, aj, 0.0, Tpaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'aj + __gemv( Tpcj, 1.0, Tp, cj, 0.0, Tpcj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T'cj + __gemv(Tppaj, 1.0,Tpp, aj, 0.0,Tppaj,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // T''aj + g_dot_Tpaj = __dot(g,Tpaj,4*__ca_krylov_s+1); // (g,T'aj) + if(g_dot_Tpaj == 0.0){ // pivot breakdown ??? + #ifdef __VERBOSE // + if(domain->rank==0){printf("g_dot_Tpaj == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + alpha = delta / g_dot_Tpaj; // delta / (g,T'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + #ifdef __VERBOSE // + if(domain->rank==0){printf("alpha == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + #if 0 // seems to have accuracy problems in finite precision... 
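+      // Both branches of this #if compute the same two scalars.  Since G is the Gram matrix
+      // [P,R]^T[P,R] of the Krylov basis grids, a coefficient-space product such as (c,Gd) equals (in
+      // exact arithmetic) the grid-space dot product of [P,R]c and [P,R]d; that is what lets omega and
+      // the norms of 's' and 'r' below be formed from small (4*__ca_krylov_s+1)-element vectors without
+      // touching the grids again.  The #else branch regroups Gx-Gy as G(x-y), which the author found
+      // better behaved in finite precision.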
+ __gemv(temp1,-alpha, G, Tpaj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = - alpha*GT'aj + __gemv(temp1, 1.0, G, cj, 1.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj - alpha*GT'aj + __gemv(temp2,-alpha, G,Tppaj, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = − alpha*GT′′aj + __gemv(temp2, 1.0, G, Tpcj, 1.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = GT′cj − alpha*GT′′aj + __axpy(temp3, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp3[] = T′cj − alpha*T′′aj + omega_numerator = __dot(temp3,temp1,4*__ca_krylov_s+1); // (temp3,temp1) = ( T'cj-alpha*T''aj , Gcj-alpha*GT'aj ) + omega_denominator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( T′cj−alpha*T′′aj , GT′cj−alpha*GT′′aj ) + #else // better to change the order of operations Gx-Gy -> G(x-y) ... (note, G is symmetric) + __axpy(temp1, 1.0, Tpcj,-alpha,Tppaj,4*__ca_krylov_s+1); // temp1[] = (T'cj - alpha*T''aj) + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(T'cj - alpha*T''aj) + __axpy(temp3, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp3[] = cj - alpha*T'aj + omega_numerator = __dot(temp3,temp2,4*__ca_krylov_s+1); // (temp3,temp2) = ( ( cj - alpha*T'aj ) , G(T'cj - alpha*T''aj) ) + omega_denominator = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (T'cj - alpha*T''aj) , G(T'cj - alpha*T''aj) ) + #endif // + // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0 + // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj + // If omega_numerator==0, and ||s||!=0, then stabilization breakdown + + // !!! PARTIAL UPDATE OF ej MUST HAPPEN BEFORE THE CHECK ON OMEGA TO ENSURE FORWARD PROGRESS !!! + __axpy( ej,1.0,ej, alpha, aj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + + // calculate the norm of Saad's vector 's' to check intra s-step convergence... + __axpy(temp1, 1.0, cj,-alpha, Tpaj,4*__ca_krylov_s+1); // temp1[] = cj - alpha*T'aj + __gemv(temp2, 1.0, G,temp1, 0.0,temp2,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp2[] = G(cj - alpha*T'aj) + L2_norm_of_s = __dot(temp1,temp2,4*__ca_krylov_s+1); // (temp1,temp2) = ( (cj - alpha*T'aj) , G(cj - alpha*T'aj) ) == square of L2 norm of s in exact arithmetic + if(L2_norm_of_s<0)L2_norm_of_s=0;else L2_norm_of_s=sqrt(L2_norm_of_s); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE // + if(domain->rank==0){printf("m=%8d, norm(s)=%0.20f\n",m+n,L2_norm_of_s);} // + #endif // + if(L2_norm_of_s < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + if(omega_denominator == 0.0){ // ??? breakdown + #ifdef __VERBOSE // + if(domain->rank==0){if(omega_denominator == 0.0)printf("omega_denominator == 0.0\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + omega = omega_numerator / omega_denominator; // + if(isinf(omega)){ // omega = big/tiny(oveflow) = inf + #ifdef __VERBOSE // + if(domain->rank==0){if(isinf(omega))printf("omega == inf\n");} // + #endif // + BiCGStabFailed=1;break; // + } // + // !!! 
COMPLETE THE UPDATE OF ej & cj now that omega is known to be ok // + __axpy( ej,1.0,ej, omega, cj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] + __axpy( ej,1.0,ej,-omega*alpha, Tpaj,4*__ca_krylov_s+1); // ej[] = ej[] + alpha*aj[] + omega*cj[] - omega*alpha*T'aj[] + __axpy( cj,1.0,cj, -omega, Tpcj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] + __axpy( cj,1.0,cj, -alpha, Tpaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + __axpy( cj,1.0,cj, omega*alpha,Tppaj,4*__ca_krylov_s+1); // cj[] = cj[] - omega*T'cj[] - alpha*T'aj[] + omega*alpha*T''aj[] + + + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... + __gemv(temp1, 1.0, G, cj, 0.0,temp1,4*__ca_krylov_s+1,4*__ca_krylov_s+1); // temp1[] = Gcj + cj_dot_Gcj = __dot(cj,temp1,4*__ca_krylov_s+1); // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic + L2_norm_of_residual = 0.0;if(cj_dot_Gcj>0)L2_norm_of_residual=sqrt(cj_dot_Gcj); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + #ifdef __VERBOSE + if(domain->rank==0){printf("m=%8d, norm(r)=%0.20f (cj_dot_Gcj=%0.20e)\n",m+n,L2_norm_of_residual,cj_dot_Gcj);} + #endif + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_rt){BiCGStabConverged=1;break;} // terminate the inner n-loop + + + delta_next = __dot( g,cj,4*__ca_krylov_s+1); // (g,cj) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(delta_next) ){printf("delta == inf\n");} // delta = big/tiny(overflow) = inf + if(delta_next == 0.0){printf("delta == 0.0\n");} // Lanczos breakdown + if(omega_numerator == 0.0){printf("omega_numerator == 0.0\n");} // stabilization breakdown + if(omega == 0.0){printf("omega == 0.0\n");} // stabilization breakdown + } // + #endif // + if(isinf(delta_next)){BiCGStabFailed =1;break;} // delta = inf? + if(delta_next ==0.0){BiCGStabFailed =1;break;} // Lanczos breakdown... + if(omega ==0.0){BiCGStabFailed =1;break;} // stabilization breakdown + beta = (delta_next/delta)*(alpha/omega); // (delta_next/delta)*(alpha/omega) + #ifdef __VERBOSE // + if(domain->rank==0){ // + if(isinf(beta) ){printf("beta == inf\n");} // beta = inf? + if(beta == 0.0){printf("beta == 0.0\n");} // beta = 0? can't make further progress(?) + } // + #endif // + if(isinf(beta) ){BiCGStabFailed =1;break;} // beta = inf? + if(beta == 0.0){BiCGStabFailed =1;break;} // beta = 0? can't make further progress(?) + __axpy( aj,1.0,cj, beta, aj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] + __axpy( aj,1.0,aj, -omega*beta, Tpaj,4*__ca_krylov_s+1); // aj[] = cj[] + beta*aj[] - omega*beta*T'aj + delta = delta_next; // delta = delta_next + + } // inner n (j) loop + + // update iterates... + for(i=0;i<4*__ca_krylov_s+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PRrt[i]);} // e_id[] = [P,R]ej + e_id[] + if(!BiCGStabFailed && !BiCGStabConverged){ // if we're done, then there is no point in updating these + add_grids(domain,level, __p,0.0, __p,aj[0],PRrt[0]); // p[] = [P,R]aj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PRrt[i]);} // ... + add_grids(domain,level, __r,0.0, __r,cj[0],PRrt[0]); // r[] = [P,R]cj + for(i=1;i<4*__ca_krylov_s+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PRrt[i]);} // ... 
+ } // + m+=__ca_krylov_s; // m+=__ca_krylov_s; + } // } // outer m loop + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + #ifdef DiagonallyPrecondition + mul_grids(domain,level,e_id,1.0,__lambda,e_id); // e_id[] = lambda[]*e_id[] // i.e. e = D^{-1}e' + #endif +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void BiCGStab(domain_type * domain, int level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // Algorithm 7.7 in Iterative Methods for Sparse Linear Systems(Yousef Saad) + // uses "right" preconditioning... AD^{-1}(Dx) = b ... AD^{-1}y = b ... solve for y, then solve for x = D^{-1}y + int jMax=200; + int j=0; + int BiCGStabFailed = 0; + int BiCGStabConverged = 0; + residual(domain,level,__r0,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) + scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] + scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] + double r_dot_r0 = dot(domain,level,__r,__r0); // r_dot_r0 = dot(r,r0) + double norm_of_r0 = norm(domain,level,__r); // the norm of the initial residual... + if(r_dot_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + if(norm_of_r0 == 0.0){BiCGStabConverged=1;} // entered BiCGStab with exact solution + while( (jKrylov_iterations++; // + #ifdef DiagonallyPrecondition // + mul_grids(domain,level,__temp,1.0,__lambda,__p); // temp[] = lambda[]*p[] + apply_op(domain,level,__Ap,__temp,a,b); // Ap = AD^{-1}(p) + #else // + apply_op(domain,level,__Ap,__p,a,b); // Ap = A(p) + #endif // + double Ap_dot_r0 = dot(domain,level,__Ap,__r0); // Ap_dot_r0 = dot(Ap,r0) + if(Ap_dot_r0 == 0.0){BiCGStabFailed=1;break;} // pivot breakdown ??? + double alpha = r_dot_r0 / Ap_dot_r0; // alpha = r_dot_r0 / Ap_dot_r0 + if(isinf(alpha)){BiCGStabFailed=1;break;} // pivot breakdown ??? + add_grids(domain,level,x_id,1.0,x_id, alpha,__p ); // x_id[] = x_id[] + alpha*p[] + add_grids(domain,level,__s ,1.0,__r ,-alpha,__Ap); // s[] = r[] - alpha*Ap[] (intermediate residual?) + double norm_of_s = norm(domain,level,__s); // FIX - redundant?? norm of intermediate residual + if(norm_of_s == 0.0){BiCGStabConverged=1;break;} // FIX - redundant?? if As_dot_As==0, then As must be 0 which implies s==0 + if(norm_of_s < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} + #ifdef DiagonallyPrecondition // + mul_grids(domain,level,__temp,1.0,__lambda,__s); // temp[] = lambda[]*s[] + apply_op(domain,level,__As,__temp,a,b); // As = AD^{-1}(s) + #else // + apply_op(domain,level,__As,__s,a,b); // As = A(s) + #endif // + double As_dot_As = dot(domain,level,__As,__As); // As_dot_As = dot(As,As) + double As_dot_s = dot(domain,level,__As, __s); // As_dot_s = dot(As, s) + if(As_dot_As == 0.0){BiCGStabConverged=1;break;} // converged ? + double omega = As_dot_s / As_dot_As; // omega = As_dot_s / As_dot_As + if(omega == 0.0){BiCGStabFailed=1;break;} // stabilization breakdown ??? + if(isinf(omega)){BiCGStabFailed=1;break;} // stabilization breakdown ??? + add_grids(domain,level, x_id, 1.0,x_id, omega,__s ); // x_id[] = x_id[] + omega*s[] + add_grids(domain,level,__r , 1.0,__s ,-omega,__As ); // r[] = s[] - omega*As[] (recursively computed / updated residual) + double norm_of_r = norm(domain,level,__r); // norm of recursively computed residual (good enough??) 
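+    // __r is the recursively updated residual (r[] = s[] - omega*As[] above), not R_id - A(x_id)
+    // recomputed from scratch; the two can drift apart in finite precision, which is why the __DEBUG
+    // block below re-evaluates the true residual for comparison.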
+ if(norm_of_r == 0.0){BiCGStabConverged=1;break;} // + if(norm_of_r < desired_reduction_in_norm*norm_of_r0){BiCGStabConverged=1;break;} + #ifdef __DEBUG // + residual(domain,level,__temp,x_id,R_id,a,b); // + double norm_of_residual = norm(domain,level,__temp); // + if(domain->rank==0)printf("j=%8d, norm=%12.6e, norm_inital=%12.6e, reduction=%e\n",j,norm_of_residual,norm_of_r0,norm_of_residual/norm_of_r0); // + #endif // + double r_dot_r0_new = dot(domain,level,__r,__r0); // r_dot_r0_new = dot(r,r0) + if(r_dot_r0_new == 0.0){BiCGStabFailed=1;break;} // Lanczos breakdown ??? + double beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega); // beta = (r_dot_r0_new/r_dot_r0) * (alpha/omega) + if(isinf(beta)){BiCGStabFailed=1;break;} // ??? + add_grids(domain,level,__temp,1.0,__p,-omega,__Ap ); // __temp = (p[]-omega*Ap[]) + add_grids(domain,level,__p ,1.0,__r, beta,__temp); // p[] = r[] + beta*(p[]-omega*Ap[]) + r_dot_r0 = r_dot_r0_new; // r_dot_r0 = r_dot_r0_new (save old r_dot_r0) + } // } + #ifdef DiagonallyPrecondition // + mul_grids(domain,level,x_id,1.0,__lambda,x_id); // x_id[] = lambda[]*x_id[] // i.e. x = D^{-1}x' + #endif // +} + +//------------------------------------------------------------------------------------------------------------------------------ +void CACG(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // based on Lauren Goodfriend, Yinghui Huang, and David Thorman's derivation in their Spring 2013 CS267 Report + + double temp1[2*__CA_KRYLOV_S+1]; // + double temp2[2*__CA_KRYLOV_S+1]; // + double temp3[2*__CA_KRYLOV_S+1]; // + double aj[2*__CA_KRYLOV_S+1]; // + double cj[2*__CA_KRYLOV_S+1]; // + double ej[2*__CA_KRYLOV_S+1]; // + double Tpaj[2*__CA_KRYLOV_S+1]; // + double Tp[2*__CA_KRYLOV_S+1][2*__CA_KRYLOV_S+1]; // T' indexed as [row][col] + double G[2*__CA_KRYLOV_S+1][2*__CA_KRYLOV_S+1]; // extracted from first 2*__CA_KRYLOV_S+1 columns of Gg[][]. indexed as [row][col] + double Gbuf[(2*__CA_KRYLOV_S+1)*(2*__CA_KRYLOV_S+1)]; // buffer to hold the Gram-like matrix produced by matmul(). indexed as [row*(2*__CA_KRYLOV_S+1) + col] + int PR[2*__CA_KRYLOV_S+1]; // grid_id's of the concatenation of the S+1 matrix powers of P, and the S matrix powers of R + int *P = PR+ 0; // grid_id's of the S+1 Matrix Powers of P. P[i] is the grid_id of A^i(p) + int *R = PR+__CA_KRYLOV_S+1; // grid_id's of the S Matrix Powers of R. R[i] is the grid_id of A^i(r) + + int mMax=200; + int m=0,n; + int i,j,k; + int CGFailed = 0; + int CGConverged = 0; + + double aj_dot_GTpaj,cj_dot_Gcj,alpha,cj_dot_Gcj_new,beta,L2_norm_of_r0,L2_norm_of_residual,delta; + + residual(domain,level,__r0,e_id,R_id,a,b); // r0[] = R_id[] - A(e_id) + scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] + scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] + double norm_of_r0 = norm(domain,level,__r0); // the norm of the initial residual... + if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution + + delta = dot(domain,level,__r,__r0); // delta = dot(r,r0) + if(delta==0.0){CGConverged=1;} // entered CG with exact solution (square of L2 norm of __r) + L2_norm_of_r0 = sqrt(delta); // + + + + // initialize Tp[][] ... 
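+  // Tp ("T prime") is the small shift matrix of the monomial basis: its sub-diagonal 1s advance the
+  // coefficients of A^i(p) to A^{i+1}(p) (and similarly within the R block), so that, in exact
+  // arithmetic and for the powers kept in the basis, applying the operator to a represented vector is
+  // just the (2*__CA_KRYLOV_S+1)-element product T'aj used in the inner loop; no additional grid-level
+  // stencil applications or communication are needed inside the s-step body.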
+ for(i=0;i<2*__CA_KRYLOV_S+1;i++)for(j=0;j<2*__CA_KRYLOV_S+1;j++) Tp[i][j]=0; // zero Tp + for(i= 0;i< __CA_KRYLOV_S ;i++){ Tp[i+1][i]=1;} // monomial basis + for(i=__CA_KRYLOV_S+1;i<2*__CA_KRYLOV_S ;i++){ Tp[i+1][i]=1;} // + + for(i=0;i<2*__CA_KRYLOV_S+1;i++){PR[i] = __PRrt+i;} // columns of PR map to the consecutive spare grids allocated for the bottom solver starting at __PRrt + + + while( (mCAKrylov_formations_of_G++; // Record the number of times CACG formed G[][] + matmul_grids(domain,level,Gbuf,PR,PR,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1,1); // Compute Gbuf[][] = [P,R]^T * [P,R] (Matmul with grids but only one MPI_AllReduce) + for(i=0,k=0;i<2*__CA_KRYLOV_S+1;i++){ // extract G[][] from Gbuf[] + for(j=0 ;j<2*__CA_KRYLOV_S+1;j++){G[i][j] = Gbuf[k++];} // first 2*__CA_KRYLOV_S+1 elements in each row go to G[][]. + } + + + for(i=0;i<2*__CA_KRYLOV_S+1;i++)aj[i]=0.0;aj[ 0]=1.0; // initialized based on (???) + for(i=0;i<2*__CA_KRYLOV_S+1;i++)cj[i]=0.0;cj[__CA_KRYLOV_S+1]=1.0; // initialized based on (???) + for(i=0;i<2*__CA_KRYLOV_S+1;i++)ej[i]=0.0; // initialized based on (???) + + for(n=0;n<__CA_KRYLOV_S;n++){ // for(n=0;n<__CA_KRYLOV_S;n++){ + domain->Krylov_iterations++; // record number of inner-loop (j) iterations for comparison + __gemv( Tpaj,1.0,Tp, aj,0.0, Tpaj,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // T'aj + __gemv(temp1,1.0, G,Tpaj,0.0,temp1,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp1[] = GT'aj + __gemv(temp2,1.0, G, cj,0.0,temp2,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp2[] = Gcj + aj_dot_GTpaj = __dot(aj,temp1,2*__CA_KRYLOV_S+1); // (aj,GT'aj) + cj_dot_Gcj = __dot(cj,temp2,2*__CA_KRYLOV_S+1); // (cj, Gcj) + // FIX, can cj_dot_Gcj ever be zero ? + if(aj_dot_GTpaj == 0.0){ // pivot breakdown ??? + CGFailed=1;break; // + } // + alpha = cj_dot_Gcj / aj_dot_GTpaj; // alpha = (cj,Gcj) / (aj,GT'aj) + if(isinf(alpha)){ // alpha = big/tiny(overflow) = inf -> breakdown + CGFailed=1;break; // + } // + __axpy( ej,1.0,ej, alpha, aj,2*__CA_KRYLOV_S+1); // ej[] = ej[] + alpha*aj[] + __axpy( cj,1.0,cj, -alpha, Tpaj,2*__CA_KRYLOV_S+1); // cj[] = cj[] - alpha*T'*aj[] + __gemv(temp2,1.0, G, cj,0.0,temp2,2*__CA_KRYLOV_S+1,2*__CA_KRYLOV_S+1); // temp2[] = Gcj + cj_dot_Gcj_new = __dot(cj,temp2,2*__CA_KRYLOV_S+1); // (cj, Gcj) + // calculate the norm of the incremental residual (Saad's vector 'r') to check intra s-step convergence... == cj_dot_Gcj_new?? + L2_norm_of_residual = 0.0;if(cj_dot_Gcj_new>0)L2_norm_of_residual=sqrt(cj_dot_Gcj_new); // finite precision can lead to the norm^2 being < 0 (Demmel says flush to 0.0) + if(L2_norm_of_residual < desired_reduction_in_norm*L2_norm_of_r0){CGConverged=1;break;} // terminate the inner n-loop + if(cj_dot_Gcj_new == 0.0){ // Lanczos breakdown ??? + CGFailed=1;break; // + } // + beta = cj_dot_Gcj_new / cj_dot_Gcj; // + if(isinf(beta)){CGFailed=1;break;} // beta = inf? + if(beta == 0.0){CGFailed=1;break;} // beta = 0? can't make further progress(?) + __axpy( aj,1.0,cj, beta, aj,2*__CA_KRYLOV_S+1); // cj[] = cj[] + beta*aj[] + + } // inner n (j) loop + + // update iterates... + for(i=0;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level,e_id,1.0,e_id,ej[i],PR[i]);} // e_id[] = [P,R]ej + e_id[] + if(!CGFailed && !CGConverged){ // if we're done, then there is no point in updating these + add_grids(domain,level, __p,0.0, __p,aj[0],PR[0]); // p[] = [P,R]aj + for(i=1;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level, __p,1.0, __p,aj[i],PR[i]);} // ... 
+ add_grids(domain,level, __r,0.0, __r,cj[0],PR[0]); // r[] = [P,R]cj + for(i=1;i<2*__CA_KRYLOV_S+1;i++){add_grids(domain,level, __r,1.0, __r,cj[i],PR[i]);} // ... + } + m+=__CA_KRYLOV_S; // m+=__CA_KRYLOV_S; + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + } // } // outer m loop + +} + + +//------------------------------------------------------------------------------------------------------------------------------ +void CG(domain_type * domain, int level, int x_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // Algorithm 6.18 in Iterative Methods for Sparse Linear Systems(Yousef Saad) + int jMax=200; + int j=0; + int CGFailed = 0; + int CGConverged = 0; + residual(domain,level,__r0,x_id,R_id,a,b); // r0[] = R_id[] - A(x_id) + scale_grid(domain,level,__r,1.0,__r0); // r[] = r0[] + scale_grid(domain,level,__p,1.0,__r0); // p[] = r0[] + double norm_of_r0 = norm(domain,level,__r); // the norm of the initial residual... + if(norm_of_r0 == 0.0){CGConverged=1;} // entered CG with exact solution + double r_dot_r = dot(domain,level,__r,__r); // r_dot_r = dot(r,r) + while( (jKrylov_iterations++; // + apply_op(domain,level,__Ap,__p,a,b); // Ap = A(p) + double Ap_dot_p = dot(domain,level,__Ap,__p); // Ap_dot_p = dot(Ap,p) + if(Ap_dot_p == 0.0){CGFailed=1;break;} // pivot breakdown ??? + double alpha = r_dot_r / Ap_dot_p; // alpha = r_dot_r / Ap_dot_p + if(isinf(alpha)){CGFailed=1;break;} // ??? + add_grids(domain,level,x_id,1.0,x_id, alpha,__p ); // x_id[] = x_id[] + alpha*p[] + add_grids(domain,level,__r ,1.0,__r ,-alpha,__Ap); // r[] = r[] - alpha*Ap[] (intermediate residual?) + double norm_of_r = norm(domain,level,__r); // norm of intermediate residual + if(norm_of_r == 0.0){CGConverged=1;break;} // + if(norm_of_r < desired_reduction_in_norm*norm_of_r0){CGConverged=1;break;} // + double r_dot_r_new = dot(domain,level,__r,__r); // r_dot_r_new = dot(r_{j+1},r_{j+1}) + if(r_dot_r_new == 0.0){CGFailed=1;break;} // Lanczos breakdown ??? + double beta = (r_dot_r_new/r_dot_r); // beta = (r_dot_r_new/r_dot_r) + if(isinf(beta)){CGFailed=1;break;} // ??? + add_grids(domain,level,__p,1.0,__r,beta,__p ); // p[] = r[] + beta*p[] + r_dot_r = r_dot_r_new; // r_dot_r = r_dot_r_new (save old r_dot_r) + } // } +} + +//------------------------------------------------------------------------------------------------------------------------------ +void IterativeSolver(domain_type * domain, int level, int e_id, int R_id, double a, double b, double desired_reduction_in_norm){ + // code presumes e_id was an externally initialized with the initial guess + #ifdef __USE_BICGSTAB + #warning Using BiCGStab bottom solver... + BiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #elif __USE_CG + #warning Using CG bottom solver... + CG(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #elif __USE_CABICGSTAB + #warning Using Communication-Avoiding BiCGStab bottom solver... + // CABiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + TelescopingCABiCGStab(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #elif __USE_CACG + #warning Using Communication-Avoiding CG bottom solver... + CACG(domain,level,e_id,R_id,a,b,desired_reduction_in_norm); + #else // just multiple smooth()'s + #warning Defaulting to simple bottom solver with fixed number of smooths... 
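+  // Bottom-solver selection is purely compile time: defining __USE_BICGSTAB, __USE_CG, __USE_CABICGSTAB
+  // (which invokes the telescoping variant above), or __USE_CACG (for example -D__USE_CABICGSTAB on the
+  // compile line, optionally with -D__CA_KRYLOV_S=<s>) picks one of the Krylov methods; with none of
+  // them defined, this branch simply applies numSmoothsBottom smooths at the coarsest level.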
+  int s,numSmoothsBottom=48;
+  for(s=0;s<numSmoothsBottom;s++)smooth(domain,level,e_id,R_id,a,b);
+  #endif
+}
+uint64_t CycleTime();
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/timer.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/timer.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/timer.c
@@ -0,0 +1,21 @@
+//------------------------------------------------------------------------------------------------------------------------------
+// Samuel Williams
+// SWWilliams@lbl.gov
+// Lawrence Berkeley National Lab
+//------------------------------------------------------------------------------------------------------------------------------
+#include <stdint.h>
+uint64_t CycleTime(){
+  #ifndef __has_builtin
+  #define __has_builtin(x) 0
+  #endif
+
+  #if !defined(USE_CYCLE_COUNTER)
+  return 0;
+  #elif __has_builtin(__builtin_readcyclecounter)
+  return __builtin_readcyclecounter();
+  #else
+  uint64_t lo, hi;
+  __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+  return( (((uint64_t)hi) << 32) | ((uint64_t)lo) );
+  #endif
+}
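For reference, the sketch below (not part of the patch) evaluates the same 7-point variable-coefficient stencil that apply_op(), residual(), and residual_and_restriction() use above, a*alpha*phi - b*h2inv*div(beta*grad(phi)), on a flattened grid indexed as ijk = i + j*pencil + k*plane with a one-cell ghost zone. The grid size, the constant test data, and the main() driver are invented for illustration; only the stencil expression is taken from the code above.

#include <stdio.h>
#include <stdlib.h>

// Toy evaluation of the miniGMG-style 7-point stencil on an N^3 box with a 1-deep ghost zone.
int main(void){
  const int N = 4, ghosts = 1;
  const int pencil = N + 2*ghosts;      // stride between j-rows
  const int plane  = pencil*pencil;     // stride between k-planes
  const int volume = plane*pencil;
  const double a = 1.0, b = 1.0, h = 1.0/N, h2inv = 1.0/(h*h);

  double *phi    = calloc(volume,sizeof(double));
  double *rhs    = calloc(volume,sizeof(double));
  double *alpha  = calloc(volume,sizeof(double));
  double *beta_i = calloc(volume,sizeof(double));
  double *beta_j = calloc(volume,sizeof(double));
  double *beta_k = calloc(volume,sizeof(double));
  double *res    = calloc(volume,sizeof(double));

  // made-up smooth data: constant coefficients, phi varying linearly with position
  for(int k=0;k<pencil;k++)for(int j=0;j<pencil;j++)for(int i=0;i<pencil;i++){
    int ijk = i + j*pencil + k*plane;
    alpha[ijk]=1.0; beta_i[ijk]=beta_j[ijk]=beta_k[ijk]=1.0;
    phi[ijk] = 0.01*(i+2*j+3*k); rhs[ijk] = 1.0;
  }

  // residual rhs - A(phi) at every non-ghost point, mirroring the helmholtz expression in residual_and_restriction()
  for(int k=ghosts;k<N+ghosts;k++)for(int j=ghosts;j<N+ghosts;j++)for(int i=ghosts;i<N+ghosts;i++){
    int ijk = i + j*pencil + k*plane;
    double helmholtz = a*alpha[ijk]*phi[ijk]
      -b*h2inv*(
        beta_i[ijk+1     ]*( phi[ijk+1     ]-phi[ijk       ] )
       -beta_i[ijk       ]*( phi[ijk       ]-phi[ijk-1     ] )
       +beta_j[ijk+pencil]*( phi[ijk+pencil]-phi[ijk       ] )
       -beta_j[ijk       ]*( phi[ijk       ]-phi[ijk-pencil] )
       +beta_k[ijk+plane ]*( phi[ijk+plane ]-phi[ijk       ] )
       -beta_k[ijk       ]*( phi[ijk       ]-phi[ijk-plane ] )
      );
    res[ijk] = rhs[ijk]-helmholtz;
  }
  printf("res at first interior point = %f\n", res[ghosts*(1+pencil+plane)]);

  free(phi);free(rhs);free(alpha);free(beta_i);free(beta_j);free(beta_k);free(res);
  return 0;
}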