Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt =================================================================== --- MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(HPCCG) add_subdirectory(PENNANT) +add_subdirectory(lulesh) Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile =================================================================== --- MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile @@ -1,6 +1,6 @@ # MultiSource/DOE-ProxyApps-C++ Makefile: Build all subdirectories automatically LEVEL = ../../.. -PARALLEL_DIRS = HPCCG PENNANT +PARALLEL_DIRS = HPCCG PENNANT lulesh include $(LEVEL)/Makefile.programs Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/CMakeLists.txt =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/CMakeLists.txt @@ -0,0 +1,5 @@ +set(PROG lulesh2.0) +list(APPEND LDFLAGS -g -O3) +list(APPEND CXXFLAGS -g -O3 -I. -Wall -DUSE_MPI=0 -D_OPENMP=0) +set(RUN_OPTIONS -s 10 -b 1) +llvm_multisource() Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/Makefile =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/Makefile @@ -0,0 +1,7 @@ +LEVEL = ../../../.. + +PROG = lulesh2.0 +LDFLAGS = -g -O3 +CXXFLAGS = -g -O3 -I. -Wall -DUSE_MPI=0 -D_OPENMP=0 +RUN_OPTIONS = -s 10 -b 1 +include $(LEVEL)/MultiSource/Makefile.multisrc Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/glibc_compat_rand.h =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/glibc_compat_rand.h @@ -0,0 +1,16 @@ +/*===------------- glibc_compat_rand.h- glibc rand emulation --------------===*\ + * |* + * |* The LLVM Compiler Infrastructure + * |* + * |* This file is distributed under the University of Illinois Open Source + * |* License. See LICENSE.TXT for details. + * |* + * \*===----------------------------------------------------------------------===*/ + +#ifndef GLIBC_COMPAT_RAND_H +#define GLIBC_COMPAT_RAND_H + +int glibc_compat_rand(void); +void glibc_compat_srand(unsigned int seed); + +#endif /* GLIBC_COMPAT_RAND_H */ Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/glibc_compat_rand.cc =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/glibc_compat_rand.cc @@ -0,0 +1,59 @@ +/*===------------ glibc_compat_rand.c - glibc rand emulation --------------===*\ +* +* The LLVM Compiler Infrastructure +* This file is distributed under the University of Illinois Open Source +* License. See LICENSE.TXT for details. +* +\*===----------------------------------------------------------------------===*/ + +#include "glibc_compat_rand.h" + +/** +* This rand implementation is designed to emulate the implementation of +* rand/srand in recent versions of glibc. This is used for programs which +* require this specific rand implementation in order to pass verification +* tests. +* +* For more information, see: http://www.mathstat.dal.ca/~selinger/random/ +**/ + +#define TABLE_SIZE 344 +static unsigned int table[TABLE_SIZE]; +static int next; + +int glibc_compat_rand(void) { + /* Calculate the indices i-3 and i-31 in the circular vector. */ + int i3 = (next < 3) ? 
(TABLE_SIZE + next - 3) : (next - 3); + int i31 = (next < 31) ? (TABLE_SIZE + next - 31) : (next - 31); + + table[next] = table[i3] + table[i31]; + unsigned int r = table[next] >> 1; + + ++next; + if (next >= TABLE_SIZE) + next = 0; + + return r; +} + +void glibc_compat_srand(unsigned int seed) { + if (seed == 0) + seed = 1; + + table[0] = seed; + + for (int i = 1; i < 31; i++) { + int r = (16807ll * table[i - 1]) % 2147483647; + if (r < 0) + r += 2147483647; + + table[i] = r; + } + + for (int i = 31; i < 34; i++) + table[i] = table[i - 31]; + for (int i = 34; i < TABLE_SIZE; i++) + table[i] = table[i - 31] + table[i - 3]; + + next = 0; +} Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh-comm.cc =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh-comm.cc @@ -0,0 +1,1837 @@ +#include "lulesh.h" + +// If no MPI, then this whole file is stubbed out +#if USE_MPI + +#include +#include + +/* Comm Routines */ + +#define ALLOW_UNPACKED_PLANE false +#define ALLOW_UNPACKED_ROW false +#define ALLOW_UNPACKED_COL false + +/* + There are coherence issues for packing and unpacking message + buffers. Ideally, you would like a lot of threads to + cooperate in the assembly/dissassembly of each message. + To do that, each thread should really be operating in a + different coherence zone. + + Let's assume we have three fields, f1 through f3, defined on + a 61x61x61 cube. If we want to send the block boundary + information for each field to each neighbor processor across + each cube face, then we have three cases for the + memory layout/coherence of data on each of the six cube + boundaries: + + (a) Two of the faces will be in contiguous memory blocks + (b) Two of the faces will be comprised of pencils of + contiguous memory. + (c) Two of the faces will have large strides between + every value living on the face. + + How do you pack and unpack this data in buffers to + simultaneous achieve the best memory efficiency and + the most thread independence? + + Do do you pack field f1 through f3 tighly to reduce message + size? Do you align each field on a cache coherence boundary + within the message so that threads can pack and unpack each + field independently? For case (b), do you align each + boundary pencil of each field separately? This increases + the message size, but could improve cache coherence so + each pencil could be processed independently by a separate + thread with no conflicts. + + Also, memory access for case (c) would best be done without + going through the cache (the stride is so large it just causes + a lot of useless cache evictions). Is it worth creating + a special case version of the packing algorithm that uses + non-coherent load/store opcodes? +*/ + +/******************************************/ + + +/* doRecv flag only works with regular block structure */ +void CommRecv(Domain& domain, int msgType, Index_t xferFields, + Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) { + + if (domain.numRanks() == 1) + return ; + + /* post recieve buffers for all incoming messages */ + int myRank ; + Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; + Index_t maxEdgeComm = xferFields * domain.maxEdgeSize() ; + Index_t pmsg = 0 ; /* plane comm msg */ + Index_t emsg = 0 ; /* edge comm msg */ + Index_t cmsg = 0 ; /* corner comm msg */ + MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? 
MPI_FLOAT : MPI_DOUBLE) ; + bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ; + + /* assume communication to 6 neighbors by default */ + rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ; + + if (domain.rowLoc() == 0) { + rowMin = false ; + } + if (domain.rowLoc() == (domain.tp()-1)) { + rowMax = false ; + } + if (domain.colLoc() == 0) { + colMin = false ; + } + if (domain.colLoc() == (domain.tp()-1)) { + colMax = false ; + } + if (domain.planeLoc() == 0) { + planeMin = false ; + } + if (domain.planeLoc() == (domain.tp()-1)) { + planeMax = false ; + } + + for (Index_t i=0; i<26; ++i) { + domain.recvRequest[i] = MPI_REQUEST_NULL ; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ; + + /* post receives */ + + /* receive data from neighboring domain faces */ + if (planeMin && doRecv) { + /* contiguous memory */ + int fromRank = myRank - domain.tp()*domain.tp() ; + int recvCount = dx * dy * xferFields ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], + recvCount, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; + ++pmsg ; + } + if (planeMax) { + /* contiguous memory */ + int fromRank = myRank + domain.tp()*domain.tp() ; + int recvCount = dx * dy * xferFields ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], + recvCount, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; + ++pmsg ; + } + if (rowMin && doRecv) { + /* semi-contiguous memory */ + int fromRank = myRank - domain.tp() ; + int recvCount = dx * dz * xferFields ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], + recvCount, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; + ++pmsg ; + } + if (rowMax) { + /* semi-contiguous memory */ + int fromRank = myRank + domain.tp() ; + int recvCount = dx * dz * xferFields ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], + recvCount, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; + ++pmsg ; + } + if (colMin && doRecv) { + /* scattered memory */ + int fromRank = myRank - 1 ; + int recvCount = dy * dz * xferFields ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], + recvCount, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; + ++pmsg ; + } + if (colMax) { + /* scattered memory */ + int fromRank = myRank + 1 ; + int recvCount = dy * dz * xferFields ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], + recvCount, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; + ++pmsg ; + } + + if (!planeOnly) { + /* receive data from domains connected only by an edge */ + if (rowMin && colMin && doRecv) { + int fromRank = myRank - domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (rowMin && planeMin && doRecv) { + int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (colMin && planeMin && doRecv) { + int fromRank = myRank - domain.tp()*domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (rowMax && colMax) { + int fromRank = myRank + domain.tp() + 1 ; + 
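/* Ranks are laid out plane-major in the tp x tp x tp decomposition
+         (rank = plane*tp*tp + row*tp + col), so this in-plane edge neighbor
+         at (row+1, col+1) is myRank + tp + 1; the face, edge, and corner
+         offsets used throughout this routine follow the same scheme. */
+         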
MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (rowMax && planeMax) { + int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (colMax && planeMax) { + int fromRank = myRank + domain.tp()*domain.tp() + 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (rowMax && colMin) { + int fromRank = myRank + domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (rowMin && planeMax) { + int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (colMin && planeMax) { + int fromRank = myRank + domain.tp()*domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (rowMin && colMax && doRecv) { + int fromRank = myRank - domain.tp() + 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dz * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (rowMax && planeMin && doRecv) { + int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dx * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + if (colMax && planeMin && doRecv) { + int fromRank = myRank - domain.tp()*domain.tp() + 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm], + dy * xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; + ++emsg ; + } + + /* receive data from domains connected only by a corner */ + if (rowMin && colMin && planeMin && doRecv) { + /* corner at domain logical coord (0, 0, 0) */ + int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + if (rowMin && colMin && planeMax) { + /* corner at domain logical coord (0, 0, 1) */ + int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + if (rowMin && colMax && planeMin && doRecv) { + /* corner at domain logical coord (1, 0, 0) */ + int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * 
CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + if (rowMin && colMax && planeMax) { + /* corner at domain logical coord (1, 0, 1) */ + int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + if (rowMax && colMin && planeMin && doRecv) { + /* corner at domain logical coord (0, 1, 0) */ + int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + if (rowMax && colMin && planeMax) { + /* corner at domain logical coord (0, 1, 1) */ + int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + if (rowMax && colMax && planeMin && doRecv) { + /* corner at domain logical coord (1, 1, 0) */ + int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + if (rowMax && colMax && planeMax) { + /* corner at domain logical coord (1, 1, 1) */ + int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ; + MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + + emsg * maxEdgeComm + + cmsg * CACHE_COHERENCE_PAD_REAL], + xferFields, baseType, fromRank, msgType, + MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; + ++cmsg ; + } + } +} + +/******************************************/ + +void CommSend(Domain& domain, int msgType, + Index_t xferFields, Domain_member *fieldData, + Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly) +{ + + if (domain.numRanks() == 1) + return ; + + /* post recieve buffers for all incoming messages */ + int myRank ; + Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; + Index_t maxEdgeComm = xferFields * domain.maxEdgeSize() ; + Index_t pmsg = 0 ; /* plane comm msg */ + Index_t emsg = 0 ; /* edge comm msg */ + Index_t cmsg = 0 ; /* corner comm msg */ + MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? 
MPI_FLOAT : MPI_DOUBLE) ; + MPI_Status status[26] ; + Real_t *destAddr ; + bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ; + /* assume communication to 6 neighbors by default */ + rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ; + if (domain.rowLoc() == 0) { + rowMin = false ; + } + if (domain.rowLoc() == (domain.tp()-1)) { + rowMax = false ; + } + if (domain.colLoc() == 0) { + colMin = false ; + } + if (domain.colLoc() == (domain.tp()-1)) { + colMax = false ; + } + if (domain.planeLoc() == 0) { + planeMin = false ; + } + if (domain.planeLoc() == (domain.tp()-1)) { + planeMax = false ; + } + + for (Index_t i=0; i<26; ++i) { + domain.sendRequest[i] = MPI_REQUEST_NULL ; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ; + + /* post sends */ + + if (planeMin | planeMax) { + /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ + int sendCount = dx * dy ; + + if (planeMin) { + destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ; + for (Index_t fi=0 ; fi +#if USE_MPI +# include +#endif +#if _OPENMP +#include +#endif +#include +#include +#include +#include +#include +#include "lulesh.h" + +///////////////////////////////////////////////////////////////////// +Domain::Domain(Int_t numRanks, Index_t colLoc, + Index_t rowLoc, Index_t planeLoc, + Index_t nx, int tp, int nr, int balance, Int_t cost) + : + m_e_cut(Real_t(1.0e-7)), + m_p_cut(Real_t(1.0e-7)), + m_q_cut(Real_t(1.0e-7)), + m_v_cut(Real_t(1.0e-10)), + m_u_cut(Real_t(1.0e-7)), + m_hgcoef(Real_t(3.0)), + m_ss4o3(Real_t(4.0)/Real_t(3.0)), + m_qstop(Real_t(1.0e+12)), + m_monoq_max_slope(Real_t(1.0)), + m_monoq_limiter_mult(Real_t(2.0)), + m_qlc_monoq(Real_t(0.5)), + m_qqc_monoq(Real_t(2.0)/Real_t(3.0)), + m_qqc(Real_t(2.0)), + m_eosvmax(Real_t(1.0e+9)), + m_eosvmin(Real_t(1.0e-9)), + m_pmin(Real_t(0.)), + m_emin(Real_t(-1.0e+15)), + m_dvovmax(Real_t(0.1)), + m_refdens(Real_t(1.0)) +{ + + Index_t edgeElems = nx ; + Index_t edgeNodes = edgeElems+1 ; + this->cost() = cost; + + m_tp = tp ; + m_numRanks = numRanks ; + + /////////////////////////////// + // Initialize Sedov Mesh + /////////////////////////////// + + // construct a uniform box for this processor + + m_colLoc = colLoc ; + m_rowLoc = rowLoc ; + m_planeLoc = planeLoc ; + + m_sizeX = edgeElems ; + m_sizeY = edgeElems ; + m_sizeZ = edgeElems ; + m_numElem = edgeElems*edgeElems*edgeElems ; + + m_numNode = edgeNodes*edgeNodes*edgeNodes ; + + m_regNumList = new Index_t[numElem()] ; // material indexset + + // Elem-centered + AllocateElemPersistent(numElem()) ; + + // Node-centered + AllocateNodePersistent(numNode()) ; + + SetupCommBuffers(edgeNodes); + + // Basic Field Initialization + for (Index_t i=0; i 1) { + // set up node-centered indexing of elements + Index_t *nodeElemCount = new Index_t[numNode()] ; + + for (Index_t i=0; i numElem()*8)) { + fprintf(stderr, + "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n"); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1); +#else + exit(-1); +#endif + } + } + + delete [] nodeElemCount ; + } + else { + // These arrays are not used if we're not threaded + m_nodeElemStart = NULL; + m_nodeElemCornerList = NULL; + } +} + + +//////////////////////////////////////////////////////////////////////////////// +void +Domain::SetupCommBuffers(Int_t edgeNodes) +{ + // allocate a buffer large enough for nodal ghost data + Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), this->sizeZ()))+1 ; + m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize*maxEdgeSize) ; + m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize) ; + + 
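// CACHE_ALIGN_REAL() (see lulesh.h) rounds a Real_t count up to a multiple
+   // of CACHE_COHERENCE_PAD_REAL (128/sizeof(Real_t)), padding each plane and
+   // edge ghost buffer to a whole number of 128-byte cache lines; e.g. with
+   // 8-byte Real_t a 31x31 node plane (961 values) is padded to 976.
+   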
// assume communication to 6 neighbors by default + m_rowMin = (m_rowLoc == 0) ? 0 : 1; + m_rowMax = (m_rowLoc == m_tp-1) ? 0 : 1; + m_colMin = (m_colLoc == 0) ? 0 : 1; + m_colMax = (m_colLoc == m_tp-1) ? 0 : 1; + m_planeMin = (m_planeLoc == 0) ? 0 : 1; + m_planeMax = (m_planeLoc == m_tp-1) ? 0 : 1; + +#if USE_MPI + // account for face communication + Index_t comBufSize = + (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) * + m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ; + + // account for edge communication + comBufSize += + ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) + + (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) + + (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) + + (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) * + m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ; + + // account for corner communication + // factor of 16 is so each buffer has its own cache line + comBufSize += ((m_rowMin & m_colMin & m_planeMin) + + (m_rowMin & m_colMin & m_planeMax) + + (m_rowMin & m_colMax & m_planeMin) + + (m_rowMin & m_colMax & m_planeMax) + + (m_rowMax & m_colMin & m_planeMin) + + (m_rowMax & m_colMin & m_planeMax) + + (m_rowMax & m_colMax & m_planeMin) + + (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ; + + this->commDataSend = new Real_t[comBufSize] ; + this->commDataRecv = new Real_t[comBufSize] ; + // prevent floating point exceptions + memset(this->commDataSend, 0, comBufSize*sizeof(Real_t)) ; + memset(this->commDataRecv, 0, comBufSize*sizeof(Real_t)) ; +#endif + + // Boundary nodesets + if (m_colLoc == 0) + m_symmX.resize(edgeNodes*edgeNodes); + if (m_rowLoc == 0) + m_symmY.resize(edgeNodes*edgeNodes); + if (m_planeLoc == 0) + m_symmZ.resize(edgeNodes*edgeNodes); +} + + +//////////////////////////////////////////////////////////////////////////////// +void +Domain::CreateRegionIndexSets(Int_t nr, Int_t balance) +{ +#if USE_MPI + Index_t myRank; + MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ; + srand(myRank); +#else + srand(1); + Index_t myRank = 0; +#endif + this->numReg() = nr; + m_regElemSize = new Index_t[numReg()]; + m_regElemlist = new Index_t*[numReg()]; + Index_t nextIndex = 0; + //if we only have one region just fill it + // Fill out the regNumList with material numbers, which are always + // the region index plus one + if(numReg() == 1) { + while (nextIndex < numElem()) { + this->regNumList(nextIndex) = 1; + nextIndex++; + } + regElemSize(0) = 0; + } + //If we have more than one region distribute the elements. + else { + Int_t regionNum; + Int_t regionVar; + Int_t lastReg = -1; + Int_t binSize; + Index_t elements; + Index_t runto = 0; + Int_t costDenominator = 0; + Int_t* regBinEnd = new Int_t[numReg()]; + //Determine the relative weights of all the regions. This is based off the -b flag. Balance is the value passed into b. + for (Index_t i=0 ; i= regBinEnd[i]) + i++; + //rotate the regions based on MPI rank. Rotation is Rank % NumRegions this makes each domain have a different region with + //the highest representation + regionNum = ((i + myRank) % numReg()) + 1; + // make sure we don't pick the same region twice in a row + while(regionNum == lastReg) { + regionVar = rand() % costDenominator; + i = 0; + while(regionVar >= regBinEnd[i]) + i++; + regionNum = ((i + myRank) % numReg()) + 1; + } + //Pick the bin size of the region and determine the number of elements. 
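+      //The bin size drawn below gives a long-tailed distribution of region run
+      //lengths: roughly 77% of bins hold 1-15 consecutive elements, 16% hold
+      //16-31, and progressively rarer bins hold up to 2048 elements, so a few
+      //regions end up much larger than the rest.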
+ binSize = rand() % 1000; + if(binSize < 773) { + elements = rand() % 15 + 1; + } + else if(binSize < 937) { + elements = rand() % 16 + 16; + } + else if(binSize < 970) { + elements = rand() % 32 + 32; + } + else if(binSize < 974) { + elements = rand() % 64 + 64; + } + else if(binSize < 978) { + elements = rand() % 128 + 128; + } + else if(binSize < 981) { + elements = rand() % 256 + 256; + } + else + elements = rand() % 1537 + 512; + runto = elements + nextIndex; + //Store the elements. If we hit the end before we run out of elements then just stop. + while (nextIndex < runto && nextIndex < numElem()) { + this->regNumList(nextIndex) = regionNum; + nextIndex++; + } + lastReg = regionNum; + } + } + // Convert regNumList to region index sets + // First, count size of each region + for (Index_t i=0 ; iregNumList(i)-1; // region index == regnum-1 + regElemSize(r)++; + } + // Second, allocate each region index set + for (Index_t i=0 ; i CACHE_COHERENCE_PAD_REAL) { + printf("corner element comm buffers too small. Fix code.\n") ; +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1) ; +#else + exit(-1); +#endif + } + + dx = testProcs ; + dy = testProcs ; + dz = testProcs ; + + // temporary test + if (dx*dy*dz != numRanks) { + printf("error -- must have as many domains as procs\n") ; +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1) ; +#else + exit(-1); +#endif + } + Int_t remainder = dx*dy*dz % numRanks ; + if (myRank < remainder) { + myDom = myRank*( 1+ (dx*dy*dz / numRanks)) ; + } + else { + myDom = remainder*( 1+ (dx*dy*dz / numRanks)) + + (myRank - remainder)*(dx*dy*dz/numRanks) ; + } + + *col = myDom % dx ; + *row = (myDom / dx) % dy ; + *plane = myDom / (dx*dy) ; + *side = testProcs; + + return; +} + Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh-util.cc =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh-util.cc @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#if USE_MPI +#include +#endif +#include "lulesh.h" + +/* Helper function for converting strings to ints, with error checking */ +int StrToInt(const char *token, int *retVal) +{ + const char *c ; + char *endptr ; + const int decimal_base = 10 ; + + if (token == NULL) + return 0 ; + + c = token ; + *retVal = (int)strtol(c, &endptr, decimal_base) ; + if((endptr != c) && ((*endptr == ' ') || (*endptr == '\0'))) + return 1 ; + else + return 0 ; +} + +static void PrintCommandLineOptions(char *execname, int myRank) +{ + if (myRank == 0) { + + printf("Usage: %s [opts]\n", execname); + printf(" where [opts] is one or more of:\n"); + printf(" -q : quiet mode - suppress all stdout\n"); + printf(" -i : number of cycles to run\n"); + printf(" -s : length of cube mesh along side\n"); + printf(" -r : Number of distinct regions (def: 11)\n"); + printf(" -b : Load balance between regions of a domain (def: 1)\n"); + printf(" -c : Extra cost of more expensive regions (def: 1)\n"); + printf(" -f : Number of files to split viz dump into (def: (np+10)/9)\n"); + printf(" -p : Print out progress\n"); + printf(" -v : Output viz file (requires compiling with -DVIZ_MESH\n"); + printf(" -h : This message\n"); + printf("\n\n"); + } +} + +static void ParseError(const char *message, int myRank) +{ + if (myRank == 0) { + printf("%s\n", message); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, -1); +#else + exit(-1); +#endif + } +} + +void ParseCommandLineOptions(int argc, char *argv[], + int myRank, struct cmdLineOpts *opts) +{ + if(argc > 1) { + int i = 1; + + while(i < 
argc) { + int ok; + /* -i */ + if(strcmp(argv[i], "-i") == 0) { + if (i+1 >= argc) { + ParseError("Missing integer argument to -i", myRank); + } + ok = StrToInt(argv[i+1], &(opts->its)); + if(!ok) { + ParseError("Parse Error on option -i integer value required after argument\n", myRank); + } + i+=2; + } + /* -s */ + else if(strcmp(argv[i], "-s") == 0) { + if (i+1 >= argc) { + ParseError("Missing integer argument to -s\n", myRank); + } + ok = StrToInt(argv[i+1], &(opts->nx)); + if(!ok) { + ParseError("Parse Error on option -s integer value required after argument\n", myRank); + } + i+=2; + } + /* -r */ + else if (strcmp(argv[i], "-r") == 0) { + if (i+1 >= argc) { + ParseError("Missing integer argument to -r\n", myRank); + } + ok = StrToInt(argv[i+1], &(opts->numReg)); + if (!ok) { + ParseError("Parse Error on option -r integer value required after argument\n", myRank); + } + i+=2; + } + /* -f */ + else if (strcmp(argv[i], "-f") == 0) { + if (i+1 >= argc) { + ParseError("Missing integer argument to -f\n", myRank); + } + ok = StrToInt(argv[i+1], &(opts->numFiles)); + if (!ok) { + ParseError("Parse Error on option -f integer value required after argument\n", myRank); + } + i+=2; + } + /* -p */ + else if (strcmp(argv[i], "-p") == 0) { + opts->showProg = 1; + i++; + } + /* -q */ + else if (strcmp(argv[i], "-q") == 0) { + opts->quiet = 1; + i++; + } + else if (strcmp(argv[i], "-b") == 0) { + if (i+1 >= argc) { + ParseError("Missing integer argument to -b\n", myRank); + } + ok = StrToInt(argv[i+1], &(opts->balance)); + if (!ok) { + ParseError("Parse Error on option -b integer value required after argument\n", myRank); + } + i+=2; + } + else if (strcmp(argv[i], "-c") == 0) { + if (i+1 >= argc) { + ParseError("Missing integer argument to -c\n", myRank); + } + ok = StrToInt(argv[i+1], &(opts->cost)); + if (!ok) { + ParseError("Parse Error on option -c integer value required after argument\n", myRank); + } + i+=2; + } + /* -v */ + else if (strcmp(argv[i], "-v") == 0) { +#if VIZ_MESH + opts->viz = 1; +#else + ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank); +#endif + i++; + } + /* -h */ + else if (strcmp(argv[i], "-h") == 0) { + PrintCommandLineOptions(argv[0], myRank); +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, 0); +#else + exit(0); +#endif + } + else { + char msg[80]; + PrintCommandLineOptions(argv[0], myRank); + sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]); + ParseError(msg, myRank); + } + } + } +} + +///////////////////////////////////////////////////////////////////// + +void VerifyAndWriteFinalOutput(Real_t elapsed_time, + Domain& locDom, + Int_t nx, + Int_t numRanks, + Int_t displaytiming) +{ + // GrindTime1 only takes a single domain into account, and is thus a good way to measure + // processor speed indepdendent of MPI parallelism. 
+ // GrindTime2 takes into account speedups from MPI parallelism + Real_t grindTime1 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx); + Real_t grindTime2 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx*numRanks); + + Index_t ElemId = 0; + printf("Run completed: \n"); + printf(" Problem size = %i \n", nx); + printf(" MPI tasks = %i \n", numRanks); + printf(" Iteration count = %i \n", locDom.cycle()); + printf(" Final Origin Energy = %12.6e \n", locDom.e(ElemId)); + + Real_t MaxAbsDiff = Real_t(0.0); + Real_t TotalAbsDiff = Real_t(0.0); + Real_t MaxRelDiff = Real_t(0.0); + + for (Index_t j=0; j 0){ + // Timing information + printf("\nElapsed time = %10.2f (s)\n", elapsed_time); + printf("Grind time (us/z/c) = %10.8g (per dom) (%10.8g overall)\n", grindTime1, grindTime2); + printf("FOM = %10.8g (z/s)\n\n", 1000.0/grindTime2); // zones per second + } + + return ; +} Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh-viz.cc =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh-viz.cc @@ -0,0 +1,432 @@ +#include +#include +#include +#include +#include "lulesh.h" + +#ifdef VIZ_MESH + +#ifdef __cplusplus + extern "C" { +#endif +#include "silo.h" +#if USE_MPI +# include "pmpio.h" +#endif +#ifdef __cplusplus + } +#endif + +// Function prototypes +static void +DumpDomainToVisit(DBfile *db, Domain& domain, int myRank); +static + + +#if USE_MPI +// For some reason, earlier versions of g++ (e.g. 4.2) won't let me +// put the 'static' qualifier on this prototype, even if it's done +// consistently in the prototype and definition +void +DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, + char basename[], int numRanks); + +// Callback prototypes for PMPIO interface (only useful if we're +// running parallel) +static void * +LULESH_PMPIO_Create(const char *fname, + const char *dname, + void *udata); +static void * +LULESH_PMPIO_Open(const char *fname, + const char *dname, + PMPIO_iomode_t ioMode, + void *udata); +static void +LULESH_PMPIO_Close(void *file, void *udata); + +#else +void +DumpMultiblockObjects(DBfile *db, char basename[], int numRanks); +#endif + + +/**********************************************************************/ +void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) +{ + char subdirName[32]; + char basename[32]; + DBfile *db; + + + sprintf(basename, "lulesh_plot_c%d", domain.cycle()); + sprintf(subdirName, "data_%d", myRank); + +#if USE_MPI + + PMPIO_baton_t *bat = PMPIO_Init(numFiles, + PMPIO_WRITE, + MPI_COMM_WORLD, + 10101, + LULESH_PMPIO_Create, + LULESH_PMPIO_Open, + LULESH_PMPIO_Close, + NULL); + + int myiorank = PMPIO_GroupRank(bat, myRank); + + char fileName[64]; + + if (myiorank == 0) + strcpy(fileName, basename); + else + sprintf(fileName, "%s.%03d", basename, myiorank); + + db = (DBfile*)PMPIO_WaitForBaton(bat, fileName, subdirName); + + DumpDomainToVisit(db, domain, myRank); + + // Processor 0 writes out bit of extra data to its file that + // describes how to stitch all the pieces together + if (myRank == 0) { + DumpMultiblockObjects(db, bat, basename, numRanks); + } + + PMPIO_HandOffBaton(bat, db); + + PMPIO_Finish(bat); +#else + + db = (DBfile*)DBCreate(basename, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X); + + if (db) { + DBMkDir(db, subdirName); + DBSetDir(db, subdirName); + DumpDomainToVisit(db, domain, myRank); + DumpMultiblockObjects(db, basename, numRanks); + } + else { + printf("Error writing out viz file - rank %d\n", myRank); + } + +#endif +} + + 
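+// Note on the MPI path above: PMPIO (part of the Silo library) splits the
+// ranks into numFiles groups and passes a baton within each group, so only
+// one rank per group writes to its shared Silo file at a time -- the "poor
+// man's parallel I/O" scheme mentioned in the lulesh.cc header comment.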
+ +/**********************************************************************/ + +static void +DumpDomainToVisit(DBfile *db, Domain& domain, int myRank) +{ + int ok = 0; + + /* Create an option list that will give some hints to VisIt for + * printing out the cycle and time in the annotations */ + DBoptlist *optlist; + + + /* Write out the mesh connectivity in fully unstructured format */ + int shapetype[1] = {DB_ZONETYPE_HEX}; + int shapesize[1] = {8}; + int shapecnt[1] = {domain.numElem()}; + int *conn = new int[domain.numElem()*8] ; + int ci = 0 ; + for (int ei=0; ei < domain.numElem(); ++ei) { + Index_t *elemToNode = domain.nodelist(ei) ; + for (int ni=0; ni < 8; ++ni) { + conn[ci++] = elemToNode[ni] ; + } + } + ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3, + conn, domain.numElem()*8, + 0,0,0, /* Not carrying ghost zones */ + shapetype, shapesize, shapecnt, + 1, NULL); + delete [] conn ; + + /* Write out the mesh coordinates associated with the mesh */ + const char* coordnames[3] = {"X", "Y", "Z"}; + float *coords[3] ; + coords[0] = new float[domain.numNode()] ; + coords[1] = new float[domain.numNode()] ; + coords[2] = new float[domain.numNode()] ; + for (int ni=0; ni < domain.numNode() ; ++ni) { + coords[0][ni] = float(domain.x(ni)) ; + coords[1][ni] = float(domain.y(ni)) ; + coords[2][ni] = float(domain.z(ni)) ; + } + optlist = DBMakeOptlist(2); + ok += DBAddOption(optlist, DBOPT_DTIME, &domain.time()); + ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle()); + ok += DBPutUcdmesh(db, "mesh", 3, (char**)&coordnames[0], (float**)coords, + domain.numNode(), domain.numElem(), "connectivity", + 0, DB_FLOAT, optlist); + ok += DBFreeOptlist(optlist); + delete [] coords[2] ; + delete [] coords[1] ; + delete [] coords[0] ; + + /* Write out the materials */ + int *matnums = new int[domain.numReg()]; + int dims[1] = {domain.numElem()}; // No mixed elements + for(int i=0 ; i + +/* + define one of these three symbols: + + SEDOV_SYNC_POS_VEL_NONE + SEDOV_SYNC_POS_VEL_EARLY + SEDOV_SYNC_POS_VEL_LATE +*/ + +#define SEDOV_SYNC_POS_VEL_EARLY 1 +#endif + +#include +#include + +#include "glibc_compat_rand.h" +#ifndef NO_GLIBC_COMPAT_RAND +#define rand glibc_compat_rand +#define srand glibc_compat_srand +#endif + +//************************************************** +// Allow flexibility for arithmetic representations +//************************************************** + +#define MAX(a, b) ( ((a) > (b)) ? 
(a) : (b)) + + +// Precision specification +typedef float real4 ; +typedef double real8 ; +typedef long double real10 ; // 10 bytes on x86 + +typedef int Index_t ; // array subscript and loop index +typedef real8 Real_t ; // floating point representation +typedef int Int_t ; // integer representation + +enum { VolumeError = -1, QStopError = -2 } ; + +inline real4 SQRT(real4 arg) { return sqrtf(arg) ; } +inline real8 SQRT(real8 arg) { return sqrt(arg) ; } +inline real10 SQRT(real10 arg) { return sqrtl(arg) ; } + +inline real4 CBRT(real4 arg) { return cbrtf(arg) ; } +inline real8 CBRT(real8 arg) { return cbrt(arg) ; } +inline real10 CBRT(real10 arg) { return cbrtl(arg) ; } + +inline real4 FABS(real4 arg) { return fabsf(arg) ; } +inline real8 FABS(real8 arg) { return fabs(arg) ; } +inline real10 FABS(real10 arg) { return fabsl(arg) ; } + + +// Stuff needed for boundary conditions +// 2 BCs on each of 6 hexahedral faces (12 bits) +#define XI_M 0x00007 +#define XI_M_SYMM 0x00001 +#define XI_M_FREE 0x00002 +#define XI_M_COMM 0x00004 + +#define XI_P 0x00038 +#define XI_P_SYMM 0x00008 +#define XI_P_FREE 0x00010 +#define XI_P_COMM 0x00020 + +#define ETA_M 0x001c0 +#define ETA_M_SYMM 0x00040 +#define ETA_M_FREE 0x00080 +#define ETA_M_COMM 0x00100 + +#define ETA_P 0x00e00 +#define ETA_P_SYMM 0x00200 +#define ETA_P_FREE 0x00400 +#define ETA_P_COMM 0x00800 + +#define ZETA_M 0x07000 +#define ZETA_M_SYMM 0x01000 +#define ZETA_M_FREE 0x02000 +#define ZETA_M_COMM 0x04000 + +#define ZETA_P 0x38000 +#define ZETA_P_SYMM 0x08000 +#define ZETA_P_FREE 0x10000 +#define ZETA_P_COMM 0x20000 + +// MPI Message Tags +#define MSG_COMM_SBN 1024 +#define MSG_SYNC_POS_VEL 2048 +#define MSG_MONOQ 3072 + +#define MAX_FIELDS_PER_MPI_COMM 6 + +// Assume 128 byte coherence +// Assume Real_t is an "integral power of 2" bytes wide +#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t)) + +#define CACHE_ALIGN_REAL(n) \ + (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1)) + +////////////////////////////////////////////////////// +// Primary data structure +////////////////////////////////////////////////////// + +/* + * The implementation of the data abstraction used for lulesh + * resides entirely in the Domain class below. You can change + * grouping and interleaving of fields here to maximize data layout + * efficiency for your underlying architecture or compiler. + * + * For example, fields can be implemented as STL objects or + * raw array pointers. 
As another example, individual fields + * m_x, m_y, m_z could be budled into + * + * struct { Real_t x, y, z ; } *m_coord ; + * + * allowing accessor functions such as + * + * "Real_t &x(Index_t idx) { return m_coord[idx].x ; }" + * "Real_t &y(Index_t idx) { return m_coord[idx].y ; }" + * "Real_t &z(Index_t idx) { return m_coord[idx].z ; }" + */ + +class Domain { + + public: + + // Constructor + Domain(Int_t numRanks, Index_t colLoc, + Index_t rowLoc, Index_t planeLoc, + Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost); + + // + // ALLOCATION + // + + void AllocateNodePersistent(Int_t numNode) // Node-centered + { + m_x.resize(numNode); // coordinates + m_y.resize(numNode); + m_z.resize(numNode); + + m_xd.resize(numNode); // velocities + m_yd.resize(numNode); + m_zd.resize(numNode); + + m_xdd.resize(numNode); // accelerations + m_ydd.resize(numNode); + m_zdd.resize(numNode); + + m_fx.resize(numNode); // forces + m_fy.resize(numNode); + m_fz.resize(numNode); + + m_nodalMass.resize(numNode); // mass + } + + void AllocateElemPersistent(Int_t numElem) // Elem-centered + { + m_nodelist.resize(8*numElem); + + // elem connectivities through face + m_lxim.resize(numElem); + m_lxip.resize(numElem); + m_letam.resize(numElem); + m_letap.resize(numElem); + m_lzetam.resize(numElem); + m_lzetap.resize(numElem); + + m_elemBC.resize(numElem); + + m_e.resize(numElem); + m_p.resize(numElem); + + m_q.resize(numElem); + m_ql.resize(numElem); + m_qq.resize(numElem); + + m_v.resize(numElem); + + m_volo.resize(numElem); + m_delv.resize(numElem); + m_vdov.resize(numElem); + + m_arealg.resize(numElem); + + m_ss.resize(numElem); + + m_elemMass.resize(numElem); + } + + void AllocateGradients(Int_t numElem, Int_t allElem) + { + // Position gradients + m_delx_xi.resize(numElem) ; + m_delx_eta.resize(numElem) ; + m_delx_zeta.resize(numElem) ; + + // Velocity gradients + m_delv_xi.resize(allElem) ; + m_delv_eta.resize(allElem); + m_delv_zeta.resize(allElem) ; + } + + void DeallocateGradients() + { + m_delx_zeta.clear() ; + m_delx_eta.clear() ; + m_delx_xi.clear() ; + + m_delv_zeta.clear() ; + m_delv_eta.clear() ; + m_delv_xi.clear() ; + } + + void AllocateStrains(Int_t numElem) + { + m_dxx.resize(numElem) ; + m_dyy.resize(numElem) ; + m_dzz.resize(numElem) ; + } + + void DeallocateStrains() + { + m_dzz.clear() ; + m_dyy.clear() ; + m_dxx.clear() ; + } + + // + // ACCESSORS + // + + // Node-centered + + // Nodal coordinates + Real_t& x(Index_t idx) { return m_x[idx] ; } + Real_t& y(Index_t idx) { return m_y[idx] ; } + Real_t& z(Index_t idx) { return m_z[idx] ; } + + // Nodal velocities + Real_t& xd(Index_t idx) { return m_xd[idx] ; } + Real_t& yd(Index_t idx) { return m_yd[idx] ; } + Real_t& zd(Index_t idx) { return m_zd[idx] ; } + + // Nodal accelerations + Real_t& xdd(Index_t idx) { return m_xdd[idx] ; } + Real_t& ydd(Index_t idx) { return m_ydd[idx] ; } + Real_t& zdd(Index_t idx) { return m_zdd[idx] ; } + + // Nodal forces + Real_t& fx(Index_t idx) { return m_fx[idx] ; } + Real_t& fy(Index_t idx) { return m_fy[idx] ; } + Real_t& fz(Index_t idx) { return m_fz[idx] ; } + + // Nodal mass + Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; } + + // Nodes on symmertry planes + Index_t symmX(Index_t idx) { return m_symmX[idx] ; } + Index_t symmY(Index_t idx) { return m_symmY[idx] ; } + Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; } + bool symmXempty() { return m_symmX.empty(); } + bool symmYempty() { return m_symmY.empty(); } + bool symmZempty() { return m_symmZ.empty(); } + + // + // 
Element-centered + // + Index_t& regElemSize(Index_t idx) { return m_regElemSize[idx] ; } + Index_t& regNumList(Index_t idx) { return m_regNumList[idx] ; } + Index_t* regNumList() { return &m_regNumList[0] ; } + Index_t* regElemlist(Int_t r) { return m_regElemlist[r] ; } + Index_t& regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; } + + Index_t* nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; } + + // elem connectivities through face + Index_t& lxim(Index_t idx) { return m_lxim[idx] ; } + Index_t& lxip(Index_t idx) { return m_lxip[idx] ; } + Index_t& letam(Index_t idx) { return m_letam[idx] ; } + Index_t& letap(Index_t idx) { return m_letap[idx] ; } + Index_t& lzetam(Index_t idx) { return m_lzetam[idx] ; } + Index_t& lzetap(Index_t idx) { return m_lzetap[idx] ; } + + // elem face symm/free-surface flag + Int_t& elemBC(Index_t idx) { return m_elemBC[idx] ; } + + // Principal strains - temporary + Real_t& dxx(Index_t idx) { return m_dxx[idx] ; } + Real_t& dyy(Index_t idx) { return m_dyy[idx] ; } + Real_t& dzz(Index_t idx) { return m_dzz[idx] ; } + + // Velocity gradient - temporary + Real_t& delv_xi(Index_t idx) { return m_delv_xi[idx] ; } + Real_t& delv_eta(Index_t idx) { return m_delv_eta[idx] ; } + Real_t& delv_zeta(Index_t idx) { return m_delv_zeta[idx] ; } + + // Position gradient - temporary + Real_t& delx_xi(Index_t idx) { return m_delx_xi[idx] ; } + Real_t& delx_eta(Index_t idx) { return m_delx_eta[idx] ; } + Real_t& delx_zeta(Index_t idx) { return m_delx_zeta[idx] ; } + + // Energy + Real_t& e(Index_t idx) { return m_e[idx] ; } + + // Pressure + Real_t& p(Index_t idx) { return m_p[idx] ; } + + // Artificial viscosity + Real_t& q(Index_t idx) { return m_q[idx] ; } + + // Linear term for q + Real_t& ql(Index_t idx) { return m_ql[idx] ; } + // Quadratic term for q + Real_t& qq(Index_t idx) { return m_qq[idx] ; } + + // Relative volume + Real_t& v(Index_t idx) { return m_v[idx] ; } + Real_t& delv(Index_t idx) { return m_delv[idx] ; } + + // Reference volume + Real_t& volo(Index_t idx) { return m_volo[idx] ; } + + // volume derivative over volume + Real_t& vdov(Index_t idx) { return m_vdov[idx] ; } + + // Element characteristic length + Real_t& arealg(Index_t idx) { return m_arealg[idx] ; } + + // Sound speed + Real_t& ss(Index_t idx) { return m_ss[idx] ; } + + // Element mass + Real_t& elemMass(Index_t idx) { return m_elemMass[idx] ; } + + Index_t nodeElemCount(Index_t idx) + { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; } + + Index_t *nodeElemCornerList(Index_t idx) + { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; } + + // Parameters + + // Cutoffs + Real_t u_cut() const { return m_u_cut ; } + Real_t e_cut() const { return m_e_cut ; } + Real_t p_cut() const { return m_p_cut ; } + Real_t q_cut() const { return m_q_cut ; } + Real_t v_cut() const { return m_v_cut ; } + + // Other constants (usually are settable via input file in real codes) + Real_t hgcoef() const { return m_hgcoef ; } + Real_t qstop() const { return m_qstop ; } + Real_t monoq_max_slope() const { return m_monoq_max_slope ; } + Real_t monoq_limiter_mult() const { return m_monoq_limiter_mult ; } + Real_t ss4o3() const { return m_ss4o3 ; } + Real_t qlc_monoq() const { return m_qlc_monoq ; } + Real_t qqc_monoq() const { return m_qqc_monoq ; } + Real_t qqc() const { return m_qqc ; } + + Real_t eosvmax() const { return m_eosvmax ; } + Real_t eosvmin() const { return m_eosvmin ; } + Real_t pmin() const { return m_pmin ; } + Real_t emin() const { return m_emin ; } + Real_t 
dvovmax() const { return m_dvovmax ; } + Real_t refdens() const { return m_refdens ; } + + // Timestep controls, etc... + Real_t& time() { return m_time ; } + Real_t& deltatime() { return m_deltatime ; } + Real_t& deltatimemultlb() { return m_deltatimemultlb ; } + Real_t& deltatimemultub() { return m_deltatimemultub ; } + Real_t& stoptime() { return m_stoptime ; } + Real_t& dtcourant() { return m_dtcourant ; } + Real_t& dthydro() { return m_dthydro ; } + Real_t& dtmax() { return m_dtmax ; } + Real_t& dtfixed() { return m_dtfixed ; } + + Int_t& cycle() { return m_cycle ; } + Index_t& numRanks() { return m_numRanks ; } + + Index_t& colLoc() { return m_colLoc ; } + Index_t& rowLoc() { return m_rowLoc ; } + Index_t& planeLoc() { return m_planeLoc ; } + Index_t& tp() { return m_tp ; } + + Index_t& sizeX() { return m_sizeX ; } + Index_t& sizeY() { return m_sizeY ; } + Index_t& sizeZ() { return m_sizeZ ; } + Index_t& numReg() { return m_numReg ; } + Int_t& cost() { return m_cost ; } + Index_t& numElem() { return m_numElem ; } + Index_t& numNode() { return m_numNode ; } + + Index_t& maxPlaneSize() { return m_maxPlaneSize ; } + Index_t& maxEdgeSize() { return m_maxEdgeSize ; } + + // + // MPI-Related additional data + // + +#if USE_MPI + // Communication Work space + Real_t *commDataSend ; + Real_t *commDataRecv ; + + // Maximum number of block neighbors + MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners + MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners +#endif + + private: + + void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems); + void SetupThreadSupportStructures(); + void CreateRegionIndexSets(Int_t nreg, Int_t balance); + void SetupCommBuffers(Int_t edgeNodes); + void SetupSymmetryPlanes(Int_t edgeNodes); + void SetupElementConnectivities(Int_t edgeElems); + void SetupBoundaryConditions(Int_t edgeElems); + + // + // IMPLEMENTATION + // + + /* Node-centered */ + std::vector m_x ; /* coordinates */ + std::vector m_y ; + std::vector m_z ; + + std::vector m_xd ; /* velocities */ + std::vector m_yd ; + std::vector m_zd ; + + std::vector m_xdd ; /* accelerations */ + std::vector m_ydd ; + std::vector m_zdd ; + + std::vector m_fx ; /* forces */ + std::vector m_fy ; + std::vector m_fz ; + + std::vector m_nodalMass ; /* mass */ + + std::vector m_symmX ; /* symmetry plane nodesets */ + std::vector m_symmY ; + std::vector m_symmZ ; + + // Element-centered + + // Region information + Int_t m_numReg ; + Int_t m_cost; //imbalance cost + Index_t *m_regElemSize ; // Size of region sets + Index_t *m_regNumList ; // Region number per domain element + Index_t **m_regElemlist ; // region indexset + + std::vector m_nodelist ; /* elemToNode connectivity */ + + std::vector m_lxim ; /* element connectivity across each face */ + std::vector m_lxip ; + std::vector m_letam ; + std::vector m_letap ; + std::vector m_lzetam ; + std::vector m_lzetap ; + + std::vector m_elemBC ; /* symmetry/free-surface flags for each elem face */ + + std::vector m_dxx ; /* principal strains -- temporary */ + std::vector m_dyy ; + std::vector m_dzz ; + + std::vector m_delv_xi ; /* velocity gradient -- temporary */ + std::vector m_delv_eta ; + std::vector m_delv_zeta ; + + std::vector m_delx_xi ; /* coordinate gradient -- temporary */ + std::vector m_delx_eta ; + std::vector m_delx_zeta ; + + std::vector m_e ; /* energy */ + + std::vector m_p ; /* pressure */ + std::vector m_q ; /* q */ + std::vector m_ql ; /* linear term for q */ + std::vector m_qq ; /* quadratic term for q */ + + std::vector m_v ; /* 
relative volume */ + std::vector m_volo ; /* reference volume */ + std::vector m_vnew ; /* new relative volume -- temporary */ + std::vector m_delv ; /* m_vnew - m_v */ + std::vector m_vdov ; /* volume derivative over volume */ + + std::vector m_arealg ; /* characteristic length of an element */ + + std::vector m_ss ; /* "sound speed" */ + + std::vector m_elemMass ; /* mass */ + + // Cutoffs (treat as constants) + const Real_t m_e_cut ; // energy tolerance + const Real_t m_p_cut ; // pressure tolerance + const Real_t m_q_cut ; // q tolerance + const Real_t m_v_cut ; // relative volume tolerance + const Real_t m_u_cut ; // velocity tolerance + + // Other constants (usually setable, but hardcoded in this proxy app) + + const Real_t m_hgcoef ; // hourglass control + const Real_t m_ss4o3 ; + const Real_t m_qstop ; // excessive q indicator + const Real_t m_monoq_max_slope ; + const Real_t m_monoq_limiter_mult ; + const Real_t m_qlc_monoq ; // linear term coef for q + const Real_t m_qqc_monoq ; // quadratic term coef for q + const Real_t m_qqc ; + const Real_t m_eosvmax ; + const Real_t m_eosvmin ; + const Real_t m_pmin ; // pressure floor + const Real_t m_emin ; // energy floor + const Real_t m_dvovmax ; // maximum allowable volume change + const Real_t m_refdens ; // reference density + + // Variables to keep track of timestep, simulation time, and cycle + Real_t m_dtcourant ; // courant constraint + Real_t m_dthydro ; // volume change constraint + Int_t m_cycle ; // iteration count for simulation + Real_t m_dtfixed ; // fixed time increment + Real_t m_time ; // current time + Real_t m_deltatime ; // variable time increment + Real_t m_deltatimemultlb ; + Real_t m_deltatimemultub ; + Real_t m_dtmax ; // maximum allowable time increment + Real_t m_stoptime ; // end time for simulation + + + Int_t m_numRanks ; + + Index_t m_colLoc ; + Index_t m_rowLoc ; + Index_t m_planeLoc ; + Index_t m_tp ; + + Index_t m_sizeX ; + Index_t m_sizeY ; + Index_t m_sizeZ ; + Index_t m_numElem ; + Index_t m_numNode ; + + Index_t m_maxPlaneSize ; + Index_t m_maxEdgeSize ; + + // OMP hack + Index_t *m_nodeElemStart ; + Index_t *m_nodeElemCornerList ; + + // Used in setup + Index_t m_rowMin, m_rowMax; + Index_t m_colMin, m_colMax; + Index_t m_planeMin, m_planeMax ; + +} ; + +typedef Real_t &(Domain::* Domain_member )(Index_t) ; + +struct cmdLineOpts { + Int_t its; // -i + Int_t nx; // -s + Int_t numReg; // -r + Int_t numFiles; // -f + Int_t showProg; // -p + Int_t quiet; // -q + Int_t viz; // -v + Int_t cost; // -c + Int_t balance; // -b + Int_t displaytiming; +}; + + + +// Function Prototypes + +// lulesh-par +Real_t CalcElemVolume( const Real_t x[8], + const Real_t y[8], + const Real_t z[8]); + +// lulesh-util +void ParseCommandLineOptions(int argc, char *argv[], + Int_t myRank, struct cmdLineOpts *opts); +void VerifyAndWriteFinalOutput(Real_t elapsed_time, + Domain& locDom, + Int_t nx, + Int_t numRanks, + Int_t displaytiming); + +// lulesh-viz +void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks); + +// lulesh-comm +void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields, + Index_t dx, Index_t dy, Index_t dz, + bool doRecv, bool planeOnly); +void CommSend(Domain& domain, Int_t msgType, + Index_t xferFields, Domain_member *fieldData, + Index_t dx, Index_t dy, Index_t dz, + bool doSend, bool planeOnly); +void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData); +void CommSyncPosVel(Domain& domain); +void CommMonoQ(Domain& domain); + +// lulesh-init +void InitMeshDecomp(Int_t 
numRanks, Int_t myRank, + Int_t *col, Int_t *row, Int_t *plane, Int_t *side); Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh.cc =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh.cc @@ -0,0 +1,2814 @@ +/* + This is a Version 2.0 MPI + OpenMP implementation of LULESH + + Copyright (c) 2010-2013. + Lawrence Livermore National Security, LLC. +Produced at the Lawrence Livermore National Laboratory. + LLNL-CODE-461231 + All rights reserved. + +This file is part of LULESH, Version 2.0. +Please also read this link -- http://www.opensource.org/licenses/index.php + +////////////// +DIFFERENCES BETWEEN THIS VERSION (2.x) AND EARLIER VERSIONS: +* Addition of regions to make work more representative of multi-material codes +* Default size of each domain is 30^3 (27000 elem) instead of 45^3. This is + more representative of our actual working set sizes +* Single source distribution supports pure serial, pure OpenMP, MPI-only, + and MPI+OpenMP +* Addition of ability to visualize the mesh using VisIt + https://wci.llnl.gov/codes/visit/download.html +* Various command line options (see ./lulesh2.0 -h) + -q : quiet mode - suppress stdout + -i : number of cycles to run + -s : length of cube mesh along side + -r : Number of distinct regions (def: 11) + -b : Load balance between regions of a domain (def: 1) + -c : Extra cost of more expensive regions (def: 1) + -f : Number of file parts for viz output (def: np/9) + -p : Print out progress + -v : Output viz file (requires compiling with -DVIZ_MESH + -h : This message + + printf("Usage: %s [opts]\n", execname); + printf(" where [opts] is one or more of:\n"); + printf(" -q : quiet mode - suppress all stdout\n"); + printf(" -i : number of cycles to run\n"); + printf(" -s : length of cube mesh along side\n"); + printf(" -r : Number of distinct regions (def: 11)\n"); + printf(" -b : Load balance between regions of a domain (def: 1)\n"); + printf(" -c : Extra cost of more expensive regions (def: 1)\n"); + printf(" -f : Number of files to split viz dump into (def: (np+10)/9)\n"); + printf(" -p : Print out progress\n"); + printf(" -v : Output viz file (requires compiling with -DVIZ_MESH\n"); + printf(" -h : This message\n"); + printf("\n\n"); + +*Notable changes in LULESH 2.0 + +* Split functionality into different files +lulesh.cc - where most (all?) of the timed functionality lies +lulesh-comm.cc - MPI functionality +lulesh-init.cc - Setup code +lulesh-viz.cc - Support for visualization option +lulesh-util.cc - Non-timed functions +* +* The concept of "regions" was added, although every region is the same ideal +* gas material, and the same sedov blast wave problem is still the only +* problem its hardcoded to solve. +* Regions allow two things important to making this proxy app more representative: +* Four of the LULESH routines are now performed on a region-by-region basis, +* making the memory access patterns non-unit stride +* Artificial load imbalances can be easily introduced that could impact +* parallelization strategies. +* The load balance flag changes region assignment. Region number is raised to +* the power entered for assignment probability. Most likely regions changes +* with MPI process id. +* The cost flag raises the cost of ~45% of the regions to evaluate EOS by the +* entered multiple. The cost of 5% is 10x the entered multiple. 
+* MPI and OpenMP were added, and coalesced into a single version of the source +* that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP +* Added support to write plot files using "poor mans parallel I/O" when linked +* with the silo library, which in turn can be read by VisIt. +* Enabled variable timestep calculation by default (courant condition), which +* results in an additional reduction. +* Default domain (mesh) size reduced from 45^3 to 30^3 +* Command line options to allow numerous test cases without needing to recompile +* Performance optimizations and code cleanup beyond LULESH 1.0 +* Added a "Figure of Merit" calculation (elements solved per microsecond) and +* output in support of using LULESH 2.0 for the 2017 CORAL procurement +* +* Possible Differences in Final Release (other changes possible) +* +* High Level mesh structure to allow data structure transformations +* Different default parameters +* Minor code performance changes and cleanup + +TODO in future versions +* Add reader for (truly) unstructured meshes, probably serial only +* CMake based build system + +////////////// + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the disclaimer below. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the disclaimer (as noted below) + in the documentation and/or other materials provided with the + distribution. + + * Neither the name of the LLNS/LLNL nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Additional BSD Notice + +1. This notice is required to be provided under our contract with the U.S. + Department of Energy (DOE). This work was produced at Lawrence Livermore + National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. + +2. Neither the United States Government nor Lawrence Livermore National + Security, LLC nor any of their employees, makes any warranty, express + or implied, or assumes any liability or responsibility for the accuracy, + completeness, or usefulness of any information, apparatus, product, or + process disclosed, or represents that its use would not infringe + privately-owned rights. + +3. 
Also, reference herein to any specific commercial products, process, or
+   services by trade name, trademark, manufacturer or otherwise does not
+   necessarily constitute or imply its endorsement, recommendation, or
+   favoring by the United States Government or Lawrence Livermore National
+   Security, LLC. The views and opinions of authors expressed herein do not
+   necessarily state or reflect those of the United States Government or
+   Lawrence Livermore National Security, LLC, and shall not be used for
+   advertising or product endorsement purposes.
+
+*/
+
+#include <climits>
+#include <vector>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <time.h>
+#include <sys/time.h>
+#include <iostream>
+#include <unistd.h>
+
+#if _OPENMP
+# include <omp.h>
+#endif
+
+#include "lulesh.h"
+
+
+/*********************************/
+/* Data structure implementation */
+/*********************************/
+
+/* might want to add access methods so that memory can be */
+/* better managed, as in luleshFT */
+
+template <typename T>
+T *Allocate(size_t size)
+{
+   return static_cast<T *>(malloc(sizeof(T)*size)) ;
+}
+
+template <typename T>
+void Release(T **ptr)
+{
+   if (*ptr != NULL) {
+      free(*ptr) ;
+      *ptr = NULL ;
+   }
+}
+
+
+
+/******************************************/
+
+/* Work Routines */
+
+static inline
+void TimeIncrement(Domain& domain)
+{
+   Real_t targetdt = domain.stoptime() - domain.time() ;
+
+   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
+      Real_t ratio ;
+      Real_t olddt = domain.deltatime() ;
+
+      /* This will require a reduction in parallel */
+      Real_t gnewdt = Real_t(1.0e+20) ;
+      Real_t newdt ;
+      if (domain.dtcourant() < gnewdt) {
+         gnewdt = domain.dtcourant() / Real_t(2.0) ;
+      }
+      if (domain.dthydro() < gnewdt) {
+         gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
+      }
+
+#if USE_MPI
+      MPI_Allreduce(&gnewdt, &newdt, 1,
+                    ((sizeof(Real_t) == 4) ?
MPI_FLOAT : MPI_DOUBLE), + MPI_MIN, MPI_COMM_WORLD) ; +#else + newdt = gnewdt; +#endif + + ratio = newdt / olddt ; + if (ratio >= Real_t(1.0)) { + if (ratio < domain.deltatimemultlb()) { + newdt = olddt ; + } + else if (ratio > domain.deltatimemultub()) { + newdt = olddt*domain.deltatimemultub() ; + } + } + + if (newdt > domain.dtmax()) { + newdt = domain.dtmax() ; + } + domain.deltatime() = newdt ; + } + + /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */ + if ((targetdt > domain.deltatime()) && + (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) { + targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ; + } + + if (targetdt < domain.deltatime()) { + domain.deltatime() = targetdt ; + } + + domain.time() += domain.deltatime() ; + + ++domain.cycle() ; +} + +/******************************************/ + +static inline +void CollectDomainNodesToElemNodes(Domain &domain, + const Index_t* elemToNode, + Real_t elemX[8], + Real_t elemY[8], + Real_t elemZ[8]) +{ + Index_t nd0i = elemToNode[0] ; + Index_t nd1i = elemToNode[1] ; + Index_t nd2i = elemToNode[2] ; + Index_t nd3i = elemToNode[3] ; + Index_t nd4i = elemToNode[4] ; + Index_t nd5i = elemToNode[5] ; + Index_t nd6i = elemToNode[6] ; + Index_t nd7i = elemToNode[7] ; + + elemX[0] = domain.x(nd0i); + elemX[1] = domain.x(nd1i); + elemX[2] = domain.x(nd2i); + elemX[3] = domain.x(nd3i); + elemX[4] = domain.x(nd4i); + elemX[5] = domain.x(nd5i); + elemX[6] = domain.x(nd6i); + elemX[7] = domain.x(nd7i); + + elemY[0] = domain.y(nd0i); + elemY[1] = domain.y(nd1i); + elemY[2] = domain.y(nd2i); + elemY[3] = domain.y(nd3i); + elemY[4] = domain.y(nd4i); + elemY[5] = domain.y(nd5i); + elemY[6] = domain.y(nd6i); + elemY[7] = domain.y(nd7i); + + elemZ[0] = domain.z(nd0i); + elemZ[1] = domain.z(nd1i); + elemZ[2] = domain.z(nd2i); + elemZ[3] = domain.z(nd3i); + elemZ[4] = domain.z(nd4i); + elemZ[5] = domain.z(nd5i); + elemZ[6] = domain.z(nd6i); + elemZ[7] = domain.z(nd7i); + +} + +/******************************************/ + +static inline +void InitStressTermsForElems(Domain &domain, + Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, + Index_t numElem) +{ + // + // pull in the stresses appropriate to the hydro integration + // + +#pragma omp parallel for firstprivate(numElem) + for (Index_t i = 0 ; i < numElem ; ++i){ + sigxx[i] = sigyy[i] = sigzz[i] = - domain.p(i) - domain.q(i) ; + } +} + +/******************************************/ + +static inline +void CalcElemShapeFunctionDerivatives( Real_t const x[], + Real_t const y[], + Real_t const z[], + Real_t b[][8], + Real_t* const volume ) +{ + const Real_t x0 = x[0] ; const Real_t x1 = x[1] ; + const Real_t x2 = x[2] ; const Real_t x3 = x[3] ; + const Real_t x4 = x[4] ; const Real_t x5 = x[5] ; + const Real_t x6 = x[6] ; const Real_t x7 = x[7] ; + + const Real_t y0 = y[0] ; const Real_t y1 = y[1] ; + const Real_t y2 = y[2] ; const Real_t y3 = y[3] ; + const Real_t y4 = y[4] ; const Real_t y5 = y[5] ; + const Real_t y6 = y[6] ; const Real_t y7 = y[7] ; + + const Real_t z0 = z[0] ; const Real_t z1 = z[1] ; + const Real_t z2 = z[2] ; const Real_t z3 = z[3] ; + const Real_t z4 = z[4] ; const Real_t z5 = z[5] ; + const Real_t z6 = z[6] ; const Real_t z7 = z[7] ; + + Real_t fjxxi, fjxet, fjxze; + Real_t fjyxi, fjyet, fjyze; + Real_t fjzxi, fjzet, fjzze; + Real_t cjxxi, cjxet, cjxze; + Real_t cjyxi, cjyet, cjyze; + Real_t cjzxi, cjzet, cjzze; + + fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) ); + fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) ); + fjxze = 
Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) ); + + fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) ); + fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) ); + fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) ); + + fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) ); + fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) ); + fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) ); + + /* compute cofactors */ + cjxxi = (fjyet * fjzze) - (fjzet * fjyze); + cjxet = - (fjyxi * fjzze) + (fjzxi * fjyze); + cjxze = (fjyxi * fjzet) - (fjzxi * fjyet); + + cjyxi = - (fjxet * fjzze) + (fjzet * fjxze); + cjyet = (fjxxi * fjzze) - (fjzxi * fjxze); + cjyze = - (fjxxi * fjzet) + (fjzxi * fjxet); + + cjzxi = (fjxet * fjyze) - (fjyet * fjxze); + cjzet = - (fjxxi * fjyze) + (fjyxi * fjxze); + cjzze = (fjxxi * fjyet) - (fjyxi * fjxet); + + /* calculate partials : + this need only be done for l = 0,1,2,3 since , by symmetry , + (6,7,4,5) = - (0,1,2,3) . + */ + b[0][0] = - cjxxi - cjxet - cjxze; + b[0][1] = cjxxi - cjxet - cjxze; + b[0][2] = cjxxi + cjxet - cjxze; + b[0][3] = - cjxxi + cjxet - cjxze; + b[0][4] = -b[0][2]; + b[0][5] = -b[0][3]; + b[0][6] = -b[0][0]; + b[0][7] = -b[0][1]; + + b[1][0] = - cjyxi - cjyet - cjyze; + b[1][1] = cjyxi - cjyet - cjyze; + b[1][2] = cjyxi + cjyet - cjyze; + b[1][3] = - cjyxi + cjyet - cjyze; + b[1][4] = -b[1][2]; + b[1][5] = -b[1][3]; + b[1][6] = -b[1][0]; + b[1][7] = -b[1][1]; + + b[2][0] = - cjzxi - cjzet - cjzze; + b[2][1] = cjzxi - cjzet - cjzze; + b[2][2] = cjzxi + cjzet - cjzze; + b[2][3] = - cjzxi + cjzet - cjzze; + b[2][4] = -b[2][2]; + b[2][5] = -b[2][3]; + b[2][6] = -b[2][0]; + b[2][7] = -b[2][1]; + + /* calculate jacobian determinant (volume) */ + *volume = Real_t(8.) 
* ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet); +} + +/******************************************/ + +static inline +void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0, + Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1, + Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2, + Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3, + const Real_t x0, const Real_t y0, const Real_t z0, + const Real_t x1, const Real_t y1, const Real_t z1, + const Real_t x2, const Real_t y2, const Real_t z2, + const Real_t x3, const Real_t y3, const Real_t z3) +{ + Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0); + Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0); + Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0); + Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0); + Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0); + Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0); + Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1); + Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1); + Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1); + + *normalX0 += areaX; + *normalX1 += areaX; + *normalX2 += areaX; + *normalX3 += areaX; + + *normalY0 += areaY; + *normalY1 += areaY; + *normalY2 += areaY; + *normalY3 += areaY; + + *normalZ0 += areaZ; + *normalZ1 += areaZ; + *normalZ2 += areaZ; + *normalZ3 += areaZ; +} + +/******************************************/ + +static inline +void CalcElemNodeNormals(Real_t pfx[8], + Real_t pfy[8], + Real_t pfz[8], + const Real_t x[8], + const Real_t y[8], + const Real_t z[8]) +{ + for (Index_t i = 0 ; i < 8 ; ++i) { + pfx[i] = Real_t(0.0); + pfy[i] = Real_t(0.0); + pfz[i] = Real_t(0.0); + } + /* evaluate face one: nodes 0, 1, 2, 3 */ + SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0], + &pfx[1], &pfy[1], &pfz[1], + &pfx[2], &pfy[2], &pfz[2], + &pfx[3], &pfy[3], &pfz[3], + x[0], y[0], z[0], x[1], y[1], z[1], + x[2], y[2], z[2], x[3], y[3], z[3]); + /* evaluate face two: nodes 0, 4, 5, 1 */ + SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0], + &pfx[4], &pfy[4], &pfz[4], + &pfx[5], &pfy[5], &pfz[5], + &pfx[1], &pfy[1], &pfz[1], + x[0], y[0], z[0], x[4], y[4], z[4], + x[5], y[5], z[5], x[1], y[1], z[1]); + /* evaluate face three: nodes 1, 5, 6, 2 */ + SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1], + &pfx[5], &pfy[5], &pfz[5], + &pfx[6], &pfy[6], &pfz[6], + &pfx[2], &pfy[2], &pfz[2], + x[1], y[1], z[1], x[5], y[5], z[5], + x[6], y[6], z[6], x[2], y[2], z[2]); + /* evaluate face four: nodes 2, 6, 7, 3 */ + SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2], + &pfx[6], &pfy[6], &pfz[6], + &pfx[7], &pfy[7], &pfz[7], + &pfx[3], &pfy[3], &pfz[3], + x[2], y[2], z[2], x[6], y[6], z[6], + x[7], y[7], z[7], x[3], y[3], z[3]); + /* evaluate face five: nodes 3, 7, 4, 0 */ + SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3], + &pfx[7], &pfy[7], &pfz[7], + &pfx[4], &pfy[4], &pfz[4], + &pfx[0], &pfy[0], &pfz[0], + x[3], y[3], z[3], x[7], y[7], z[7], + x[4], y[4], z[4], x[0], y[0], z[0]); + /* evaluate face six: nodes 4, 7, 6, 5 */ + SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4], + &pfx[7], &pfy[7], &pfz[7], + &pfx[6], &pfy[6], &pfz[6], + &pfx[5], &pfy[5], &pfz[5], + x[4], y[4], z[4], x[7], y[7], z[7], + x[6], y[6], z[6], x[5], y[5], z[5]); +} + +/******************************************/ + +static inline +void SumElemStressesToNodeForces( const Real_t B[][8], + const Real_t stress_xx, + const Real_t stress_yy, + const Real_t stress_zz, + Real_t fx[], Real_t fy[], Real_t fz[] ) +{ + for(Index_t i = 0; i < 8; i++) { + 
fx[i] = -( stress_xx * B[0][i] ); + fy[i] = -( stress_yy * B[1][i] ); + fz[i] = -( stress_zz * B[2][i] ); + } +} + +/******************************************/ + +static inline +void IntegrateStressForElems( Domain &domain, + Real_t *sigxx, Real_t *sigyy, Real_t *sigzz, + Real_t *determ, Index_t numElem, Index_t numNode) +{ +#if _OPENMP + Index_t numthreads = omp_get_max_threads(); +#else + Index_t numthreads = 1; +#endif + + Index_t numElem8 = numElem * 8 ; + Real_t *fx_elem; + Real_t *fy_elem; + Real_t *fz_elem; + Real_t fx_local[8] ; + Real_t fy_local[8] ; + Real_t fz_local[8] ; + + + if (numthreads > 1) { + fx_elem = Allocate(numElem8) ; + fy_elem = Allocate(numElem8) ; + fz_elem = Allocate(numElem8) ; + } + // loop over all elements + +#pragma omp parallel for firstprivate(numElem) + for( Index_t k=0 ; k 1) { + // Eliminate thread writing conflicts at the nodes by giving + // each element its own copy to write to + SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k], + &fx_elem[k*8], + &fy_elem[k*8], + &fz_elem[k*8] ) ; + } + else { + SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k], + fx_local, fy_local, fz_local ) ; + + // copy nodal force contributions to global force arrray. + for( Index_t lnode=0 ; lnode<8 ; ++lnode ) { + Index_t gnode = elemToNode[lnode]; + domain.fx(gnode) += fx_local[lnode]; + domain.fy(gnode) += fy_local[lnode]; + domain.fz(gnode) += fz_local[lnode]; + } + } + } + + if (numthreads > 1) { + // If threaded, then we need to copy the data out of the temporary + // arrays used above into the final forces field +#pragma omp parallel for firstprivate(numNode) + for( Index_t gnode=0 ; gnode 1) { + fx_elem = Allocate(numElem8) ; + fy_elem = Allocate(numElem8) ; + fz_elem = Allocate(numElem8) ; + } + + Real_t gamma[4][8]; + + gamma[0][0] = Real_t( 1.); + gamma[0][1] = Real_t( 1.); + gamma[0][2] = Real_t(-1.); + gamma[0][3] = Real_t(-1.); + gamma[0][4] = Real_t(-1.); + gamma[0][5] = Real_t(-1.); + gamma[0][6] = Real_t( 1.); + gamma[0][7] = Real_t( 1.); + gamma[1][0] = Real_t( 1.); + gamma[1][1] = Real_t(-1.); + gamma[1][2] = Real_t(-1.); + gamma[1][3] = Real_t( 1.); + gamma[1][4] = Real_t(-1.); + gamma[1][5] = Real_t( 1.); + gamma[1][6] = Real_t( 1.); + gamma[1][7] = Real_t(-1.); + gamma[2][0] = Real_t( 1.); + gamma[2][1] = Real_t(-1.); + gamma[2][2] = Real_t( 1.); + gamma[2][3] = Real_t(-1.); + gamma[2][4] = Real_t( 1.); + gamma[2][5] = Real_t(-1.); + gamma[2][6] = Real_t( 1.); + gamma[2][7] = Real_t(-1.); + gamma[3][0] = Real_t(-1.); + gamma[3][1] = Real_t( 1.); + gamma[3][2] = Real_t(-1.); + gamma[3][3] = Real_t( 1.); + gamma[3][4] = Real_t( 1.); + gamma[3][5] = Real_t(-1.); + gamma[3][6] = Real_t( 1.); + gamma[3][7] = Real_t(-1.); + +/*************************************************/ +/* compute the hourglass modes */ + + +#pragma omp parallel for firstprivate(numElem, hourg) + for(Index_t i2=0;i2 1) { + fx_local = &fx_elem[i3] ; + fx_local[0] = hgfx[0]; + fx_local[1] = hgfx[1]; + fx_local[2] = hgfx[2]; + fx_local[3] = hgfx[3]; + fx_local[4] = hgfx[4]; + fx_local[5] = hgfx[5]; + fx_local[6] = hgfx[6]; + fx_local[7] = hgfx[7]; + + fy_local = &fy_elem[i3] ; + fy_local[0] = hgfy[0]; + fy_local[1] = hgfy[1]; + fy_local[2] = hgfy[2]; + fy_local[3] = hgfy[3]; + fy_local[4] = hgfy[4]; + fy_local[5] = hgfy[5]; + fy_local[6] = hgfy[6]; + fy_local[7] = hgfy[7]; + + fz_local = &fz_elem[i3] ; + fz_local[0] = hgfz[0]; + fz_local[1] = hgfz[1]; + fz_local[2] = hgfz[2]; + fz_local[3] = hgfz[3]; + fz_local[4] = hgfz[4]; + fz_local[5] = hgfz[5]; + fz_local[6] = 
hgfz[6]; + fz_local[7] = hgfz[7]; + } + else { + domain.fx(n0si2) += hgfx[0]; + domain.fy(n0si2) += hgfy[0]; + domain.fz(n0si2) += hgfz[0]; + + domain.fx(n1si2) += hgfx[1]; + domain.fy(n1si2) += hgfy[1]; + domain.fz(n1si2) += hgfz[1]; + + domain.fx(n2si2) += hgfx[2]; + domain.fy(n2si2) += hgfy[2]; + domain.fz(n2si2) += hgfz[2]; + + domain.fx(n3si2) += hgfx[3]; + domain.fy(n3si2) += hgfy[3]; + domain.fz(n3si2) += hgfz[3]; + + domain.fx(n4si2) += hgfx[4]; + domain.fy(n4si2) += hgfy[4]; + domain.fz(n4si2) += hgfz[4]; + + domain.fx(n5si2) += hgfx[5]; + domain.fy(n5si2) += hgfy[5]; + domain.fz(n5si2) += hgfz[5]; + + domain.fx(n6si2) += hgfx[6]; + domain.fy(n6si2) += hgfy[6]; + domain.fz(n6si2) += hgfz[6]; + + domain.fx(n7si2) += hgfx[7]; + domain.fy(n7si2) += hgfy[7]; + domain.fz(n7si2) += hgfz[7]; + } + } + + if (numthreads > 1) { + // Collect the data from the local arrays into the final force arrays +#pragma omp parallel for firstprivate(numNode) + for( Index_t gnode=0 ; gnode(numElem8) ; + Real_t *dvdy = Allocate(numElem8) ; + Real_t *dvdz = Allocate(numElem8) ; + Real_t *x8n = Allocate(numElem8) ; + Real_t *y8n = Allocate(numElem8) ; + Real_t *z8n = Allocate(numElem8) ; + + /* start loop over elements */ +#pragma omp parallel for firstprivate(numElem) + for (Index_t i=0 ; i Real_t(0.) ) { + CalcFBHourglassForceForElems( domain, + determ, x8n, y8n, z8n, dvdx, dvdy, dvdz, + hgcoef, numElem, domain.numNode()) ; + } + + Release(&z8n) ; + Release(&y8n) ; + Release(&x8n) ; + Release(&dvdz) ; + Release(&dvdy) ; + Release(&dvdx) ; + + return ; +} + +/******************************************/ + +static inline +void CalcVolumeForceForElems(Domain& domain) +{ + Index_t numElem = domain.numElem() ; + if (numElem != 0) { + Real_t hgcoef = domain.hgcoef() ; + Real_t *sigxx = Allocate(numElem) ; + Real_t *sigyy = Allocate(numElem) ; + Real_t *sigzz = Allocate(numElem) ; + Real_t *determ = Allocate(numElem) ; + + /* Sum contributions to total stress tensor */ + InitStressTermsForElems(domain, sigxx, sigyy, sigzz, numElem); + + // call elemlib stress integration loop to produce nodal forces from + // material stresses. + IntegrateStressForElems( domain, + sigxx, sigyy, sigzz, determ, numElem, + domain.numNode()) ; + + // check for negative element volume +#pragma omp parallel for firstprivate(numElem) + for ( Index_t k=0 ; k 0) { + const Real_t deltatime = domain.deltatime() ; + + domain.AllocateStrains(numElem); + + CalcKinematicsForElems(domain, vnew, deltatime, numElem) ; + + // element loop to do some stuff not included in the elemlib function. +#pragma omp parallel for firstprivate(numElem) + for ( Index_t k=0 ; k monoq_max_slope) phixi = monoq_max_slope; + + + /* phieta */ + norm = Real_t(1.) 
/ ( domain.delv_eta(i) + ptiny ) ; + + switch (bcMask & ETA_M) { + case ETA_M_COMM: /* needs comm data */ + case 0: delvm = domain.delv_eta(domain.letam(i)) ; break ; + case ETA_M_SYMM: delvm = domain.delv_eta(i) ; break ; + case ETA_M_FREE: delvm = Real_t(0.0) ; break ; + default: fprintf(stderr, "Error in switch at %s line %d\n", + __FILE__, __LINE__); + delvm = 0; /* ERROR - but quiets the compiler */ + break; + } + switch (bcMask & ETA_P) { + case ETA_P_COMM: /* needs comm data */ + case 0: delvp = domain.delv_eta(domain.letap(i)) ; break ; + case ETA_P_SYMM: delvp = domain.delv_eta(i) ; break ; + case ETA_P_FREE: delvp = Real_t(0.0) ; break ; + default: fprintf(stderr, "Error in switch at %s line %d\n", + __FILE__, __LINE__); + delvp = 0; /* ERROR - but quiets the compiler */ + break; + } + + delvm = delvm * norm ; + delvp = delvp * norm ; + + phieta = Real_t(.5) * ( delvm + delvp ) ; + + delvm *= monoq_limiter_mult ; + delvp *= monoq_limiter_mult ; + + if ( delvm < phieta ) phieta = delvm ; + if ( delvp < phieta ) phieta = delvp ; + if ( phieta < Real_t(0.)) phieta = Real_t(0.) ; + if ( phieta > monoq_max_slope) phieta = monoq_max_slope; + + /* phizeta */ + norm = Real_t(1.) / ( domain.delv_zeta(i) + ptiny ) ; + + switch (bcMask & ZETA_M) { + case ZETA_M_COMM: /* needs comm data */ + case 0: delvm = domain.delv_zeta(domain.lzetam(i)) ; break ; + case ZETA_M_SYMM: delvm = domain.delv_zeta(i) ; break ; + case ZETA_M_FREE: delvm = Real_t(0.0) ; break ; + default: fprintf(stderr, "Error in switch at %s line %d\n", + __FILE__, __LINE__); + delvm = 0; /* ERROR - but quiets the compiler */ + break; + } + switch (bcMask & ZETA_P) { + case ZETA_P_COMM: /* needs comm data */ + case 0: delvp = domain.delv_zeta(domain.lzetap(i)) ; break ; + case ZETA_P_SYMM: delvp = domain.delv_zeta(i) ; break ; + case ZETA_P_FREE: delvp = Real_t(0.0) ; break ; + default: fprintf(stderr, "Error in switch at %s line %d\n", + __FILE__, __LINE__); + delvp = 0; /* ERROR - but quiets the compiler */ + break; + } + + delvm = delvm * norm ; + delvp = delvp * norm ; + + phizeta = Real_t(.5) * ( delvm + delvp ) ; + + delvm *= monoq_limiter_mult ; + delvp *= monoq_limiter_mult ; + + if ( delvm < phizeta ) phizeta = delvm ; + if ( delvp < phizeta ) phizeta = delvp ; + if ( phizeta < Real_t(0.)) phizeta = Real_t(0.); + if ( phizeta > monoq_max_slope ) phizeta = monoq_max_slope; + + /* Remove length scale */ + + if ( domain.vdov(i) > Real_t(0.) ) { + qlin = Real_t(0.) ; + qquad = Real_t(0.) ; + } + else { + Real_t delvxxi = domain.delv_xi(i) * domain.delx_xi(i) ; + Real_t delvxeta = domain.delv_eta(i) * domain.delx_eta(i) ; + Real_t delvxzeta = domain.delv_zeta(i) * domain.delx_zeta(i) ; + + if ( delvxxi > Real_t(0.) ) delvxxi = Real_t(0.) ; + if ( delvxeta > Real_t(0.) ) delvxeta = Real_t(0.) ; + if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ; + + Real_t rho = domain.elemMass(i) / (domain.volo(i) * vnew[i]) ; + + qlin = -qlc_monoq * rho * + ( delvxxi * (Real_t(1.) - phixi) + + delvxeta * (Real_t(1.) - phieta) + + delvxzeta * (Real_t(1.) - phizeta) ) ; + + qquad = qqc_monoq * rho * + ( delvxxi*delvxxi * (Real_t(1.) - phixi*phixi) + + delvxeta*delvxeta * (Real_t(1.) - phieta*phieta) + + delvxzeta*delvxzeta * (Real_t(1.) 
- phizeta*phizeta) ) ; + } + + domain.qq(i) = qquad ; + domain.ql(i) = qlin ; + } +} + +/******************************************/ + +static inline +void CalcMonotonicQForElems(Domain& domain, Real_t vnew[]) +{ + // + // initialize parameters + // + const Real_t ptiny = Real_t(1.e-36) ; + + // + // calculate the monotonic q for all regions + // + for (Index_t r=0 ; r 0) { + CalcMonotonicQRegionForElems(domain, r, vnew, ptiny) ; + } + } +} + +/******************************************/ + +static inline +void CalcQForElems(Domain& domain, Real_t vnew[]) +{ + // + // MONOTONIC Q option + // + + Index_t numElem = domain.numElem() ; + + if (numElem != 0) { + Int_t allElem = numElem + /* local elem */ + 2*domain.sizeX()*domain.sizeY() + /* plane ghosts */ + 2*domain.sizeX()*domain.sizeZ() + /* row ghosts */ + 2*domain.sizeY()*domain.sizeZ() ; /* col ghosts */ + + domain.AllocateGradients(numElem, allElem); + +#if USE_MPI + CommRecv(domain, MSG_MONOQ, 3, + domain.sizeX(), domain.sizeY(), domain.sizeZ(), + true, true) ; +#endif + + /* Calculate velocity gradients */ + CalcMonotonicQGradientsForElems(domain, vnew); + +#if USE_MPI + Domain_member fieldData[3] ; + + /* Transfer veloctiy gradients in the first order elements */ + /* problem->commElements->Transfer(CommElements::monoQ) ; */ + + fieldData[0] = &Domain::delv_xi ; + fieldData[1] = &Domain::delv_eta ; + fieldData[2] = &Domain::delv_zeta ; + + CommSend(domain, MSG_MONOQ, 3, fieldData, + domain.sizeX(), domain.sizeY(), domain.sizeZ(), + true, true) ; + + CommMonoQ(domain) ; +#endif + + CalcMonotonicQForElems(domain, vnew) ; + + // Free up memory + domain.DeallocateGradients(); + + /* Don't allow excessive artificial viscosity */ + Index_t idx = -1; + for (Index_t i=0; i domain.qstop() ) { + idx = i ; + break ; + } + } + + if(idx >= 0) { +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, QStopError) ; +#else + exit(QStopError); +#endif + } + } +} + +/******************************************/ + +static inline +void CalcPressureForElems(Real_t* p_new, Real_t* bvc, + Real_t* pbvc, Real_t* e_old, + Real_t* compression, Real_t *vnewc, + Real_t pmin, + Real_t p_cut, Real_t eosvmax, + Index_t length, Index_t *regElemList) +{ +#pragma omp parallel for firstprivate(length) + for (Index_t i = 0; i < length ; ++i) { + Real_t c1s = Real_t(2.0)/Real_t(3.0) ; + bvc[i] = c1s * (compression[i] + Real_t(1.)); + pbvc[i] = c1s; + } + +#pragma omp parallel for firstprivate(length, pmin, p_cut, eosvmax) + for (Index_t i = 0 ; i < length ; ++i){ + Index_t elem = regElemList[i]; + + p_new[i] = bvc[i] * e_old[i] ; + + if (FABS(p_new[i]) < p_cut ) + p_new[i] = Real_t(0.0) ; + + if ( vnewc[elem] >= eosvmax ) /* impossible condition here? 
*/ + p_new[i] = Real_t(0.0) ; + + if (p_new[i] < pmin) + p_new[i] = pmin ; + } +} + +/******************************************/ + +static inline +void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new, + Real_t* bvc, Real_t* pbvc, + Real_t* p_old, Real_t* e_old, Real_t* q_old, + Real_t* compression, Real_t* compHalfStep, + Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin, + Real_t p_cut, Real_t e_cut, Real_t q_cut, Real_t emin, + Real_t* qq_old, Real_t* ql_old, + Real_t rho0, + Real_t eosvmax, + Index_t length, Index_t *regElemList) +{ + Real_t *pHalfStep = Allocate(length) ; + +#pragma omp parallel for firstprivate(length, emin) + for (Index_t i = 0 ; i < length ; ++i) { + e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i]) + + Real_t(0.5) * work[i]; + + if (e_new[i] < emin ) { + e_new[i] = emin ; + } + } + + CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc, + pmin, p_cut, eosvmax, length, regElemList); + +#pragma omp parallel for firstprivate(length, rho0) + for (Index_t i = 0 ; i < length ; ++i) { + Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ; + + if ( delvc[i] > Real_t(0.) ) { + q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ; + } + else { + Real_t ssc = ( pbvc[i] * e_new[i] + + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ; + + if ( ssc <= Real_t(.1111111e-36) ) { + ssc = Real_t(.3333333e-18) ; + } else { + ssc = SQRT(ssc) ; + } + + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; + } + + e_new[i] = e_new[i] + Real_t(0.5) * delvc[i] + * ( Real_t(3.0)*(p_old[i] + q_old[i]) + - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ; + } + +#pragma omp parallel for firstprivate(length, emin, e_cut) + for (Index_t i = 0 ; i < length ; ++i) { + + e_new[i] += Real_t(0.5) * work[i]; + + if (FABS(e_new[i]) < e_cut) { + e_new[i] = Real_t(0.) ; + } + if ( e_new[i] < emin ) { + e_new[i] = emin ; + } + } + + CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc, + pmin, p_cut, eosvmax, length, regElemList); + +#pragma omp parallel for firstprivate(length, rho0, emin, e_cut) + for (Index_t i = 0 ; i < length ; ++i){ + const Real_t sixth = Real_t(1.0) / Real_t(6.0) ; + Index_t elem = regElemList[i]; + Real_t q_tilde ; + + if (delvc[i] > Real_t(0.)) { + q_tilde = Real_t(0.) ; + } + else { + Real_t ssc = ( pbvc[i] * e_new[i] + + vnewc[elem] * vnewc[elem] * bvc[i] * p_new[i] ) / rho0 ; + + if ( ssc <= Real_t(.1111111e-36) ) { + ssc = Real_t(.3333333e-18) ; + } else { + ssc = SQRT(ssc) ; + } + + q_tilde = (ssc*ql_old[i] + qq_old[i]) ; + } + + e_new[i] = e_new[i] - ( Real_t(7.0)*(p_old[i] + q_old[i]) + - Real_t(8.0)*(pHalfStep[i] + q_new[i]) + + (p_new[i] + q_tilde)) * delvc[i]*sixth ; + + if (FABS(e_new[i]) < e_cut) { + e_new[i] = Real_t(0.) ; + } + if ( e_new[i] < emin ) { + e_new[i] = emin ; + } + } + + CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc, + pmin, p_cut, eosvmax, length, regElemList); + +#pragma omp parallel for firstprivate(length, rho0, q_cut) + for (Index_t i = 0 ; i < length ; ++i){ + Index_t elem = regElemList[i]; + + if ( delvc[i] <= Real_t(0.) ) { + Real_t ssc = ( pbvc[i] * e_new[i] + + vnewc[elem] * vnewc[elem] * bvc[i] * p_new[i] ) / rho0 ; + + if ( ssc <= Real_t(.1111111e-36) ) { + ssc = Real_t(.3333333e-18) ; + } else { + ssc = SQRT(ssc) ; + } + + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; + + if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) 
; + } + } + + Release(&pHalfStep) ; + + return ; +} + +/******************************************/ + +static inline +void CalcSoundSpeedForElems(Domain &domain, + Real_t *vnewc, Real_t rho0, Real_t *enewc, + Real_t *pnewc, Real_t *pbvc, + Real_t *bvc, Real_t ss4o3, + Index_t len, Index_t *regElemList) +{ +#pragma omp parallel for firstprivate(rho0, ss4o3) + for (Index_t i = 0; i < len ; ++i) { + Index_t elem = regElemList[i]; + Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[elem] * vnewc[elem] * + bvc[i] * pnewc[i]) / rho0; + if (ssTmp <= Real_t(.1111111e-36)) { + ssTmp = Real_t(.3333333e-18); + } + else { + ssTmp = SQRT(ssTmp); + } + domain.ss(elem) = ssTmp ; + } +} + +/******************************************/ + +static inline +void EvalEOSForElems(Domain& domain, Real_t *vnewc, + Int_t numElemReg, Index_t *regElemList, Int_t rep) +{ + Real_t e_cut = domain.e_cut() ; + Real_t p_cut = domain.p_cut() ; + Real_t ss4o3 = domain.ss4o3() ; + Real_t q_cut = domain.q_cut() ; + + Real_t eosvmax = domain.eosvmax() ; + Real_t eosvmin = domain.eosvmin() ; + Real_t pmin = domain.pmin() ; + Real_t emin = domain.emin() ; + Real_t rho0 = domain.refdens() ; + + // These temporaries will be of different size for + // each call (due to different sized region element + // lists) + Real_t *e_old = Allocate(numElemReg) ; + Real_t *delvc = Allocate(numElemReg) ; + Real_t *p_old = Allocate(numElemReg) ; + Real_t *q_old = Allocate(numElemReg) ; + Real_t *compression = Allocate(numElemReg) ; + Real_t *compHalfStep = Allocate(numElemReg) ; + Real_t *qq_old = Allocate(numElemReg) ; + Real_t *ql_old = Allocate(numElemReg) ; + Real_t *work = Allocate(numElemReg) ; + Real_t *p_new = Allocate(numElemReg) ; + Real_t *e_new = Allocate(numElemReg) ; + Real_t *q_new = Allocate(numElemReg) ; + Real_t *bvc = Allocate(numElemReg) ; + Real_t *pbvc = Allocate(numElemReg) ; + + //loop to add load imbalance based on region number + for(Int_t j = 0; j < rep; j++) { + /* compress data, minimal set */ +#pragma omp parallel + { +#pragma omp for nowait firstprivate(numElemReg) + for (Index_t i=0; i eosvmax or v < eosvmin */ + if ( eosvmin != Real_t(0.) ) { +#pragma omp for nowait firstprivate(numElemReg, eosvmin) + for(Index_t i=0 ; i= eosvmax) { /* impossible due to calling func? */ + p_old[i] = Real_t(0.) ; + compression[i] = Real_t(0.) ; + compHalfStep[i] = Real_t(0.) ; + } + } + } + +#pragma omp for nowait firstprivate(numElemReg) + for (Index_t i = 0 ; i < numElemReg ; ++i) { + work[i] = Real_t(0.) ; + } + } + CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc, + p_old, e_old, q_old, compression, compHalfStep, + vnewc, work, delvc, pmin, + p_cut, e_cut, q_cut, emin, + qq_old, ql_old, rho0, eosvmax, + numElemReg, regElemList); + } + +#pragma omp parallel for firstprivate(numElemReg) + for (Index_t i=0; i eosvmax) + vnew[i] = eosvmax ; + } + } + + // This check may not make perfect sense in LULESH, but + // it's representative of something in the full code - + // just leave it in, please +#pragma omp for nowait firstprivate(numElem) + for (Index_t i=0; i eosvmax) + vc = eosvmax ; + } + if (vc <= 0.) { +#if USE_MPI + MPI_Abort(MPI_COMM_WORLD, VolumeError) ; +#else + exit(VolumeError); +#endif + } + } + } + + for (Int_t r=0 ; r(numElem) ; /* new relative vol -- temp */ + + CalcLagrangeElements(domain, vnew) ; + + /* Calculate Q. 
(Monotonic q option requires communication) */ + CalcQForElems(domain, vnew) ; + + ApplyMaterialPropertiesForElems(domain, vnew) ; + + UpdateVolumesForElems(domain, vnew, + domain.v_cut(), numElem) ; + + Release(&vnew); +} + +/******************************************/ + +static inline +void CalcCourantConstraintForElems(Domain &domain, Index_t length, + Index_t *regElemlist, + Real_t qqc, Real_t& dtcourant) +{ +#if _OPENMP + Index_t threads = omp_get_max_threads(); + static Index_t *courant_elem_per_thread; + static Real_t *dtcourant_per_thread; + static bool first = true; + if (first) { + courant_elem_per_thread = new Index_t[threads]; + dtcourant_per_thread = new Real_t[threads]; + first = false; + } +#else + Index_t threads = 1; + Index_t courant_elem_per_thread[1]; + Real_t dtcourant_per_thread[1]; +#endif + + +#pragma omp parallel firstprivate(length, qqc) + { + Real_t qqc2 = Real_t(64.0) * qqc * qqc ; + Real_t dtcourant_tmp = dtcourant; + Index_t courant_elem = -1 ; + +#if _OPENMP + Index_t thread_num = omp_get_thread_num(); +#else + Index_t thread_num = 0; +#endif + +#pragma omp for + for (Index_t i = 0 ; i < length ; ++i) { + Index_t indx = regElemlist[i] ; + Real_t dtf = domain.ss(indx) * domain.ss(indx) ; + + if ( domain.vdov(indx) < Real_t(0.) ) { + dtf = dtf + + qqc2 * domain.arealg(indx) * domain.arealg(indx) + * domain.vdov(indx) * domain.vdov(indx) ; + } + + dtf = SQRT(dtf) ; + dtf = domain.arealg(indx) / dtf ; + + if (domain.vdov(indx) != Real_t(0.)) { + if ( dtf < dtcourant_tmp ) { + dtcourant_tmp = dtf ; + courant_elem = indx ; + } + } + } + + dtcourant_per_thread[thread_num] = dtcourant_tmp ; + courant_elem_per_thread[thread_num] = courant_elem ; + } + + for (Index_t i = 1; i < threads; ++i) { + if (dtcourant_per_thread[i] < dtcourant_per_thread[0] ) { + dtcourant_per_thread[0] = dtcourant_per_thread[i]; + courant_elem_per_thread[0] = courant_elem_per_thread[i]; + } + } + + if (courant_elem_per_thread[0] != -1) { + dtcourant = dtcourant_per_thread[0] ; + } + + return ; + +} + +/******************************************/ + +static inline +void CalcHydroConstraintForElems(Domain &domain, Index_t length, + Index_t *regElemlist, Real_t dvovmax, Real_t& dthydro) +{ +#if _OPENMP + Index_t threads = omp_get_max_threads(); + static Index_t *hydro_elem_per_thread; + static Real_t *dthydro_per_thread; + static bool first = true; + if (first) { + hydro_elem_per_thread = new Index_t[threads]; + dthydro_per_thread = new Real_t[threads]; + first = false; + } +#else + Index_t threads = 1; + Index_t hydro_elem_per_thread[1]; + Real_t dthydro_per_thread[1]; +#endif + +#pragma omp parallel firstprivate(length, dvovmax) + { + Real_t dthydro_tmp = dthydro ; + Index_t hydro_elem = -1 ; + +#if _OPENMP + Index_t thread_num = omp_get_thread_num(); +#else + Index_t thread_num = 0; +#endif + +#pragma omp for + for (Index_t i = 0 ; i < length ; ++i) { + Index_t indx = regElemlist[i] ; + + if (domain.vdov(indx) != Real_t(0.)) { + Real_t dtdvov = dvovmax / (FABS(domain.vdov(indx))+Real_t(1.e-20)) ; + + if ( dthydro_tmp > dtdvov ) { + dthydro_tmp = dtdvov ; + hydro_elem = indx ; + } + } + } + + dthydro_per_thread[thread_num] = dthydro_tmp ; + hydro_elem_per_thread[thread_num] = hydro_elem ; + } + + for (Index_t i = 1; i < threads; ++i) { + if(dthydro_per_thread[i] < dthydro_per_thread[0]) { + dthydro_per_thread[0] = dthydro_per_thread[i]; + hydro_elem_per_thread[0] = hydro_elem_per_thread[i]; + } + } + + if (hydro_elem_per_thread[0] != -1) { + dthydro = dthydro_per_thread[0] ; + } + + return ; +} 
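The two time-constraint routines above (CalcCourantConstraintForElems and CalcHydroConstraintForElems) share one reduction idiom: each OpenMP thread scans its share of the region's elements, keeps a private minimum candidate plus the element that produced it, writes the pair into its own per-thread slot, and a short serial loop afterwards folds the slots into slot 0. A minimal standalone sketch of that idiom follows; the names (MinReduce, values, min_per_thread) are hypothetical, and the fixed 256-slot array is an assumption made only to keep the sketch short.

   #include <cfloat>
   #include <cstdio>
   #if _OPENMP
   # include <omp.h>
   #endif

   /* Sketch of the per-thread minimum reduction used by the constraint loops. */
   static double MinReduce(const double *values, int length)
   {
   #if _OPENMP
      int threads = omp_get_max_threads();
   #else
      int threads = 1;
   #endif
      double min_per_thread[256];          /* assumes threads <= 256 for this sketch */

   #pragma omp parallel firstprivate(length)
      {
   #if _OPENMP
         int tid = omp_get_thread_num();
   #else
         int tid = 0;
   #endif
         double local_min = DBL_MAX;       /* thread-private candidate */

   #pragma omp for
         for (int i = 0; i < length; ++i) {
            if (values[i] < local_min)
               local_min = values[i];
         }
         min_per_thread[tid] = local_min;  /* one slot per thread, so no races */
      }

      /* serial fold into slot 0, mirroring the loop after the parallel region */
      for (int t = 1; t < threads; ++t) {
         if (min_per_thread[t] < min_per_thread[0])
            min_per_thread[0] = min_per_thread[t];
      }
      return min_per_thread[0];
   }

   int main()
   {
      double dts[] = { 0.4, 0.1, 0.7, 0.25 };
      std::printf("min dt = %g\n", MinReduce(dts, 4));
      return 0;
   }

As in the benchmark, the payoff is that the hot loop needs no critical section: every shared write goes to a distinct slot, and the cheap serial reduction runs once per call.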
+ +/******************************************/ + +static inline +void CalcTimeConstraintsForElems(Domain& domain) { + + // Initialize conditions to a very large value + domain.dtcourant() = 1.0e+20; + domain.dthydro() = 1.0e+20; + + for (Index_t r=0 ; r < domain.numReg() ; ++r) { + /* evaluate time constraint */ + CalcCourantConstraintForElems(domain, domain.regElemSize(r), + domain.regElemlist(r), + domain.qqc(), + domain.dtcourant()) ; + + /* check hydro constraint */ + CalcHydroConstraintForElems(domain, domain.regElemSize(r), + domain.regElemlist(r), + domain.dvovmax(), + domain.dthydro()) ; + } +} + +/******************************************/ + +static inline +void LagrangeLeapFrog(Domain& domain) +{ +#ifdef SEDOV_SYNC_POS_VEL_LATE + Domain_member fieldData[6] ; +#endif + + /* calculate nodal forces, accelerations, velocities, positions, with + * applied boundary conditions and slide surface considerations */ + LagrangeNodal(domain); + + +#ifdef SEDOV_SYNC_POS_VEL_LATE +#endif + + /* calculate element quantities (i.e. velocity gradient & q), and update + * material states */ + LagrangeElements(domain, domain.numElem()); + +#if USE_MPI +#ifdef SEDOV_SYNC_POS_VEL_LATE + CommRecv(domain, MSG_SYNC_POS_VEL, 6, + domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1, + false, false) ; + + fieldData[0] = &Domain::x ; + fieldData[1] = &Domain::y ; + fieldData[2] = &Domain::z ; + fieldData[3] = &Domain::xd ; + fieldData[4] = &Domain::yd ; + fieldData[5] = &Domain::zd ; + + CommSend(domain, MSG_SYNC_POS_VEL, 6, fieldData, + domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1, + false, false) ; +#endif +#endif + + CalcTimeConstraintsForElems(domain); + +#if USE_MPI +#ifdef SEDOV_SYNC_POS_VEL_LATE + CommSyncPosVel(domain) ; +#endif +#endif +} + + +/******************************************/ + +int main(int argc, char *argv[]) +{ + Domain *locDom ; + Int_t numRanks ; + Int_t myRank ; + struct cmdLineOpts opts; + +#if USE_MPI + Domain_member fieldData ; + + MPI_Init(&argc, &argv) ; + MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ; + MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ; +#else + numRanks = 1; + myRank = 0; +#endif + + /* Set defaults that can be overridden by command line opts */ + opts.its = 9999999; + opts.nx = 30; + opts.numReg = 11; + opts.numFiles = (int)(numRanks+10)/9; + opts.showProg = 0; + opts.quiet = 0; + opts.viz = 0; + opts.balance = 1; + opts.cost = 1; + opts.displaytiming = 0; + + ParseCommandLineOptions(argc, argv, myRank, &opts); + + if ((myRank == 0) && (opts.quiet == 0)) { + printf("Running problem size %d^3 per domain until completion\n", opts.nx); + printf("Num processors: %d\n", numRanks); +#if _OPENMP + printf("Num threads: %d\n", omp_get_max_threads()); +#endif + printf("Total number of elements: %lld\n\n", (long long int)(numRanks*opts.nx*opts.nx*opts.nx)); + printf("To run other sizes, use -s .\n"); + printf("To run a fixed number of iterations, use -i .\n"); + printf("To run a more or less balanced region set, use -b .\n"); + printf("To change the relative costs of regions, use -c .\n"); + printf("To print out progress, use -p\n"); + printf("To write an output file for VisIt, use -v\n"); + printf("See help (-h) for more options\n\n"); + } + + // Set up the mesh and decompose. 
Assumes regular cubes for now + Int_t col, row, plane, side; + InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side); + + // Build the main data structure and initialize it + locDom = new Domain(numRanks, col, row, plane, opts.nx, + side, opts.numReg, opts.balance, opts.cost) ; + + +#if USE_MPI + fieldData = &Domain::nodalMass ; + + // Initial domain boundary communication + CommRecv(*locDom, MSG_COMM_SBN, 1, + locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1, + true, false) ; + CommSend(*locDom, MSG_COMM_SBN, 1, &fieldData, + locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1, + true, false) ; + CommSBN(*locDom, 1, &fieldData) ; + + // End initialization + MPI_Barrier(MPI_COMM_WORLD); +#endif + + // BEGIN timestep to solution */ +#if USE_MPI + double start = MPI_Wtime(); +#else + timeval start; + gettimeofday(&start, NULL) ; +#endif +//debug to see region sizes +// for(Int_t i = 0; i < locDom->numReg(); i++) +// std::cout << "region" << i + 1<< "size" << locDom->regElemSize(i) <time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) { + + TimeIncrement(*locDom) ; + LagrangeLeapFrog(*locDom) ; + + if ((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) { + printf("cycle = %d, time = %e, dt=%e\n", + locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ; + } + } + + // Use reduced max elapsed time + double elapsed_time; +#if USE_MPI + elapsed_time = MPI_Wtime() - start; +#else + timeval end; + gettimeofday(&end, NULL) ; + elapsed_time = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_usec - start.tv_usec))/1000000 ; +#endif + double elapsed_timeG; +#if USE_MPI + MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE, + MPI_MAX, 0, MPI_COMM_WORLD); +#else + elapsed_timeG = elapsed_time; +#endif + + // Write out final viz file */ + if (opts.viz) { + DumpToVisit(*locDom, opts.numFiles, myRank, numRanks) ; + } + + if ((myRank == 0) && (opts.quiet == 0)) { + VerifyAndWriteFinalOutput(elapsed_timeG, *locDom, opts.nx, numRanks, opts.displaytiming); + } + +#if USE_MPI + MPI_Finalize() ; +#endif + + return 0 ; +} Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh2.0.reference_output =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh2.0.reference_output @@ -0,0 +1,23 @@ +Running problem size 10^3 per domain until completion +Num processors: 1 +Total number of elements: 1000 + +To run other sizes, use -s . +To run a fixed number of iterations, use -i . +To run a more or less balanced region set, use -b . +To change the relative costs of regions, use -c . +To print out progress, use -p +To write an output file for VisIt, use -v +See help (-h) for more options + +Run completed: + Problem size = 10 + MPI tasks = 1 + Iteration count = 231 + Final Origin Energy = 2.720531e+04 + Testing Plane 0 of Energy Array on rank 0: + MaxAbsDiff = 2.273737e-12 + TotalAbsDiff = 1.659646e-11 + MaxRelDiff = 4.649603e-14 + +exit 0 Index: MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh_tuple.h =================================================================== --- /dev/null +++ MultiSource/Benchmarks/DOE-ProxyApps-C++/lulesh/lulesh_tuple.h @@ -0,0 +1,610 @@ +#if !defined(USE_MPI) +# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line" +#endif + + +// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging +// used supports it (i.e. 
the _OPENMP symbol is defined) +#define USE_OMP 1 + +#if USE_MPI +#include +#endif + +#include + +/* + define one of these three symbols: + + SEDOV_SYNC_POS_VEL_NONE + SEDOV_SYNC_POS_VEL_EARLY + SEDOV_SYNC_POS_VEL_LATE +*/ + +#define SEDOV_SYNC_POS_VEL_EARLY 1 + +#include +#include + +//************************************************** +// Allow flexibility for arithmetic representations +//************************************************** + +#define MAX(a, b) ( ((a) > (b)) ? (a) : (b)) + + +// Precision specification +typedef float real4 ; +typedef double real8 ; +typedef long double real10 ; // 10 bytes on x86 + +typedef int Index_t ; // array subscript and loop index +typedef real8 Real_t ; // floating point representation +typedef int Int_t ; // integer representation + +enum { VolumeError = -1, QStopError = -2 } ; + +inline real4 SQRT(real4 arg) { return sqrtf(arg) ; } +inline real8 SQRT(real8 arg) { return sqrt(arg) ; } +inline real10 SQRT(real10 arg) { return sqrtl(arg) ; } + +inline real4 CBRT(real4 arg) { return cbrtf(arg) ; } +inline real8 CBRT(real8 arg) { return cbrt(arg) ; } +inline real10 CBRT(real10 arg) { return cbrtl(arg) ; } + +inline real4 FABS(real4 arg) { return fabsf(arg) ; } +inline real8 FABS(real8 arg) { return fabs(arg) ; } +inline real10 FABS(real10 arg) { return fabsl(arg) ; } + + +// Stuff needed for boundary conditions +// 2 BCs on each of 6 hexahedral faces (12 bits) +#define XI_M 0x00007 +#define XI_M_SYMM 0x00001 +#define XI_M_FREE 0x00002 +#define XI_M_COMM 0x00004 + +#define XI_P 0x00038 +#define XI_P_SYMM 0x00008 +#define XI_P_FREE 0x00010 +#define XI_P_COMM 0x00020 + +#define ETA_M 0x001c0 +#define ETA_M_SYMM 0x00040 +#define ETA_M_FREE 0x00080 +#define ETA_M_COMM 0x00100 + +#define ETA_P 0x00e00 +#define ETA_P_SYMM 0x00200 +#define ETA_P_FREE 0x00400 +#define ETA_P_COMM 0x00800 + +#define ZETA_M 0x07000 +#define ZETA_M_SYMM 0x01000 +#define ZETA_M_FREE 0x02000 +#define ZETA_M_COMM 0x04000 + +#define ZETA_P 0x38000 +#define ZETA_P_SYMM 0x08000 +#define ZETA_P_FREE 0x10000 +#define ZETA_P_COMM 0x20000 + +// MPI Message Tags +#define MSG_COMM_SBN 1024 +#define MSG_SYNC_POS_VEL 2048 +#define MSG_MONOQ 3072 + +#define MAX_FIELDS_PER_MPI_COMM 6 + +// Assume 128 byte coherence +// Assume Real_t is an "integral power of 2" bytes wide +#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t)) + +#define CACHE_ALIGN_REAL(n) \ + (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1)) + +////////////////////////////////////////////////////// +// Primary data structure +////////////////////////////////////////////////////// + +/* + * The implementation of the data abstraction used for lulesh + * resides entirely in the Domain class below. You can change + * grouping and interleaving of fields here to maximize data layout + * efficiency for your underlying architecture or compiler. + * + * For example, fields can be implemented as STL objects or + * raw array pointers. 
As another example, individual fields + * m_x, m_y, m_z could be budled into + * + * struct { Real_t x, y, z ; } *m_coord ; + * + * allowing accessor functions such as + * + * "Real_t &x(Index_t idx) { return m_coord[idx].x ; }" + * "Real_t &y(Index_t idx) { return m_coord[idx].y ; }" + * "Real_t &z(Index_t idx) { return m_coord[idx].z ; }" + */ + +class Domain { + + public: + + // Constructor + Domain(Int_t numRanks, Index_t colLoc, + Index_t rowLoc, Index_t planeLoc, + Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost); + + // + // ALLOCATION + // + + void AllocateNodePersistent(Int_t numNode) // Node-centered + { + m_coord.resize(numNode); // coordinates + + m_vel.resize(numNode); // velocities + + m_acc.resize(numNode); // accelerations + + m_force.resize(numNode); // forces + + m_nodalMass.resize(numNode); // mass + } + + void AllocateElemPersistent(Int_t numElem) // Elem-centered + { + m_nodelist.resize(8*numElem); + + // elem connectivities through face + m_faceToElem.resize(numElem); + + m_elemBC.resize(numElem); + + m_e.resize(numElem); + + m_pq.resize(numElem); + + m_qlqq.resize(numElem); + + m_vol.resize(numElem); + + m_delv.resize(numElem); + m_vdov.resize(numElem); + + m_arealg.resize(numElem); + + m_ss.resize(numElem); + + m_elemMass.resize(numElem); + } + + void AllocateGradients(Int_t numElem, Int_t allElem) + { + // Position gradients + m_delx_xi.resize(numElem) ; + m_delx_eta.resize(numElem) ; + m_delx_zeta.resize(numElem) ; + + // Velocity gradients + m_delv_xi.resize(allElem) ; + m_delv_eta.resize(allElem); + m_delv_zeta.resize(allElem) ; + } + + void DeallocateGradients() + { + m_delx_zeta.clear() ; + m_delx_eta.clear() ; + m_delx_xi.clear() ; + + m_delv_zeta.clear() ; + m_delv_eta.clear() ; + m_delv_xi.clear() ; + } + + void AllocateStrains(Int_t numElem) + { + m_dxx.resize(numElem) ; + m_dyy.resize(numElem) ; + m_dzz.resize(numElem) ; + } + + void DeallocateStrains() + { + m_dzz.clear() ; + m_dyy.clear() ; + m_dxx.clear() ; + } + + // + // ACCESSORS + // + + // Node-centered + + // Nodal coordinates + Real_t& x(Index_t idx) { return m_coord[idx].x ; } + Real_t& y(Index_t idx) { return m_coord[idx].y ; } + Real_t& z(Index_t idx) { return m_coord[idx].z ; } + + // Nodal velocities + Real_t& xd(Index_t idx) { return m_vel[idx].x ; } + Real_t& yd(Index_t idx) { return m_vel[idx].y ; } + Real_t& zd(Index_t idx) { return m_vel[idx].z ; } + + // Nodal accelerations + Real_t& xdd(Index_t idx) { return m_acc[idx].x ; } + Real_t& ydd(Index_t idx) { return m_acc[idx].y ; } + Real_t& zdd(Index_t idx) { return m_acc[idx].z ; } + + // Nodal forces + Real_t& fx(Index_t idx) { return m_force[idx].x ; } + Real_t& fy(Index_t idx) { return m_force[idx].y ; } + Real_t& fz(Index_t idx) { return m_force[idx].z ; } + + // Nodal mass + Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; } + + // Nodes on symmertry planes + Index_t symmX(Index_t idx) { return m_symmX[idx] ; } + Index_t symmY(Index_t idx) { return m_symmY[idx] ; } + Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; } + bool symmXempty() { return m_symmX.empty(); } + bool symmYempty() { return m_symmY.empty(); } + bool symmZempty() { return m_symmZ.empty(); } + + // + // Element-centered + // + Index_t& regElemSize(Index_t idx) { return m_regElemSize[idx] ; } + Index_t& regNumList(Index_t idx) { return m_regNumList[idx] ; } + Index_t* regNumList() { return &m_regNumList[0] ; } + Index_t* regElemlist(Int_t r) { return m_regElemlist[r] ; } + Index_t& regElemlist(Int_t r, Index_t idx) { return 
m_regElemlist[r][idx] ; } + + Index_t* nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; } + + // elem connectivities through face + Index_t& lxim(Index_t idx) { return m_faceToElem[idx].lxim ; } + Index_t& lxip(Index_t idx) { return m_faceToElem[idx].lxip ; } + Index_t& letam(Index_t idx) { return m_faceToElem[idx].letam ; } + Index_t& letap(Index_t idx) { return m_faceToElem[idx].letap ; } + Index_t& lzetam(Index_t idx) { return m_faceToElem[idx].lzetam ; } + Index_t& lzetap(Index_t idx) { return m_faceToElem[idx].lzetap ; } + + // elem face symm/free-surface flag + Int_t& elemBC(Index_t idx) { return m_elemBC[idx] ; } + + // Principal strains - temporary + Real_t& dxx(Index_t idx) { return m_dxx[idx] ; } + Real_t& dyy(Index_t idx) { return m_dyy[idx] ; } + Real_t& dzz(Index_t idx) { return m_dzz[idx] ; } + + // Velocity gradient - temporary + Real_t& delv_xi(Index_t idx) { return m_delv_xi[idx] ; } + Real_t& delv_eta(Index_t idx) { return m_delv_eta[idx] ; } + Real_t& delv_zeta(Index_t idx) { return m_delv_zeta[idx] ; } + + // Position gradient - temporary + Real_t& delx_xi(Index_t idx) { return m_delx_xi[idx] ; } + Real_t& delx_eta(Index_t idx) { return m_delx_eta[idx] ; } + Real_t& delx_zeta(Index_t idx) { return m_delx_zeta[idx] ; } + + // Energy + Real_t& e(Index_t idx) { return m_e[idx] ; } + + // Pressure + Real_t& p(Index_t idx) { return m_pq[idx].p ; } + + // Artificial viscosity + Real_t& q(Index_t idx) { return m_pq[idx].q ; } + + // Linear term for q + Real_t& ql(Index_t idx) { return m_qlqq[idx].ql ; } + // Quadratic term for q + Real_t& qq(Index_t idx) { return m_qlqq[idx].qq ; } + + Real_t& delv(Index_t idx) { return m_delv[idx] ; } + + // Relative volume + Real_t& v(Index_t idx) { return m_vol[idx].v ; } + // Reference volume + Real_t& volo(Index_t idx) { return m_vol[idx].volo ; } + + // volume derivative over volume + Real_t& vdov(Index_t idx) { return m_vdov[idx] ; } + + // Element characteristic length + Real_t& arealg(Index_t idx) { return m_arealg[idx] ; } + + // Sound speed + Real_t& ss(Index_t idx) { return m_ss[idx] ; } + + // Element mass + Real_t& elemMass(Index_t idx) { return m_elemMass[idx] ; } + + Index_t nodeElemCount(Index_t idx) + { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; } + + Index_t *nodeElemCornerList(Index_t idx) + { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; } + + // Parameters + + // Cutoffs + Real_t u_cut() const { return m_u_cut ; } + Real_t e_cut() const { return m_e_cut ; } + Real_t p_cut() const { return m_p_cut ; } + Real_t q_cut() const { return m_q_cut ; } + Real_t v_cut() const { return m_v_cut ; } + + // Other constants (usually are settable via input file in real codes) + Real_t hgcoef() const { return m_hgcoef ; } + Real_t qstop() const { return m_qstop ; } + Real_t monoq_max_slope() const { return m_monoq_max_slope ; } + Real_t monoq_limiter_mult() const { return m_monoq_limiter_mult ; } + Real_t ss4o3() const { return m_ss4o3 ; } + Real_t qlc_monoq() const { return m_qlc_monoq ; } + Real_t qqc_monoq() const { return m_qqc_monoq ; } + Real_t qqc() const { return m_qqc ; } + + Real_t eosvmax() const { return m_eosvmax ; } + Real_t eosvmin() const { return m_eosvmin ; } + Real_t pmin() const { return m_pmin ; } + Real_t emin() const { return m_emin ; } + Real_t dvovmax() const { return m_dvovmax ; } + Real_t refdens() const { return m_refdens ; } + + // Timestep controls, etc... 
+ Real_t& time() { return m_time ; } + Real_t& deltatime() { return m_deltatime ; } + Real_t& deltatimemultlb() { return m_deltatimemultlb ; } + Real_t& deltatimemultub() { return m_deltatimemultub ; } + Real_t& stoptime() { return m_stoptime ; } + Real_t& dtcourant() { return m_dtcourant ; } + Real_t& dthydro() { return m_dthydro ; } + Real_t& dtmax() { return m_dtmax ; } + Real_t& dtfixed() { return m_dtfixed ; } + + Int_t& cycle() { return m_cycle ; } + Index_t& numRanks() { return m_numRanks ; } + + Index_t& colLoc() { return m_colLoc ; } + Index_t& rowLoc() { return m_rowLoc ; } + Index_t& planeLoc() { return m_planeLoc ; } + Index_t& tp() { return m_tp ; } + + Index_t& sizeX() { return m_sizeX ; } + Index_t& sizeY() { return m_sizeY ; } + Index_t& sizeZ() { return m_sizeZ ; } + Index_t& numReg() { return m_numReg ; } + Int_t& cost() { return m_cost ; } + Index_t& numElem() { return m_numElem ; } + Index_t& numNode() { return m_numNode ; } + + Index_t& maxPlaneSize() { return m_maxPlaneSize ; } + Index_t& maxEdgeSize() { return m_maxEdgeSize ; } + + // + // MPI-Related additional data + // + +#if USE_MPI + // Communication Work space + Real_t *commDataSend ; + Real_t *commDataRecv ; + + // Maximum number of block neighbors + MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners + MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners +#endif + + private: + + void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems); + void SetupThreadSupportStructures(); + void CreateRegionIndexSets(Int_t nreg, Int_t balance); + void SetupCommBuffers(Int_t edgeNodes); + void SetupSymmetryPlanes(Int_t edgeNodes); + void SetupElementConnectivities(Int_t edgeElems); + void SetupBoundaryConditions(Int_t edgeElems); + + // + // IMPLEMENTATION + // + + /* Node-centered */ + + struct Tuple3 { + Real_t x, y, z ; + } ; + + std::vector m_coord ; /* coordinates */ + + std::vector m_vel ; /* velocities */ + + std::vector m_acc ; /* accelerations */ + + std::vector m_force ; /* forces */ + + std::vector m_nodalMass ; /* mass */ + + std::vector m_symmX ; /* symmetry plane nodesets */ + std::vector m_symmY ; + std::vector m_symmZ ; + + // Element-centered + + // Region information + Int_t m_numReg ; + Int_t m_cost; //imbalance cost + Index_t *m_regElemSize ; // Size of region sets + Index_t *m_regNumList ; // Region number per domain element + Index_t **m_regElemlist ; // region indexset + + std::vector m_nodelist ; /* elemToNode connectivity */ + + struct FaceElemConn { + Index_t lxim, lxip, letam, letap, lzetam, lzetap ; + } ; + + std::vector m_faceToElem ; /* element conn across faces */ + + std::vector m_elemBC ; /* symmetry/free-surface flags for each elem face */ + + std::vector m_dxx ; /* principal strains -- temporary */ + std::vector m_dyy ; + std::vector m_dzz ; + + std::vector m_delv_xi ; /* velocity gradient -- temporary */ + std::vector m_delv_eta ; + std::vector m_delv_zeta ; + + std::vector m_delx_xi ; /* coordinate gradient -- temporary */ + std::vector m_delx_eta ; + std::vector m_delx_zeta ; + + std::vector m_e ; /* energy */ + + struct Pcomponents { + Real_t p, q ; + } ; + + std::vector m_pq ; /* pressure and artificial viscosity */ + + struct Qcomponents { + Real_t ql, qq ; + } ; + + std::vector m_qlqq ; /* linear and quadratic terms for q */ + + struct Volume { + Real_t v, volo ; + } ; + + std::vector m_vol ; /* relative and reference volume */ + + std::vector m_vnew ; /* new relative volume -- temporary */ + std::vector m_delv ; /* m_vnew - m_v */ + std::vector m_vdov ; /* 
volume derivative over volume */ + + std::vector m_arealg ; /* characteristic length of an element */ + + std::vector m_ss ; /* "sound speed" */ + + std::vector m_elemMass ; /* mass */ + + // Cutoffs (treat as constants) + const Real_t m_e_cut ; // energy tolerance + const Real_t m_p_cut ; // pressure tolerance + const Real_t m_q_cut ; // q tolerance + const Real_t m_v_cut ; // relative volume tolerance + const Real_t m_u_cut ; // velocity tolerance + + // Other constants (usually setable, but hardcoded in this proxy app) + + const Real_t m_hgcoef ; // hourglass control + const Real_t m_ss4o3 ; + const Real_t m_qstop ; // excessive q indicator + const Real_t m_monoq_max_slope ; + const Real_t m_monoq_limiter_mult ; + const Real_t m_qlc_monoq ; // linear term coef for q + const Real_t m_qqc_monoq ; // quadratic term coef for q + const Real_t m_qqc ; + const Real_t m_eosvmax ; + const Real_t m_eosvmin ; + const Real_t m_pmin ; // pressure floor + const Real_t m_emin ; // energy floor + const Real_t m_dvovmax ; // maximum allowable volume change + const Real_t m_refdens ; // reference density + + // Variables to keep track of timestep, simulation time, and cycle + Real_t m_dtcourant ; // courant constraint + Real_t m_dthydro ; // volume change constraint + Int_t m_cycle ; // iteration count for simulation + Real_t m_dtfixed ; // fixed time increment + Real_t m_time ; // current time + Real_t m_deltatime ; // variable time increment + Real_t m_deltatimemultlb ; + Real_t m_deltatimemultub ; + Real_t m_dtmax ; // maximum allowable time increment + Real_t m_stoptime ; // end time for simulation + + + Int_t m_numRanks ; + + Index_t m_colLoc ; + Index_t m_rowLoc ; + Index_t m_planeLoc ; + Index_t m_tp ; + + Index_t m_sizeX ; + Index_t m_sizeY ; + Index_t m_sizeZ ; + Index_t m_numElem ; + Index_t m_numNode ; + + Index_t m_maxPlaneSize ; + Index_t m_maxEdgeSize ; + + // OMP hack + Index_t *m_nodeElemStart ; + Index_t *m_nodeElemCornerList ; + + // Used in setup + Index_t m_rowMin, m_rowMax; + Index_t m_colMin, m_colMax; + Index_t m_planeMin, m_planeMax ; + +} ; + +typedef Real_t &(Domain::* Domain_member )(Index_t) ; + +struct cmdLineOpts { + Int_t its; // -i + Int_t nx; // -s + Int_t numReg; // -r + Int_t numFiles; // -f + Int_t showProg; // -p + Int_t quiet; // -q + Int_t viz; // -v + Int_t cost; // -c + Int_t balance; // -b +}; + + + +// Function Prototypes + +// lulesh-par +Real_t CalcElemVolume( const Real_t x[8], + const Real_t y[8], + const Real_t z[8]); + +// lulesh-util +void ParseCommandLineOptions(int argc, char *argv[], + Int_t myRank, struct cmdLineOpts *opts); +void VerifyAndWriteFinalOutput(Real_t elapsed_time, + Domain& locDom, + Int_t nx, + Int_t numRanks); + +// lulesh-viz +void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks); + +// lulesh-comm +void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields, + Index_t dx, Index_t dy, Index_t dz, + bool doRecv, bool planeOnly); +void CommSend(Domain& domain, Int_t msgType, + Index_t xferFields, Domain_member *fieldData, + Index_t dx, Index_t dy, Index_t dz, + bool doSend, bool planeOnly); +void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData); +void CommSyncPosVel(Domain& domain); +void CommMonoQ(Domain& domain); + +// lulesh-init +void InitMeshDecomp(Int_t numRanks, Int_t myRank, + Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
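The layout comment near the top of lulesh_tuple.h is the whole point of this header: related scalars are bundled into small structs (Tuple3 for coordinates, velocities, accelerations, and forces; Pcomponents for p and q; Qcomponents for ql and qq; Volume for v and volo) while the accessor signatures stay exactly the same, so the solver sources compile unchanged against either field layout. A minimal sketch of that idea is below; the class names DomainSoA and DomainAoS are hypothetical, and the assumption that the plain lulesh.h keeps separate per-component vectors is taken from this header's own commentary rather than from code shown in the patch.

   #include <vector>

   typedef double Real_t;
   typedef int    Index_t;

   /* Separate arrays per component (the layout the header's comment contrasts against). */
   class DomainSoA {
      std::vector<Real_t> m_x, m_y, m_z;
    public:
      explicit DomainSoA(Index_t n) : m_x(n), m_y(n), m_z(n) {}
      Real_t& x(Index_t i) { return m_x[i]; }
      Real_t& y(Index_t i) { return m_y[i]; }
      Real_t& z(Index_t i) { return m_z[i]; }
   };

   /* Bundled tuples, as lulesh_tuple.h does with m_coord. */
   class DomainAoS {
      struct Tuple3 { Real_t x, y, z; };
      std::vector<Tuple3> m_coord;
    public:
      explicit DomainAoS(Index_t n) : m_coord(n) {}
      Real_t& x(Index_t i) { return m_coord[i].x; }
      Real_t& y(Index_t i) { return m_coord[i].y; }
      Real_t& z(Index_t i) { return m_coord[i].z; }
   };

   /* Solver code is written only against the accessors, so it is oblivious
      to which layout gets compiled in. */
   template <class Domain>
   Real_t SumX(Domain& d, Index_t n) {
      Real_t s = Real_t(0.);
      for (Index_t i = 0; i < n; ++i) s += d.x(i);
      return s;
   }

Because call sites never name the member containers directly, swapping one header for the other (or regrouping the fields further) is essentially a data-layout experiment and leaves the timed kernels untouched.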