Index: test-suite/trunk/LICENSE.TXT =================================================================== --- test-suite/trunk/LICENSE.TXT +++ test-suite/trunk/LICENSE.TXT @@ -88,6 +88,7 @@ XSBench: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench miniGMG: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG RSBench: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/RSBench +SimpleMOC: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC CLAMR: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR HPCCG: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG PENNANT: llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt @@ -3,3 +3,4 @@ add_subdirectory(miniAMR) add_subdirectory(miniGMG) add_subdirectory(RSBench) +add_subdirectory(SimpleMOC) Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile @@ -5,6 +5,7 @@ miniAMR \ Pathfinder \ miniGMG \ - RSBench + RSBench \ + SimpleMOC include $(LEVEL)/Makefile.programs Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt @@ -0,0 +1,5 @@ +set(PROG SimpleMOC) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS -std=gnu99) +set(RUN_OPTIONS -i ${CMAKE_CURRENT_SOURCE_DIR}/default.in) +llvm_multisource() Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 John Tramm + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile @@ -0,0 +1,7 @@ +LEVEL = ../../../.. + +PROG = SimpleMOC +LDFLAGS = -lm +CFLAGS = -std=gnu99 +RUN_OPTIONS = -i $(PROJ_SRC_DIR)/default.in +include $(LEVEL)/MultiSource/Makefile.multisrc Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output @@ -0,0 +1,12 @@ + INITIALIZATION +================================================================================ +Initializing 2D tracks... +Initializing 3D tracks... +Initializing flat source regions... +Beginning XS Allocation... +Beginning Source and Flux Parameter Allocation... +================================================================================ +Starting transport sweep ... +Renormalizing Flux... +Renormalizing Flux Complete. +exit 0 Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h @@ -0,0 +1,262 @@ +#ifndef __SimpleMOC_header +#define __SimpleMOC_header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef MPI +#include +#endif + +#ifdef OPENMP +#include +#endif + +#ifdef PAPI +#include +#endif + +#define TIMIING_INFO 0 + +#include "glibc_compat_rand.h" +#ifndef NO_GLIBC_COMPAT_RAND +#define rand glibc_compat_rand +#define srand glibc_compat_srand +#endif + +// User inputs +typedef struct{ + int x_assemblies; /* Number of assemblies in the x-axis of the + reactor */ + int y_assemblies; /* Number of assemblies in the y-axis of the + reactor */ + int cai; // Number of course axial intervals + int fai; /* Number of fine axial intervals per coarse + axial interval */ + int axial_exp; // Axial source expansion order + float radial_ray_sep; // Radial ray separation + float axial_z_sep; // Axial stacked z-ray separation + int n_azimuthal; // Number of azimuthal angles + int n_polar_angles; // Number of polar angles + int n_egroups; // Number of energy groups + bool decompose; // Turn decomposition on/off + int decomp_assemblies_ax; // Number of sub-domains per assembly (axially) + long segments_per_track; // Average number of segments per track + float assembly_width; // Width of an assembly - 1.26 x 17 cm + float height; // Height of the reactor - 400 cm + float domain_height; // Z Height of a domain + float precision; // precision for source convergence + long mype; // MPI Rank + long ntracks_2D; // Number of 2D tracks (derived) + int z_stacked; // Number of z rays (derived) + long ntracks; /* Total number of 3D tracks per assembly + (derived) */ + int nthreads; // Number of OpenMP Threads + int papi_event_set; // PAPI event set + // 0 - FLOPS 1 - Bandwidth 2 - CPU Stall reason + + // Source regions per assembly (3M estimate) + long n_2D_source_regions_per_assembly; + + // Source regions per node (derived) + long n_source_regions_per_node; + + #ifdef PAPI + // String for command line PAPI event + char event_name[PAPI_MAX_STR_LEN]; + // Array to accumulate PAPI counts across all threads + long long *vals_accum; + #endif + + bool load_tracks; // Turn on/off loading 2D tracks from file + char* track_file; // Name/address of tracking file to load + + long segments_processed; // Total number of segments processed per node + +} Input; + +// Localized geometrical region ID +typedef struct{ + long assembly; // Assembly ID + long pin; // Pin Cell ID + long zone; // Zone (inside pin cell) ID +} RegionID; + +// Segment Structure +typedef struct{ + float length; + long source_id; +} Segment; + +// Track2D Structure +typedef struct{ + float az_weight; // Azimuthal Quadrature Weight (rand) + long n_segments; // Number of Segments (gaussian) + Segment * segments; // Array of Segments + int n_3D_segments; // Number of intersections in 3D tracks +} Track2D; + +// Track Structure +typedef struct{ + float p_weight; // Polar Quadrature Weight (rand) + float z_height; // Z-height + long rank_in; // MPI rank to receive from + long rank_out; // MPI rank to send to + float * f_psi; // Forward angular flux along track + float * b_psi; // Backward angular flux along track +} Track; + +// Source Region Structure +typedef struct{ + float ** fine_flux; + float ** fine_source; + float vol; + float * sigT; + float ** XS; + float ** scattering_matrix; + #ifdef OPENMP + omp_lock_t * locks; + #endif +} Source; + +// Table structure for computing exponential +typedef struct{ + float * values; + float dx; + float maxVal; + int N; +} Table; + +// Params Structure for easier data pointer passing +typedef struct{ + Track2D * tracks_2D; + Track *** tracks; + Source * sources; + float * polar_angles; + float * leakage; + Table expTable; +} Params; + +// MPI 3D Grid info +typedef struct{ + #ifdef MPI + MPI_Comm cart_comm_3d; + MPI_Datatype Flux_Array; + #endif + int x_pos_src; + int x_pos_dest; + int x_neg_src; + int x_neg_dest; + int y_pos_src; + int y_pos_dest; + int y_neg_src; + int y_neg_dest; + int z_pos_src; + int z_pos_dest; + int z_neg_src; + int z_neg_dest; +} CommGrid; + +// Attenuation Arrays +typedef struct{ + float * q0; + float * q1; + float * q2; + float * sigT; + float * tau; + float * sigT2; + float * expVal; + float * reuse; + float * flux_integral; + float * tally; + float * t1; + float * t2; + float * t3; + float * t4; +} AttenuateVars; + + +// init.c +Input set_default_input( void ); +void set_small_input( Input * I ); +Params build_tracks( Input* I ); +CommGrid init_mpi_grid( Input I ); +void calculate_derived_inputs( Input * I ); +#ifdef OPENMP +omp_lock_t * init_locks( Input I ); +#endif + +// io.c +void logo(int version); +void center_print(const char *s, int width); +void border_print(void); +void fancy_int( int a ); +void print_input_summary(Input input); +void read_CLI( int argc, char * argv[], Input * input ); +void print_CLI_error(void); +void read_input_file( Input * I, char * fname); + +// tracks.c +Track2D * generate_2D_tracks( Input input, size_t * nbytes ); +void generate_2D_segments( Input input, Track2D * tracks, + size_t * nbytes ); +void free_2D_tracks( Track2D * tracks ); +Track *** generate_tracks(Input input, Track2D * tracks_2D, size_t * nbytes); +void free_tracks( Track *** tracks ); +long segments_per_2D_track_distribution( Input I ); +float * generate_polar_angles( Input I ); +Track2D * load_OpenMOC_tracks(char* fname, bool CMFD_obj, Input* I, size_t* nbytes); + +// utils.c +float urand(void); +float nrand(float mean, float sigma); +float pairwise_sum(float * vector, long size); +Table buildExponentialTable( float precision, float maxVal ); +float interpolateTable( Table table, float x); +double get_time(void); +size_t est_mem_usage( Input I ); +double time_per_intersection( Input I, double time ); + +// source.c +Source * initialize_sources( Input I, size_t * nbytes ); +void free_sources( Input I, Source * sources ); + +// solver.c +void transport_sweep( Params * params, Input * I ); +int get_pos_interval( float z, float dz); +int get_neg_interval( float z, float dz); +int get_alt_neg_interval( float z, float dz); +void attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I, + Params * params, float ds, float mu, float az_weight, AttenuateVars * A ); +void attenuate_FSR_fluxes( Track * track, bool forward, Source * FSR, Input * I, + Params * params, float ds, float mu, float az_weight, AttenuateVars * A ); +void alt_attenuate_fluxes( Track * track, bool forward, Source * FSR, Input * I, + Params * params, float ds, float mu, float az_weight ); +void renormalize_flux( Params params, Input I, CommGrid grid ); +float update_sources( Params params, Input I, float keff ); +float compute_keff( Params params, Input I, CommGrid grid); + +// test.c +void gen_norm_pts(float mean, float sigma, int n_pts); +void print_Input_struct( Input I ); + +// papi.c +void papi_serial_init(void); +void counter_init( int *eventset, int *num_papi_events, Input * I ); +void counter_stop( int * eventset, int num_papi_events, Input * I ); + +// comms.c +#ifdef MPI +void fast_transfer_boundary_fluxes( Params params, Input I, CommGrid grid); +void transfer_boundary_fluxes( Params params, Input I, CommGrid grid); +#endif + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c @@ -0,0 +1,199 @@ +#include"SimpleMOC_header.h" + +#ifdef MPI +// Transfer information between nodes (angular fluxes) +void fast_transfer_boundary_fluxes( Params params, Input I, CommGrid grid) +{ + MPI_Barrier(grid.cart_comm_3d); + if(I.mype==0) printf("Beginning Inter-Node Border Flux Transfer...\n"); + + int tracks_per_msg = 10000; + + float h = I.domain_height; + float x = I.assembly_width; + + // calculate number of tracks to each surface + long ntracks_per_axial_direction = I.ntracks * x / (2*x + 4*h); + long ntracks_per_radial_direction = I.ntracks * h / (2*x + 4*h); + + // correct so that all tracks are used and are symmetric + long remaining_tracks = I.ntracks - 2 * ntracks_per_axial_direction + - 4 * ntracks_per_radial_direction; + + long add_radial = remaining_tracks * ( 4*h / (2*x + 4*h) ); + add_radial = 4 * (add_radial / 4); + ntracks_per_radial_direction += add_radial / 4; + + long add_axial = remaining_tracks - add_radial; + ntracks_per_axial_direction += add_axial / 2; + + // Calculate all requests needed + long max_requests = ntracks_per_radial_direction / tracks_per_msg; + max_requests *= 4; + max_requests += 2 * (ntracks_per_axial_direction / tracks_per_msg ); + + // One for send, one for receive + max_requests *= 2; + + long send_idx = 0; + MPI_Status stat; + + // Computer Message Size + size_t bytes = I.n_egroups * sizeof(float) * tracks_per_msg; +#ifdef PRINT_MEM_SIZES + if(I.mype==0) printf("MPI Message Size: %.2lf (MB)\n", + bytes / 1024. / 1024. ); +#endif + + + // Use knowledge of underlying flux structure for efficiency + float * flux_array = params.tracks[0][0][0].f_psi; + + // TODO: Send reverse direction as well!!! + + // make an array of radial sending destinations + int send_dest[6] = + { + grid.x_pos_dest, + grid.x_neg_dest, + grid.y_pos_dest, + grid.y_neg_dest, + grid.z_pos_dest, + grid.z_neg_dest + }; + // make an array of radial receiving sources + int rec_sources[6] = + { + grid.x_pos_src, + grid.x_neg_src, + grid.y_pos_src, + grid.y_neg_src, + grid.z_pos_src, + grid.z_neg_src + }; + + // make an array of number of messages + // NOTE: There is some rounding here, should be corrected in real app + long num_messages[6] = + { + ntracks_per_radial_direction / tracks_per_msg, + ntracks_per_radial_direction / tracks_per_msg, + ntracks_per_radial_direction / tracks_per_msg, + ntracks_per_radial_direction / tracks_per_msg, + ntracks_per_axial_direction / tracks_per_msg, + ntracks_per_axial_direction / tracks_per_msg + }; + + + // send_idx is now the beginning of the non-border region memory + // i.e., we need to actually MPI Send/Recv the rest of the data + + // calculate the maximum number of messages sent in any direction + long max_msgs_per_dir = 0; + for( int i = 0; i < 6; i++ ) + if( num_messages[i] > max_msgs_per_dir ) + max_msgs_per_dir = num_messages[i]; + + // Flux Memory is assumed to be laid out as follows: + // (Border Flux) --- (Send_MSG_Dir_0) (Recv_MSG_Dir_0) (Send_MSG_Dir_1) (Recv_MSG_Dir_1).... + + // New Comms + float ** buffer = (float **) malloc(6 * sizeof(float*)); + float * _buffer = (float *) malloc( 6 * tracks_per_msg * I.n_egroups * sizeof(float)); + for( int i = 0; i < 6; i++ ) + buffer[i] = &_buffer[i * tracks_per_msg * I.n_egroups]; + + long idx = 0; + for( long i = 0; i < max_msgs_per_dir; i++ ) + { + MPI_Request request[12]; + int active[6] = {0}; + int mpi_send[6] = {0}; + int mpi_recv[6] = {0}; + long bookmark = idx; + for( int j = 0; j < 6; j++ ) + { + if( i >= num_messages[j] ) + continue; + + // check if border assembly + else if( send_dest[j] == -1 ) + { + * params.leakage += pairwise_sum( &flux_array[idx], + I.n_egroups * tracks_per_msg ); + idx += (long) I.n_egroups * tracks_per_msg; + } + else + { + MPI_Isend( + &flux_array[idx], // Send Buffer + tracks_per_msg, // Number of Elements + grid.Flux_Array, /* Type of element + (all energy group array) */ + send_dest[j], // Destination MPI rank + j, // Message ID + grid.cart_comm_3d, // MPI Communicator + &request[j] ); /* MPI Request (to monitor + when call finishes) */ + idx += (long) I.n_egroups * tracks_per_msg; + mpi_send[j] = 1; + } + } + for( int j = 0; j < 6; j++ ) + { + if( i >= num_messages[j] ) + continue; + + // Check if Border Case + else if( rec_sources[j] == -1) + for( long k =0; k < I.n_egroups * tracks_per_msg; k++) + buffer[j][k] = 0; + else + { + MPI_Irecv( + buffer[j], // Recv Buffer + tracks_per_msg, // Number of Elements + grid.Flux_Array, /* Type of element + (all energy group array) */ + rec_sources[j], // MPI rank to Receive From + j, // Message ID + grid.cart_comm_3d, // MPI Communicator + &request[6+j] ); /* MPI Request (to monitor + when call finishes) */ + mpi_recv[j] = 1; + } + active[j] = 1; + } + + // Block for Comm Round to complete & copy received data out of buffer + for( int j = 0; j < 6; j++ ) + { + if( mpi_send[j] == 1 ) + { + MPI_Wait( &request[j], &stat ); + } + if( mpi_recv[j] == 1 ) + { + MPI_Wait( &request[6+j], &stat ); + } + if( active[j] == 1 ) + { + memcpy(&flux_array[bookmark], buffer[j], + I.n_egroups * tracks_per_msg * sizeof(float)); + bookmark += (long) I.n_egroups*tracks_per_msg; + } + } + + + } + + free(&buffer[0][0]); + free(buffer); + + + MPI_Barrier( grid.cart_comm_3d ); + MPI_Barrier( MPI_COMM_WORLD); + + if(I.mype==0) printf("Finished Inter-Node Border Flux Transfer.\n"); +} +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in @@ -0,0 +1,20 @@ +10 - x_assemblies - Number of assemblies in the x-axis of the reactor +10 - y_assemblies - Number of assemblies in the y-axis of the reactor +9 - cai - Number of coarse axial intervals +5 - fai - Number of fine axial intervals per coarse axial interval +2 - axial_exp - Axial source expansion order +0.05 - radial_ray_sep - Radial ray separation +0.25 - axial_z_sep - Axial stacked z-ray separation +6 - n_azimuthal - Number of azimuthal angles (should be 32) +10 - n_polar_angles - Number of polar angles +10 - n_egroups - Number of energy groups +1 - decompose - Turn decomposition on/off (true = 1, false = 0) +20 - decomp_assemblies_ax - Number of assemblies per sub-domain (axially) +20 - segments_per_track - Average number of segments per track +21.42 - assembly_width - Width of an assembly - 1.26 x 17 cm +50.0 - height - Height of the reactor +0.01 - precision - precision for source convergence +1000 - n_2D_source_regions_per_assembly - 2D src regions per assembly +0 - papi_event_set - PAPI Event Set Choice +0 - Turn on/off load tracks from OpenMOC (true = 1, false = 0) +file_name - name of OpenMOC tracking file to load Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h @@ -0,0 +1,16 @@ +/*===------------- glibc_compat_rand.h- glibc rand emulation --------------===*\ + * |* + * |* The LLVM Compiler Infrastructure + * |* + * |* This file is distributed under the University of Illinois Open Source + * |* License. See LICENSE.TXT for details. + * |* + * \*===----------------------------------------------------------------------===*/ + +#ifndef GLIBC_COMPAT_RAND_H +#define GLIBC_COMPAT_RAND_H + +int glibc_compat_rand(void); +void glibc_compat_srand(unsigned int seed); + +#endif /* GLIBC_COMPAT_RAND_H */ Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c @@ -0,0 +1,60 @@ +/*===------------ glibc_compat_rand.c - glibc rand emulation --------------===*\ + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * +\*===----------------------------------------------------------------------===*/ + +#include "glibc_compat_rand.h" + +/** + * This rand implementation is designed to emulate the implementation of + * rand/srand in recent versions of glibc. This is used for programs which + * require this specific rand implementation in order to pass verification + * tests. + * + * For more information, see: http://www.mathstat.dal.ca/~selinger/random/ + **/ + +#define TABLE_SIZE 344 +static unsigned int table[TABLE_SIZE]; +static int next; + +int glibc_compat_rand(void) { + /* Calculate the indices i-3 and i-31 in the circular vector. */ + int i3 = (next < 3) ? (TABLE_SIZE + next - 3) : (next - 3); + int i31 = (next < 31) ? (TABLE_SIZE + next - 31) : (next - 31); + + table[next] = table[i3] + table[i31]; + unsigned int r = table[next] >> 1; + + ++next; + if (next >= TABLE_SIZE) + next = 0; + + return r; +} + +void glibc_compat_srand(unsigned int seed) { + if (seed == 0) + seed = 1; + + table[0] = seed; + + for (int i = 1; i < 31; i++) { + int r = (16807ll * table[i - 1]) % 2147483647; + if (r < 0) + r += 2147483647; + + table[i] = r; + } + + for (int i = 31; i < 34; i++) + table[i] = table[i - 31]; + for (int i = 34; i < TABLE_SIZE; i++) + table[i] = table[i - 31] + table[i - 3]; + + next = 0; +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c @@ -0,0 +1,249 @@ +#include"SimpleMOC_header.h" + +// Calculate Derived Inputs +void calculate_derived_inputs( Input * I ) +{ + + #ifdef MPI + int mype; + MPI_Comm_rank(MPI_COMM_WORLD, &mype); + I->mype = mype; + #endif + + /* Divide number of azimuthal angles by 2 to accound for forward/backward + * tracking */ + I->n_azimuthal /= 2; + + // calculate number of 2D tracks, enforcing divisible by 2 + I->ntracks_2D = I->n_azimuthal * + (I->assembly_width * sqrt(2) / I->radial_ray_sep); + + I->ntracks_2D = 2 * ( I->ntracks_2D / 2 ); + + I->z_stacked = (int) ( I->height / (I->axial_z_sep * I->decomp_assemblies_ax)); + I->ntracks = I->ntracks_2D * I->n_polar_angles * I->z_stacked; + I->domain_height = I->height / I->decomp_assemblies_ax; + + I->n_source_regions_per_node = I->n_2D_source_regions_per_assembly * + I->cai / I->decomp_assemblies_ax; + +} + +// Gets I from user and sets defaults +Input set_default_input( void ) +{ + Input I; + + I.x_assemblies = 17; /* Number of assemblies in the x-axis + of the reactor */ + I.y_assemblies = 17; /* Number of assemblies in the y-axis + of the reactor */ + I.cai = 27; // Number of coarse axial intervals + I.fai = 5; /* Number of fine axial intervals per coarse + axial interval */ + I.axial_exp = 2; // Axial source expansion order + I.radial_ray_sep = 0.05; // Radial ray separation + I.axial_z_sep = 0.25; // Axial stacked z-ray separation + I.n_azimuthal = 64; // Number of azimuthal angles (should be 32) + I.n_polar_angles = 10; // Number of polar angles + I.n_egroups = 104; // Number of energy groups + I.decompose = true; /* Turn decomposition on/off (true = on, + flase = off) */ + I.decomp_assemblies_ax =20; /* Number of subdomains per assembly + (axially) */ + I.segments_per_track = 120; // Average number of segments per track (123) + I.assembly_width = 21.42; // Width of an assembly - 1.26 x 17 cm + I.height = 400.0; // Height of the reactor - 400 cm + I.precision = 0.01; // precision for source convergence + I.mype = 0; // MPI Rank + + // source regions per assembly (estimate) + I.n_2D_source_regions_per_assembly = 5000; + + #ifdef PAPI + I.papi_event_set = 4; + #endif + + #ifdef OPENMP + I.nthreads = omp_get_max_threads(); + #endif + + I.load_tracks = false; + + return I; +} + +// Changes defaults to small problem size +void set_small_input( Input * I ) +{ + I->x_assemblies = 15; /* Number of assemblies in the x-axis + of the reactor */ + I->y_assemblies = 15; /* Number of assemblies in the y-axis + of the reactor */ + I->cai = 5; // Number of coarse axial intervals + I->fai = 3; /* Number of fine axial intervals per + coarse axial interval */ + I->axial_exp = 2; // Axial source expansion order + I->radial_ray_sep = 0.5; // Radial ray separation + I->axial_z_sep = 0.2; // Axial stacked z-ray separation + I->n_azimuthal = 5; // Number of azimuthal angles + I->n_polar_angles = 5; // Number of polar angles + I->n_egroups = 104; // Number of energy groups + I->decompose = false; /* Turn decomposition on/off (true = on, + flase = off) */ + I->decomp_assemblies_ax = 1; /* Number of sub-domains per assembly + (axially) */ + I->segments_per_track = 120; // Average number of segments per track + I->assembly_width = 1.26*17; // Width of an assembly - 1.26 x 17 cm + I->height = 400.0; // Height of the reactor - 400 cm + I->precision = 0.01; // precision for source convergence + + // source regions per assembly (estimate) + I->n_2D_source_regions_per_assembly = 3000; +} + +// Initializes all track data +Params build_tracks( Input* input ) +{ + Input I = *input; + size_t nbytes = 0; + Params params; + + if(I.mype == 0) + { + center_print("INITIALIZATION", 79); + border_print(); + printf("Initializing 2D tracks...\n"); + } + + if(I.load_tracks) + { + params.tracks_2D = load_OpenMOC_tracks( + I.track_file,false, input, &nbytes); + I = *input; + } + else + params.tracks_2D = generate_2D_tracks(I, &nbytes); + + if(I.mype == 0) + { +#ifdef PRINT_MEM_SIZES + printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 ); +#endif + printf("Initializing 3D tracks...\n"); + } + + params.tracks = generate_tracks(I, params.tracks_2D, &nbytes); + params.polar_angles = generate_polar_angles( I ); + + if(I.mype == 0) + { +#ifdef PRINT_MEM_SIZES + printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 ); +#endif + printf("Initializing flat source regions...\n"); + } + + params.sources = initialize_sources(I, &nbytes); + + if(I.mype == 0) + { +#ifdef PRINT_MEM_SIZES + printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 ); +#endif + border_print(); + } + + // initialize to zero leakage + float * leakage = calloc( 1, sizeof(float) ); + params.leakage = leakage; + + // build exponential table for interpolation + params.expTable = buildExponentialTable( I.precision, 10.0 ); + + return params; +} + +// Initializes 3D Cartesian Communication Grid + Shift Ranks +CommGrid init_mpi_grid( Input I ) +{ + CommGrid grid; + + #ifdef MPI + MPI_Comm cart_comm_3d; + int ndims = 3; + int dims[3] = {2,2,1}; + int period[3] = {0,0,0}; + int reorder = 1; + + MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, period, reorder, + &cart_comm_3d); + + // X Shifts + int x_pos_src; + int x_pos_dest; + int x_neg_src; + int x_neg_dest; + MPI_Cart_shift( cart_comm_3d, 0, 1, &x_pos_src, &x_pos_dest ); + MPI_Cart_shift( cart_comm_3d, 0, -1, &x_neg_src, &x_neg_dest ); + + // Y Shifts + int y_pos_src; + int y_pos_dest; + int y_neg_src; + int y_neg_dest; + MPI_Cart_shift( cart_comm_3d, 1, 1, &y_pos_src, &y_pos_dest ); + MPI_Cart_shift( cart_comm_3d, 1, -1, &y_neg_src, &y_neg_dest ); + + // Z Shifts + int z_pos_src; + int z_pos_dest; + int z_neg_src; + int z_neg_dest; + MPI_Cart_shift( cart_comm_3d, 2, 1, &z_pos_src, &z_pos_dest ); + MPI_Cart_shift( cart_comm_3d, 2, -1, &z_neg_src, &z_neg_dest ); + + grid.cart_comm_3d = cart_comm_3d; + grid.x_pos_src = x_pos_src; + grid.x_pos_dest = x_pos_dest; + grid.x_neg_src = x_neg_src; + grid.x_neg_dest = x_neg_dest; + grid.y_pos_src = y_pos_src; + grid.y_pos_dest = y_pos_dest; + grid.y_neg_src = y_neg_src; + grid.y_neg_dest = y_neg_dest; + grid.z_pos_src = z_pos_src; + grid.z_pos_dest = z_pos_dest; + grid.z_neg_src = z_neg_src; + grid.z_neg_dest = z_neg_dest; + + + // Init flux buffer MPI type + MPI_Datatype flux_array; + MPI_Type_contiguous(I.n_egroups, MPI_FLOAT, &flux_array); + MPI_Type_commit(&flux_array); + + grid.Flux_Array = flux_array; + + #endif + + return grid; +} + +#ifdef OPENMP +// Intialized OpenMP Source Region Locks +omp_lock_t * init_locks( Input I ) +{ + // Allocate locks array + long n_locks = I.n_source_regions_per_node * I.fai; + omp_lock_t * locks = (omp_lock_t *) malloc( n_locks* sizeof(omp_lock_t)); + + // Initialize locks array + for( long i = 0; i < n_locks; i++ ) + omp_init_lock(&locks[i]); + + return locks; +} +#endif + + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c @@ -0,0 +1,276 @@ +#include "SimpleMOC_header.h" + +// Prints program logo +void logo(int version) +{ + border_print(); + printf( +" _____ _ _ __ __ ____ _____ \n" +" / ____(_) | | | \\/ |/ __ \\ / ____|\n" +" | (___ _ _ __ ___ _ __ | | ___| \\ / | | | | | \n" +" \\___ \\| | '_ ` _ \\| '_ \\| |/ _ \\ |\\/| | | | | | \n" +" ____) | | | | | | | |_) | | __/ | | | |__| | |____ \n" +" |_____/|_|_| |_| |_| .__/|_|\\___|_| |_|\\____/ \\_____|\n" +" | | \n" +" |_| \n" + ); + border_print(); + printf("\n"); + center_print("Developed at", 79); + center_print("The Massachusetts Institute of Technology", 79); + center_print("and", 79); + center_print("Argonne National Laboratory", 79); + printf("\n"); + char v[100]; + sprintf(v, "Version: %d", version); + center_print(v, 79); + printf("\n"); + border_print(); +} + +// Prints Section titles in center of 80 char terminal +void center_print(const char *s, int width) +{ + int length = strlen(s); + int i; + for (i=0; i<=(width-length)/2; i++) { + fputs(" ", stdout); + } + fputs(s, stdout); + fputs("\n", stdout); +} + +// Prints a border +void border_print(void) +{ + printf( + "===================================================================" + "=============\n"); +} + +// Prints comma separated integers - for ease of reading +void fancy_int( int a ) +{ + if( a < 1000 ) + printf("%d\n",a); + + else if( a >= 1000 && a < 1000000 ) + printf("%d,%03d\n", a / 1000, a % 1000); + + else if( a >= 1000000 && a < 1000000000 ) + printf("%d,%03d,%03d\n", a / 1000000, (a % 1000000) / 1000, a % 1000 ); + + else if( a >= 1000000000 ) + printf("%d,%03d,%03d,%03d\n", + a / 1000000000, + (a % 1000000000) / 1000000, + (a % 1000000) / 1000, + a % 1000 ); + else + printf("%d\n",a); +} + +// Prints out the summary of User input +void print_input_summary(Input I) +{ + center_print("INPUT SUMMARY", 79); + border_print(); + #ifdef MPI + int nranks; + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + printf("%-35s%d\n", "MPI Ranks:", nranks); + #endif + #ifdef OPENMP + printf("%-35s%d\n", "Number of Threads:", I.nthreads); + #endif + printf("%-35s%d\n", "x-axis assemblies:", I.x_assemblies); + printf("%-35s%d\n", "y-axis assemblies:", I.y_assemblies); + printf("%-35s%d\n", "coarse axial intervals:", I.cai); + printf("%-35s%d\n", "fine axial intervals:", I.fai); + printf("%-35s%d\n", "axial source expansion order:", I.axial_exp); + printf("%-35s%.2lf\n", "radial ray separation:", I.radial_ray_sep); + printf("%-35s%.2lf\n", "axial z-ray separation:", I.axial_z_sep); + printf("%-35s%d\n", "azimuthal angles:", I.n_azimuthal); + printf("%-35s%d\n", "polar angles:", I.n_polar_angles); + printf("%-35s%d\n", "energy groups:", I.n_egroups); + printf("%-35s%d\n", "assemblies per axial sub-domain:", + I.decomp_assemblies_ax); + printf("%-35s%ld\n", "avg segments per track:", I.segments_per_track); + printf("%-35s%.2lf\n", "assembly width:", I.assembly_width); + printf("%-35s%.2lf\n", "reactor height:", I.height); + printf("%-35s%ld\n", "2D Src regions per assembly:", + I.n_2D_source_regions_per_assembly); + printf("%-35s%ld\n", "2D Tracks:", I.ntracks_2D); + printf("%-35s", "3D Tracks:"); fancy_int(I.ntracks); +#ifdef PRINT_MEM_SIZES + printf("%-35s%zu (MB)\n", "Estimated Memory Usage:", + est_mem_usage(I) / 1024 / 1024); +#endif + #ifdef PAPI + if( I.papi_event_set == -1) + printf("%-35s%s\n", "PAPI event to count:", I.event_name); + #endif + border_print(); +} + +// reads command line inputs and applies options +void read_CLI( int argc, char * argv[], Input * input ) +{ + // defaults to max threads on the system + #ifdef OPENMP + input->nthreads = omp_get_num_procs(); + #else + input->nthreads = 1; + #endif + + // Collect Raw Input + for( int i = 1; i < argc; i++ ) + { + char * arg = argv[i]; + + // nthreads (-t) + if( strcmp(arg, "-t") == 0 ) + { + if( ++i < argc ) + input->nthreads = atoi(argv[i]); + else + print_CLI_error(); + } + // input file (-i) + else if( strcmp(arg, "-i") == 0 ) + { + if( ++i < argc ) + read_input_file( input, argv[i]); + else + print_CLI_error(); + } + // set input for small problem (-s) + else if( strcmp(arg, "-s") == 0) + set_small_input( input ); + + #ifdef PAPI + // Add single PAPI event + else if( strcmp(arg, "-p") == 0 ) + { + if( ++i < argc ){ + input->papi_event_set = -1; + strcpy(input->event_name, argv[i]); + } + else + print_CLI_error(); + } + #endif + // Load OpenMOC track data file + else if( strcmp(arg,"-d") == 0) + { + if( ++i < argc ) + { + input->track_file = argv[i]; + input->load_tracks = true; + } + else + print_CLI_error(); + } + else + print_CLI_error(); + } + + // Validate Input + + // Validate nthreads + if( input->nthreads < 1 ) + print_CLI_error(); +} + +// print error to screen, inform program options +void print_CLI_error(void) +{ + printf("Usage: ./SimpleMOC \n"); + printf("Options include:\n"); + printf(" -t Number of OpenMP threads to run\n"); + printf(" -i Input file name\n"); + printf(" -p PAPI event name to count (1 only) \n"); + printf(" -s Small problem flag \n"); + printf(" -d OpenMOC tracking file\n"); + printf("See readme for full description of default run values\n"); + exit(1); +} + +// read input file describing problem parameters +void read_input_file( Input * I, char * fname) +{ + #if TIMING_INFO | 0 + printf("Attempting to open FIle: %s\n", fname); + #endif + FILE * fp = fopen(fname, "r"); + #if TIMING_INFO | 0 + printf("Opened FIle: %s\n", fname); + #endif + if( fp == NULL ) + printf("FIle Open FAILED\n"); + char c[255]; + + char * stat; + int err; + + err = fscanf(fp, "%d", &I->x_assemblies); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->y_assemblies); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->cai); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->fai); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->axial_exp); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%f", &I->radial_ray_sep); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%f", &I->axial_z_sep); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->n_azimuthal); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->n_polar_angles); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->n_egroups); + stat = fgets(c, 255, fp); + + int decompose; + err = fscanf(fp, "%d", &decompose); + stat = fgets(c, 255, fp); + if(decompose == 0) + I->decompose = false; + else + I->decompose = true; + + err = fscanf(fp, "%d", &I->decomp_assemblies_ax); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%ld", &I->segments_per_track); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%f", &I->assembly_width); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%f", &I->height); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%f", &I->precision); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%ld", &I->n_2D_source_regions_per_assembly); + stat = fgets(c, 255, fp); + + err = fscanf(fp, "%d", &I->papi_event_set); + stat = fgets(c, 255, fp); + + fclose(fp); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c @@ -0,0 +1,147 @@ +#include"SimpleMOC_header.h" + +int main( int argc, char * argv[] ) +{ + int version = 3; + int mype = 1; + + #ifdef MPI + int nranks; + MPI_Status stat; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + MPI_Comm_rank(MPI_COMM_WORLD, &mype); + #endif + + #ifdef PAPI + papi_serial_init(); + #endif + + srand(time(NULL) * (mype+1)); + + Input input = set_default_input(); + read_CLI( argc, argv, &input ); + calculate_derived_inputs( &input ); + + if( mype == 0 ) + logo(version); + + #ifdef OPENMP + omp_set_num_threads(input.nthreads); + #endif + + Params params = build_tracks( &input ); + CommGrid grid = init_mpi_grid( input ); + + if( mype == 0 ) + print_input_summary(input); + + float res; + float keff = 1.0; + int num_iters = 1; + + + double time_transport = 0; + double time_flux_exchange = 0; + double time_renormalize_flux = 0; + double time_update_sources = 0; + double time_compute_keff = 0; + double start, stop; + + if(mype==0) + { + center_print("SIMULATION", 79); + border_print(); + } + + for( int i = 0; i < num_iters; i++) + { + // Transport Sweep + start = get_time(); + transport_sweep(¶ms, &input); + stop = get_time(); + time_transport += stop-start; + + // Domain Boundary Flux Exchange (MPI) + #ifdef MPI + start = get_time(); + fast_transfer_boundary_fluxes(params, input, grid); + stop = get_time(); + time_flux_exchange += stop-start; + #endif + + // Flux Renormalization + start = get_time(); + renormalize_flux(params,input, grid); + stop = get_time(); + time_renormalize_flux += stop-start; + + // Update Source Regions + start = get_time(); + res = update_sources(params, input, keff); + stop = get_time(); + time_update_sources += stop-start; + + // Calculate K-Effective + start = get_time(); + keff = compute_keff(params, input, grid); + stop = get_time(); + time_compute_keff += stop-start; + if( mype == 0 ) + printf("keff = %f\n", keff); + } + + double time_total = time_transport + time_flux_exchange + + time_renormalize_flux + time_update_sources + time_compute_keff; + + if( mype == 0 ) + { + border_print(); + center_print("RESULTS SUMMARY", 79); + border_print(); + + printf("Transport Sweep Time: %6.2lf sec (%4.1lf%%)\n", + time_transport, 100*time_transport/time_total); + printf("Domain Flux Exchange Time: %6.2lf sec (%4.1lf%%)\n", + time_flux_exchange, 100*time_flux_exchange/time_total); + printf("Flux Renormalization Time: %6.2lf sec (%4.1lf%%)\n", + time_renormalize_flux, 100*time_renormalize_flux/time_total); + printf("Update Source Time: %6.2lf sec (%4.1lf%%)\n", + time_update_sources, 100*time_update_sources/time_total); + printf("K-Effective Calc Time: %6.2lf sec (%4.1lf%%)\n", + time_compute_keff, 100*time_compute_keff/time_total); + printf("Total Time: %6.2lf sec\n", time_total); + } + + long tracks_per_second = 2 * input.ntracks/time_transport; + + #ifdef MPI + MPI_Barrier(grid.cart_comm_3d); + long global_tps = 0; + MPI_Reduce( &tracks_per_second, // Send Buffer + &global_tps, // Receive Buffer + 1, // Element Count + MPI_LONG, // Element Type + MPI_SUM, // Reduciton Operation Type + 0, // Master Rank + grid.cart_comm_3d ); // MPI Communicator + MPI_Barrier(grid.cart_comm_3d); + tracks_per_second = global_tps; + #endif + + if( mype == 0 ) + { + printf("Time per Intersection: "); + printf("%.2lf ns\n", time_per_intersection( input, time_transport )); + border_print(); + } + + free_2D_tracks( params.tracks_2D ); + free_tracks( params.tracks ); + + #ifdef MPI + MPI_Finalize(); + #endif + + return 0; +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c @@ -0,0 +1,493 @@ +#include"SimpleMOC_header.h" + +#ifdef PAPI + +// initialize papi with one thread first +void papi_serial_init(void) +{ + if ( PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT){ + fprintf(stderr, "PAPI library init error!\n"); + exit(1); + } + if (( PAPI_thread_init((long unsigned int (*)(void)) + pthread_self )) != PAPI_OK){ + PAPI_perror("PAPI_thread_init"); + exit(1); + } +} + +void counter_init( int *eventset, int *num_papi_events, Input * I ) +{ + char error_str[PAPI_MAX_STR_LEN]; + int stat; + + int * events; + + // Command line event + if( I->papi_event_set == -1){ + *num_papi_events = 1; + events = (int *) malloc( *num_papi_events * sizeof(int)); + PAPI_event_name_to_code( I->event_name, &events[0]); + } + // FLOPS + if( I->papi_event_set == 0 ) + { + *num_papi_events = 2; + events = (int *) malloc( *num_papi_events * sizeof(int)); + events[0] = PAPI_SP_OPS; + events[1] = PAPI_TOT_CYC; + } + // Bandwidth + if( I->papi_event_set == 1 ) + { + *num_papi_events = 2; + events = (int *) malloc( *num_papi_events * sizeof(int)); + events[0] = PAPI_L3_TCM; + events[1] = PAPI_TOT_CYC; + } + // CPU Stall Reason + if( I->papi_event_set == 2 ) + { + *num_papi_events = 4; + events = (int *) malloc( *num_papi_events * sizeof(int)); + int EventCode; + char * event1 = "RESOURCE_STALLS:ANY"; + char * event2 = "RESOURCE_STALLS:SB"; + char * event3 = "RESOURCE_STALLS:RS"; + char * event4 = "RESOURCE_STALLS2:OOO_RSRC"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + PAPI_event_name_to_code( event3, &EventCode ); + events[2] = EventCode; + PAPI_event_name_to_code( event4, &EventCode ); + events[3] = EventCode; + } + // CPU Stall Percentage + if( I->papi_event_set == 3 ) + { + *num_papi_events = 2; + events = (int *) malloc( *num_papi_events * sizeof(int)); + int EventCode; + char * event1 = "RESOURCE_STALLS:ANY"; + char * event2 = "PAPI_TOT_CYC"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + } + // Memory Loads + if( I->papi_event_set == 4 ) + { + *num_papi_events = 4; + events = (int *) malloc( *num_papi_events * sizeof(int)); + int EventCode; + char * event1 = "MEM_LOAD_UOPS_RETIRED"; + char * event2 = "MEM_LOAD_UOPS_RETIRED:L1_HIT"; + char * event3 = "MEM_LOAD_UOPS_RETIRED:L2_HIT"; + char * event4 = "MEM_LOAD_UOPS_RETIRED:L3_HIT"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + PAPI_event_name_to_code( event3, &EventCode ); + events[2] = EventCode; + PAPI_event_name_to_code( event4, &EventCode ); + events[3] = EventCode; + } + // LLC Miss Rate + if( I->papi_event_set == 5 ) + { + *num_papi_events = 2; + events = (int *) malloc( *num_papi_events * sizeof(int)); + events[0] = PAPI_L3_TCM; + events[1] = PAPI_L3_TCA; + } + // Branch MisPrediction + if( I->papi_event_set == 6 ) + { + *num_papi_events = 3; + events = (int *) malloc( *num_papi_events * sizeof(int)); + events[0] = PAPI_BR_MSP; + events[1] = PAPI_BR_CN; + events[2] = PAPI_BR_PRC; + } + // TLB Misses + if( I->papi_event_set == 7 ) + { + *num_papi_events = 4; + events = (int *) malloc( *num_papi_events * sizeof(int)); + int EventCode; + char * event1 = "perf::DTLB-LOADS"; + char * event2 = "perf::DTLB-LOAD-MISSES"; + char * event3 = "perf::DTLB-STORES"; + char * event4 = "perf::DTLB-STORE-MISSES"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + PAPI_event_name_to_code( event3, &EventCode ); + events[2] = EventCode; + PAPI_event_name_to_code( event4, &EventCode ); + events[3] = EventCode; + } + + ///////////////////////////////////////////////////////////////////////// + // PAPI EVENT SELECTION + ///////////////////////////////////////////////////////////////////////// + // User can comment/uncomment blocks as they see fit within this seciton + + // Some Standard Events + //int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS}; + + // Bandwidth Used + // ((PAPI_Lx_TCM * Lx_linesize) / PAPI_TOT_CYC) * Clock(MHz) + //int events[] = {PAPI_L3_TCM, PAPI_TOT_CYC}; + + // L3 Total Cache Miss Ratio + // PAPI_L3_TCM / PAPI_L3_TCA + // (On Xeon dual octo - 65%, not dependent on # of threads) + //int events[] = {PAPI_L3_TCM, PAPI_L3_TCA}; + + // % Cycles with no instruction use + // PAPI_STL_ICY / PAPI_TOT_CYC + //int events[] = { PAPI_STL_ICY, PAPI_TOT_CYC }; + + // % Branch instructions Mispredicted + // PAPI_BR_MSP / PAPI_BR_CN + //int events[] = { PAPI_BR_MSP, PAPI_BR_CN, PAPI_BR_PRC }; + + // TLB Misses + //int events[] = { PAPI_TLB_DM }; + + // MFlops + // (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz) + //int events[] = { PAPI_FP_INS, PAPI_TOT_CYC }; + + // MFlops (Alternate?) + // (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz) + //int events[] = { PAPI_DP_OPS, PAPI_TOT_CYC }; + + + // TLB misses (Using native counters) + /* + int events[2]; + int EventCode; + char * event1 = "perf::DTLB-LOADS"; + char * event2 = "perf::DTLB-LOAD-MISSES"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + */ + + /* + // Stalled Cycles, front v back (Using native counters) + int events[3]; + int EventCode; + char * event1 = "perf::STALLED-CYCLES-FRONTEND"; + char * event2 = "perf::STALLED-CYCLES-BACKEND"; + char * event3 = "perf::PERF_COUNT_HW_CPU_CYCLES"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + PAPI_event_name_to_code( event3, &EventCode ); + events[2] = EventCode; + */ + /* + // LLC Cache Misses (Using native counters) + int events[2]; + int EventCode; + char * event1 = "ix86arch::LLC_REFERENCES"; + char * event2 = "ix86arch::LLC_MISSES"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + */ + + /* + // Node Prefetch Misses (Using native counters) + int events[1]; + int EventCode; + //char * event1 = "perf::NODE-PREFETCHES"; + //char * event2 = "perf::NODE-PREFETCH-MISSES"; + char * event1 = "perf::NODE-PREFETCHES"; + char * event2 = "perf::NODE-LOAD-MISSES:COUNT"; + //PAPI_event_name_to_code( event1, &EventCode ); + //events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[0] = EventCode; + */ + + /* + // CPU Stalls Due to lack of Load Buffers (Using native counters) + int events[2]; + int EventCode; + char * event1 = "RESOURCE_STALLS:LB"; + char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + */ + /* + // CPU Stalls Due to ANY Resource (Using native counters) + int events[2]; + int EventCode; + char * event1 = "RESOURCE_STALLS:ANY"; + char * event2 = "PAPI_TOT_CYC"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + */ + + /* + // CPU Stalls at Reservation Station (Using native counters) + int events[2]; + int EventCode; + char * event1 = "RESOURCE_STALLS:RS"; + char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES"; + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + */ + + /* + // CPU Stall Reason Breakdown (Using native counters) + int events[4]; + int EventCode; + + // Set 1 + char * event1 = "RESOURCE_STALLS:ANY"; + char * event2 = "RESOURCE_STALLS:LB"; + char * event3 = "RESOURCE_STALLS:RS"; + char * event4 = "RESOURCE_STALLS:SB"; + // Set 1 + + // Set 2 + char * event1 = "RESOURCE_STALLS:ANY"; + char * event2 = "RESOURCE_STALLS:ROB"; + char * event3 = "RESOURCE_STALLS:MEM_RS"; + char * event4 = "RESOURCE_STALLS2:ALL_FL_EMPTY"; + // Set 2 + // Set 3 + char * event1 = "RESOURCE_STALLS:ANY"; + char * event2 = "RESOURCE_STALLS2:ALL_PRF_CONTROL"; + char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate + char * event4 = "RESOURCE_STALLS2:OOO_RSRC"; + // Set 3 + char * event1 = "RESOURCE_STALLS:ANY"; + char * event2 = "RESOURCE_STALLS:SB"; + char * event3 = "RESOURCE_STALLS:RS"; // duplicate + char * event4 = "RESOURCE_STALLS2:OOO_RSRC"; + + + // Events that don't need to be counted + // Don't bother measuring these + //char * event1 = "RESOURCE_STALLS:FCSW"; // Always 0, don't measure + //char * event1 = "RESOURCE_STALLS:MXCSR"; // Always 0, don't measure + //char * event3 = "RESOURCE_STALLS2:BOB_FULL"; // Always trivial + //char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate + + PAPI_event_name_to_code( event1, &EventCode ); + events[0] = EventCode; + PAPI_event_name_to_code( event2, &EventCode ); + events[1] = EventCode; + PAPI_event_name_to_code( event3, &EventCode ); + events[2] = EventCode; + PAPI_event_name_to_code( event4, &EventCode ); + events[3] = EventCode; + */ + + ///////////////////////////////////////////////////////////////////////// + // PAPI EVENT LOADING + ///////////////////////////////////////////////////////////////////////// + // Users should not need to alter anything within this section + + int thread = omp_get_thread_num(); + + if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK) + { + PAPI_perror("PAPI_create_eventset"); + exit(1); + } + + for( int i = 0; i < *num_papi_events; i++ ) + { + if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK) + { + PAPI_perror("PAPI_add_event"); + exit(1); + } + } + + if ((stat=PAPI_start(*eventset)) != PAPI_OK) + { + PAPI_perror("PAPI_start"); + exit(1); + } +} + + + + +/* + void counter_init( int *eventset, int *num_papi_events ) + { + char error_str[PAPI_MAX_STR_LEN]; +// int events[] = {PAPI_TOT_INS,PAPI_BR_INS,PAPI_SR_INS}; +int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS}; +int events[] = {ix86arch::LLC_REFERENCES, +int stat; + +int thread = omp_get_thread_num(); +if( thread == 0 ) +printf("Initializing PAPI counters...\n"); + + *num_papi_events = sizeof(events) / sizeof(int); + + if ((stat = PAPI_thread_init((long unsigned int (*)(void)) omp_get_thread_num)) != PAPI_OK){ + PAPI_perror("PAPI_thread_init"); + exit(1); + } + + if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK){ + PAPI_perror("PAPI_create_eventset"); + exit(1); + } + + for( int i = 0; i < *num_papi_events; i++ ){ + if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK){ + PAPI_perror("PAPI_add_event"); + exit(1); + } + } + + if ((stat=PAPI_start(*eventset)) != PAPI_OK){ + PAPI_perror("PAPI_start"); + exit(1); + } + } + */ + +// Stops the papi counters and prints results +void counter_stop( int * eventset, int num_papi_events, Input * I ) +{ + int * events = malloc(num_papi_events * sizeof(int)); + int n = num_papi_events; + PAPI_list_events( *eventset, events, &n ); + PAPI_event_info_t info; + + long_long * values = malloc( num_papi_events * sizeof(long_long)); + PAPI_stop(*eventset, values); + int thread = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + + static long LLC_cache_miss = 0; + static long total_cycles = 0; + static long FLOPS = 0; + static long stall_any = 0; + static long stall_SB = 0; + static long stall_RS = 0; + static long stall_OO = 0; + static long tlb_load = 0; + static long tlb_load_m = 0; + static long tlb_store = 0; + static long tlb_store_m = 0; + + #pragma omp master + { + I->vals_accum = malloc( num_papi_events * sizeof(long long)); + for(int i=0; i < num_papi_events ; i ++) + I->vals_accum[i] = 0; + } + #pragma omp barrier + + #pragma omp critical (papi) + { + printf("Thread %d\n", thread); + for( int i = 0; i < num_papi_events; i++ ) + { + I->vals_accum[i] += values[i]; + PAPI_get_event_info(events[i], &info); + printf("%-15lld\t%s\t%s\n", values[i],info.symbol,info.long_descr); + if( strcmp(info.symbol, "PAPI_L3_TCM") == 0 ) + LLC_cache_miss += values[i]; + if( strcmp(info.symbol, "PAPI_TOT_CYC") == 0 ) + total_cycles += values[i]; + if( strcmp(info.symbol, "PAPI_SP_OPS") == 0 ) + FLOPS += values[i]; + if( strcmp(info.symbol, "RESOURCE_STALLS:ANY") == 0 ) + stall_any += values[i]; + if( strcmp(info.symbol, "RESOURCE_STALLS:SB") == 0 ) + stall_SB += values[i]; + if( strcmp(info.symbol, "RESOURCE_STALLS:RS") == 0 ) + stall_RS += values[i]; + if( strcmp(info.symbol, "RESOURCE_STALLS2:OOO_RSRC") == 0 ) + stall_OO += values[i]; + if( strcmp(info.symbol, "perf::DTLB-LOADS") == 0 ) + tlb_load += values[i]; + if( strcmp(info.symbol, "perf::DTLB-LOAD-MISSES") == 0 ) + tlb_load_m += values[i]; + if( strcmp(info.symbol, "perf::DTLB-STORES") == 0 ) + tlb_store += values[i]; + if( strcmp(info.symbol, "perf::DTLB-STORE-MISSES") == 0 ) + tlb_store_m += values[i]; + } + free(values); + } + { + #pragma omp barrier + } + #pragma omp master + { + if( omp_get_num_threads() > 1){ + printf("Thread Totals:\n"); + for( int i = 0; i < num_papi_events; i++ ) + { + PAPI_get_event_info(events[i], &info); + printf("%-15lld\t%s\t%s\n", I->vals_accum[i],info.symbol,info.long_descr); + } + } + free( I->vals_accum ); + + border_print(); + center_print("PERFORMANCE SUMMARY", 79); + border_print(); + long cycles = (long) (total_cycles / (double) nthreads); + double bw = LLC_cache_miss*64./cycles*2.8e9/1024./1024./1024.; + if( I->papi_event_set == 0 ) + printf("GFLOPs: %.3lf\n", FLOPS / (double) cycles * 2.8 ); + if( I->papi_event_set == 1 ) + printf("Bandwidth: %.3lf (GB/s)\n", bw); + if( I->papi_event_set == 2 ) + { + printf("%-30s %.2lf%%\n", "Store Buffer Full:", + stall_SB / (double) stall_any * 100.); + printf("%-30s %.2lf%%\n", "Reservation Station Full:", + stall_RS / (double) stall_any * 100.); + printf("%-30s %.2lf%%\n", "OO Pipeline Full:", + stall_OO / (double) stall_any * 100.); + } + if( I->papi_event_set == 3 ) + printf("CPU Stalled Cycles: %.2lf%%\n", + stall_any / (double) total_cycles * 100.); + if( I->papi_event_set == 7 ) + { + printf("%-30s %.2lf%%\n", "Data TLB Load Miss Rate: ", + tlb_load_m / (double) tlb_load * 100 ); + printf("%-30s %.2lf%%\n", "Data TLB Store Miss Rate: ", + tlb_store_m / (double) tlb_store * 100 ); + } + + border_print(); + } + free(events); +} + +#endif Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c @@ -0,0 +1,1469 @@ +#include"SimpleMOC_header.h" + +/* Efficient version of attenuate fluxes which determines the change in angular + * flux along a particular track across a fine axial region and tallies the + * contribution to the scalar flux in the fine axial region. This function + * assumes a quadratic source, which is calculated on the fly using neighboring + * source values. + * + * This version decomposes the work into many for loops for efficient SIMD + * instructions and to reduce register pressure. For a more descriptive + * (but less effiient) version of the code in terms of the underlying physics, + * see alt_attenuate_fluxes which solves the problem in a more naive, + * straightforward manner. */ +void attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I_in, + Params * params_in, float ds, float mu, float az_weight, + AttenuateVars * A ) +{ + Input I = *I_in; + Params params = *params_in; + + // unload attenuate vars + float * restrict q0 = A->q0; + float * restrict q1 = A->q1; + float * restrict q2 = A->q2; + float * restrict sigT = A->sigT; + float * restrict tau = A->tau; + float * restrict sigT2 = A->sigT2; + float * restrict expVal = A->expVal; + float * restrict reuse = A->reuse; + float * restrict flux_integral = A->flux_integral; + float * restrict tally = A->tally; + float * restrict t1 = A->t1; + float * restrict t2 = A->t2; + float * restrict t3 = A->t3; + float * restrict t4 = A->t4; + + // compute fine axial interval spacing + float dz = I.height / (I.fai * I.decomp_assemblies_ax * I.cai); + + // compute z height in cell + float zin = track->z_height - dz * + ( (int)( track->z_height / dz ) + 0.5f ); + + // compute fine axial region ID + int fine_id = (int) ( track->z_height / dz ) % I.fai; + + // compute weight (azimuthal * polar) + // NOTE: real app would also have volume weight component + float weight = track->p_weight * az_weight; + float mu2 = mu * mu; + + // load fine source region flux vector + float * FSR_flux = QSR -> fine_flux[fine_id]; + + if( fine_id == 0 ) + { + // adjust z height to account for edge + zin -= dz; + + // cycle over energy groups + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + // load neighboring sources + float y1 = QSR->fine_source[fine_id][g]; + float y2 = QSR->fine_source[fine_id+1][g]; + float y3 = QSR->fine_source[fine_id+2][g]; + + // do quadratic "fitting" + float c0 = y2; + float c1 = (y1 - y3) / (2.f*dz); + float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz); + + // calculate q0, q1, q2 + q0[g] = c0 + c1*zin + c2*zin*zin; + q1[g] = c1 + 2.f*c2*zin; + q2[g] = c2; + } + } + else if ( fine_id == I.fai - 1 ) + { + // adjust z height to account for edge + zin += dz; + + // cycle over energy groups + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + // load neighboring sources + float y1 = QSR->fine_source[fine_id-2][g]; + float y2 = QSR->fine_source[fine_id-1][g]; + float y3 = QSR->fine_source[fine_id][g]; + + // do quadratic "fitting" + float c0 = y2; + float c1 = (y1 - y3) / (2.f*dz); + float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz); + + // calculate q0, q1, q2 + q0[g] = c0 + c1*zin + c2*zin*zin; + q1[g] = c1 + 2.f*c2*zin; + q2[g] = c2; + } + } + else + { + // cycle over energy groups + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + // load neighboring sources + float y1 = QSR->fine_source[fine_id-1][g]; + float y2 = QSR->fine_source[fine_id][g]; + float y3 = QSR->fine_source[fine_id+1][g]; + + // do quadratic "fitting" + float c0 = y2; + float c1 = (y1 - y3) / (2.f*dz); + float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz); + + // calculate q0, q1, q2 + q0[g] = c0 + c1*zin + c2*zin*zin; + q1[g] = c1 + 2.f*c2*zin; + q2[g] = c2; + } + } + + // cycle over energy groups + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + // load total cross section + sigT[g] = QSR->sigT[g]; + + // calculate common values for efficiency + tau[g] = sigT[g] * ds; + sigT2[g] = sigT[g] * sigT[g]; + } + + // cycle over energy groups + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + expVal[g] = interpolateTable( params.expTable, tau[g] ); + + // Flux Integral + + // Re-used Term + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + reuse[g] = tau[g] * (tau[g] - 2.f) + 2.f * expVal[g] + / (sigT[g] * sigT2[g]); + } + + + float * psi; + if(forward) + psi = track->f_psi; + else + psi = track->b_psi; + + //#pragma vector nontemporal + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + // add contribution to new source flux + flux_integral[g] = (q0[g] * tau[g] + (sigT[g] * psi[g] - q0[g]) + * expVal[g]) / sigT2[g] + q1[g] * mu * reuse[g] + q2[g] * mu2 + * (tau[g] * (tau[g] * (tau[g] - 3.f) + 6.f) - 6.f * expVal[g]) + / (3.f * sigT2[g] * sigT2[g]); + } + + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + // Prepare tally + tally[g] = weight * flux_integral[g]; + } + + #ifdef OPENMP + omp_set_lock(QSR->locks + fine_id); + #endif + + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + FSR_flux[g] += tally[g]; + } + + #ifdef OPENMP + omp_unset_lock(QSR->locks + fine_id); + #endif + + // Term 1 + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + t1[g] = q0[g] * expVal[g] / sigT[g]; + } + // Term 2 + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + t2[g] = q1[g] * mu * (tau[g] - expVal[g]) / sigT2[g]; + } + // Term 3 + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + t3[g] = q2[g] * mu2 * reuse[g]; + } + // Term 4 + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + t4[g] = psi[g] * (1.f - expVal[g]); + } + // Total psi + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I.n_egroups; g++) + { + psi[g] = t1[g] + t2[g] + t3[g] + t4[g]; + } +} + +// single direction transport sweep +void transport_sweep( Params * params, Input * I ) +{ + if(I->mype==0) printf("Starting transport sweep ...\n"); + + // calculate the height of a node's domain and of each FSR + double node_delta_z = I->height / I->decomp_assemblies_ax; + double fine_delta_z = node_delta_z / (I->cai * I->fai); + + /* loop over tracks (implicitly azimuthal angles, tracks in azimuthal + * angles, polar angles, and z stacked rays) */ + + //print_Input_struct( I ); + long segments_processed = 0; + + #pragma omp parallel default(none) \ + shared( I, params, node_delta_z, fine_delta_z ) \ + reduction(+ : segments_processed ) + { + #ifdef OPENMP + int thread = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + unsigned int seed = time(NULL) * (thread+1); + #endif + //print_Input_struct( I ); + + #ifdef PAPI + int eventset = PAPI_NULL; + int num_papi_events; + #pragma omp critical + { + counter_init(&eventset, &num_papi_events, I); + } + #endif + + AttenuateVars A; + float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float)); + A.q0 = ptr; + ptr += I->n_egroups; + A.q1 = ptr; + ptr += I->n_egroups; + A.q2 = ptr; + ptr += I->n_egroups; + A.sigT = ptr; + ptr += I->n_egroups; + A.tau = ptr; + ptr += I->n_egroups; + A.sigT2 = ptr; + ptr += I->n_egroups; + A.expVal = ptr; + ptr += I->n_egroups; + A.reuse = ptr; + ptr += I->n_egroups; + A.flux_integral = ptr; + ptr += I->n_egroups; + A.tally = ptr; + ptr += I->n_egroups; + A.t1 = ptr; + ptr += I->n_egroups; + A.t2 = ptr; + ptr += I->n_egroups; + A.t3 = ptr; + ptr += I->n_egroups; + A.t4 = ptr; + + #pragma omp for schedule( dynamic ) + for (long i = 0; i < I->ntracks_2D; i++) + { + #if TIMING_INFO | 0 + // print progress + #ifdef OPENMP + if(I->mype==0 && thread == 0) + { + printf("\rAttenuating Tracks... (%.0lf%% completed)", + (i / ( (double)I->ntracks_2D / (double) nthreads )) + / (double) nthreads * 100.0); + } + #else + if( i % 50 == 0) + if(I->mype==0) + printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", + I->ntracks_2D ); + #endif + #endif + + + // treat positive-z traveling rays first + bool pos_z_dir = true; + for( int j = 0; j < I->n_polar_angles; j++) + { + if( j == I->n_polar_angles / 2 ) + pos_z_dir = false; + float p_angle = params->polar_angles[j]; + float mu = cos(p_angle); + + // start with all z stacked rays + int begin_stacked = 0; + int end_stacked = I->z_stacked; + + for( int n = 0; n < params->tracks_2D[i].n_segments; n++) + { + // calculate distance traveled in cell if segment completed + float s_full = params->tracks_2D[i].segments[n].length + / sin(p_angle); + + // allocate varaible for distance traveled in an FSR + float ds = 0; + + // loop over remaining z-stacked rays + for( int k = begin_stacked; k < end_stacked; k++) + { + // initialize s to full length + float s = s_full; + + // select current track + Track * track = ¶ms->tracks[i][j][k]; + + // set flag for completeion of segment + bool seg_complete = false; + + // calculate interval + int curr_interval; + if( pos_z_dir) + curr_interval = get_pos_interval(track->z_height, + fine_delta_z); + else + curr_interval = get_neg_interval(track->z_height, + fine_delta_z); + + while( !seg_complete ) + { + // flag to reset z position + bool reset = false; + + + /* calculate new height based on s + * (distance traveled in FSR) */ + float z = track->z_height + s * cos(p_angle); + + // check if still in same FSR (fine axial interval) + int new_interval; + if( pos_z_dir ) + new_interval = get_pos_interval(z, + fine_delta_z); + else + new_interval = get_neg_interval(z, + fine_delta_z); + + if( new_interval == curr_interval ) + { + seg_complete = true; + ds = s; + } + + // otherwise, we need to recalculate distances + else + { + // correct z + if( pos_z_dir ) + { + curr_interval++; + z = fine_delta_z * (float) curr_interval; + } + else{ + curr_interval--; + z = fine_delta_z * (float) curr_interval; + } + + // calculate distance travelled in FSR (ds) + ds = (z - track->z_height) / cos(p_angle); + + // update track length remaining + s -= ds; + + /* check remaining track length to protect + * against potential roundoff errors */ + if( s <= 0 ) + seg_complete = true; + + // check if out of bounds or track complete + if( z <= 0 || z >= node_delta_z ) + { + // mark segment as completed + seg_complete = true; + + // remember to no longer treat this track + if ( pos_z_dir ) + end_stacked--; + else + begin_stacked++; + + // reset z height + reset = true; + } + } + + // pick a random FSR (cache miss expected) + #ifdef OPENMP + long QSR_id = rand_r(&seed) % + I->n_source_regions_per_node; + #else + long QSR_id = rand() % + I->n_source_regions_per_node; + #endif + + /* update sources and fluxes from attenuation + * over FSR */ + if( I->axial_exp == 2 ) + { + attenuate_fluxes( track, true, + ¶ms->sources[QSR_id], + I, params, ds, mu, + params->tracks_2D[i].az_weight, &A ); + + segments_processed++; + } + + else if( I->axial_exp == 0 ) + { + attenuate_FSR_fluxes( track, true, + ¶ms->sources[QSR_id], + I, params, ds, mu, + params->tracks_2D[i].az_weight, &A ); + + segments_processed++; + } + else + { + printf("Error: invalid axial expansion order"); + printf("\n Please input 0 or 2\n"); + exit(1); + } + + // update with new z height or reset if finished + if( n == params->tracks_2D[i].n_segments - 1 + || reset) + { + if( pos_z_dir) + track->z_height = I->axial_z_sep * k; + else + track->z_height = I->axial_z_sep * (k+1); + } + else + track->z_height = z; + + } + } + } + } + } + #ifdef OPENMP + if(thread == 0 && I->mype==0) printf("\n"); + #endif + + #ifdef PAPI + if( thread == 0 ) + { + printf("\n"); + border_print(); + center_print("PAPI COUNTER RESULTS", 79); + border_print(); + printf("Count \tSmybol \tDescription\n"); + } + { + #pragma omp barrier + } + counter_stop(&eventset, num_papi_events, I); + #endif + } + I->segments_processed = segments_processed; + + return; +} + + +// run one full transport sweep, return k +void two_way_transport_sweep( Params * params, Input * I ) +{ + if(I->mype==0) printf("Starting transport sweep ...\n"); + + // calculate the height of a node's domain and of each FSR + double node_delta_z = I->height / I->decomp_assemblies_ax; + int num_intervals = (I->cai * I->fai); + double fine_delta_z = node_delta_z / num_intervals; + + /* loop over tracks (implicitly azimuthal angles, tracks in azimuthal + * angles, polar angles, and z stacked rays) */ + long segments_processed = 0; + + #pragma omp parallel default(none) \ + shared( I, params, node_delta_z, fine_delta_z, num_intervals ) \ + reduction(+ : segments_processed ) + { + #ifdef OPENMP + int thread = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + unsigned int seed = time(NULL) * (thread+1); + #endif + //print_Input_struct( I ); + + #ifdef PAPI + int eventset = PAPI_NULL; + int num_papi_events; + #pragma omp critical + { + counter_init(&eventset, &num_papi_events, I); + } + #endif + + + AttenuateVars A; + float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float)); + A.q0 = ptr; + ptr += I->n_egroups; + A.q1 = ptr; + ptr += I->n_egroups; + A.q2 = ptr; + ptr += I->n_egroups; + A.sigT = ptr; + ptr += I->n_egroups; + A.tau = ptr; + ptr += I->n_egroups; + A.sigT2 = ptr; + ptr += I->n_egroups; + A.expVal = ptr; + ptr += I->n_egroups; + A.reuse = ptr; + ptr += I->n_egroups; + A.flux_integral = ptr; + ptr += I->n_egroups; + A.tally = ptr; + ptr += I->n_egroups; + A.t1 = ptr; + ptr += I->n_egroups; + A.t2 = ptr; + ptr += I->n_egroups; + A.t3 = ptr; + ptr += I->n_egroups; + A.t4 = ptr; + + #pragma omp for schedule( dynamic ) + for (long i = 0; i < I->ntracks_2D; i++) + { + // print progress + #ifdef OPENMP + if(I->mype==0 && thread == 0) + { + printf("\rAttenuating Tracks... (%.0lf%% completed)", + (i / ( (double)I->ntracks_2D / (double) nthreads )) + / (double) nthreads * 100.0); + } + #else + if( i % 50 == 0) + if(I->mype==0) + printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", + I->ntracks_2D ); + #endif + + // allocate arrays for segment storage FIXME + double ** seg_dist = malloc( I->z_stacked * sizeof(double *) ); + Source *** seg_src = malloc( I->z_stacked * sizeof(Source**) ); + int * seg_idx = malloc( I->z_stacked * sizeof(int) ); + int * seg_size = malloc( I->z_stacked * sizeof(int) ); + + // fill matrix with arrays FIXME + for( int k = 0; k < I->z_stacked; k++) + { + seg_size[k] = 2 * I->segments_per_track; + seg_dist[k] = malloc( seg_size[k] * sizeof(double) ); + seg_src[k] = malloc( seg_size[k] * sizeof(Source *) ); + seg_idx[k] = 0; + } + + // treat positive-z traveling rays first + bool pos_z_dir = true; + for( int j = 0; j < I->n_polar_angles; j++) + { + if( j == I->n_polar_angles / 2 ) + pos_z_dir = false; + float p_angle = params->polar_angles[j]; + float mu = cos(p_angle); + + // start with all z stacked rays + int begin_stacked = 0; + int end_stacked = I->z_stacked; + + // reset semgnet indexes + for( int k = 0; k < I->z_stacked; k++) + seg_idx[k] = 0; + + for( int n = 0; n < params->tracks_2D[i].n_segments; n++) + { + // calculate distance traveled in cell if segment completed + float s_full = params->tracks_2D[i].segments[n].length + / sin(p_angle); + + // allocate varaible for distance traveled in an FSR + float ds = 0; + + // loop over remaining z-stacked rays + int tracks_completed = 0; + for( int k = begin_stacked; k < end_stacked; k++) + { + // select current track + Track * track = ¶ms->tracks[i][j][k]; + + // determine current axial interval + int interval = (int) track->z_height / fine_delta_z; + + // calculate distance to domain boundary + float bound_dist; + if( pos_z_dir) + bound_dist = (node_delta_z - track->z_height) / mu; + else + bound_dist = -track->z_height / mu; + + // determine track length + float s; + if( s_full < bound_dist ) + s = s_full; + else + { + // note completion of track + s = bound_dist; + tracks_completed++; + } + + // set flag for completeion of segment + bool seg_complete = false; + + while( !seg_complete ) + { + // initialize tracking variables + long QSR_id = interval + num_intervals * n; + float ds; + float z; + + // calculate z height of next fine axial interval + float fai_z_height; + if( pos_z_dir ) + fai_z_height = (interval + 1) * fine_delta_z ; + else + fai_z_height = interval * fine_delta_z; + + // calculate z distance to next fine axial interval + float z_dist_to_fai = + fai_z_height - track->z_height; + + /* calculate total distance (s) to fine axial + * interval */ + float s_dist_to_fai = z_dist_to_fai / mu; + + // determine if a fine axial interval is crossed + if( s_dist_to_fai < s ) + { + if( pos_z_dir ) + interval++; + else + interval--; + ds = s_dist_to_fai; + z = track->z_height + z_dist_to_fai; + } + else + { + ds = s; + z = track->z_height + s * mu; + } + + /* shorten remaining segment length and check if + * completed (accounting for potential roundoff) */ + s -= ds; + if( s <= 0 || interval < 0 + || interval >= num_intervals) + seg_complete = true; + + // pick a random FSR (cache miss expected) + #ifdef OPENMP + QSR_id = rand_r(&seed) % + I->n_source_regions_per_node; + #else + QSR_id = rand() % I->n_source_regions_per_node; + #endif + + /* update sources and fluxes from attenuation + * over FSR */ + if( I->axial_exp == 2 ) + { + attenuate_fluxes( track, true, + ¶ms->sources[QSR_id], + I, params, ds, mu, + params->tracks_2D[i].az_weight, &A ); + segments_processed++; + } + + else if( I->axial_exp == 0 ) + attenuate_FSR_fluxes( track, true, + ¶ms->sources[QSR_id], + I, params, ds, mu, + params->tracks_2D[i].az_weight, &A ); + else + { + printf("Error: invalid axial expansion order"); + printf("\n Please input 0 or 2\n"); + exit(1); + } + + // update track height + track->z_height = z; + + // save segment length and source FIXME + seg_dist[k][seg_idx[k]] = ds; + seg_src[k][seg_idx[k]] = ¶ms->sources[QSR_id]; + seg_idx[k]++; + + // check if array needs to grow FIXME + if( seg_idx[k] >= seg_size[k] ) + { + seg_size[k] *= 2; + seg_dist[k] = (double *) realloc( seg_dist[k], + seg_size[k] * sizeof(double) ); + seg_src[k] = (Source **) realloc( seg_src[k], + seg_size[k] * sizeof(Source *) ); + } + } + } + if(pos_z_dir) + end_stacked -= tracks_completed; + else + begin_stacked += tracks_completed; + } + + // loop over all z stacked rays again + for( int k = 0; k < I->z_stacked; k++ ) + { + for( int n = seg_idx[k]-1; n >= 0; n--) + { + // load distance + float ds = seg_dist[k][n]; + + // select current track + Track * track = ¶ms->tracks[i][j][k]; + + // update sources and fluxes from attenuation over FSR + if( I->axial_exp == 2 ) + { + attenuate_fluxes( track, false, + seg_src[k][n], + I, params, ds, -mu, + params->tracks_2D[i].az_weight, &A ); + segments_processed++; + } + + else if( I->axial_exp == 0 ) + attenuate_FSR_fluxes( track, false, + seg_src[k][n], + I, params, ds, -mu, + params->tracks_2D[i].az_weight, &A ); + + // update z height + track->z_height -= ds * mu; + } + } + + + /* Update all tracks with correct starting z location again + * NOTE: this is only here to acocunt for roundoff error */ + for( int k = 0; k < I->z_stacked; k++) + { + Track * track = ¶ms->tracks[i][j][k]; + if( pos_z_dir) + track->z_height = I->axial_z_sep * k; + else + track->z_height = I->axial_z_sep * (k+1); + } + } + + // free memory + for( int k = 0; k < I->z_stacked; k++) + { + free(seg_dist[k]); + free(seg_src[k]); + } + free(seg_dist); + free(seg_src); + free(seg_idx); + free(seg_size); + + } + #ifdef OPENMP + if(thread == 0 && I->mype==0) printf("\n"); + #endif + + #ifdef PAPI + if( thread == 0 ) + { + printf("\n"); + border_print(); + center_print("PAPI COUNTER RESULTS", 79); + border_print(); + printf("Count \tSmybol \tDescription\n"); + } + { + #pragma omp barrier + } + counter_stop(&eventset, num_papi_events, I); + #endif + } + //printf("Number of segments processed: %ld\n", segments_processed); + I->segments_processed = segments_processed; + + return; +} + +/* returns integer number for axial interval for tracks traveling in the + * positive direction */ +int get_pos_interval( float z, float dz) +{ + int interval = (int) (z/dz); + return interval; +} + +/* returns integer number for axial interval for tracks traveling in the + * negative direction */ +int get_neg_interval( float z, float dz) +{ + // NOTE: a bit of trickery using floors to obtain ceils + int interval = INT_MAX - (int) ( (double) INT_MAX + - (double) ( z / dz ) ); + return interval; +} + +int calc_next_fai( float z, float dz, bool pos_dir) +{ + int interval = z/dz; + float lower_z = dz * (float) interval; + if(pos_dir) + return interval + 1; + else + return interval; +} + +/* Determines the change in angular flux along a particular track across a fine + * axial region and tallies the contribution to the scalar flux in the fine + * axial region. This function assumes a quadratic source, which is calculated + * on the fly using neighboring source values. + * + * This legacy function is unused since it is less efficient than the current + * attenuate_fluxes function. However, it provides a more straightforward + * description of the underlying physical problem. */ +void alt_attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I, + Params * params, float ds, float mu, float az_weight ) +{ + // compute fine axial interval spacing + float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai); + + // compute z height in cell + float zin = track->z_height - dz * ( (int)( track->z_height / dz ) + 0.5 ); + + // compute fine axial region ID + int fine_id = (int) ( track->z_height / dz ) % I->fai; + + // compute weight (azimuthal * polar) + // NOTE: real app would also have volume weight component + float weight = track->p_weight * az_weight; + float mu2 = mu * mu; + + // load fine source region flux vector + float * FSR_flux = QSR -> fine_flux[fine_id]; + + // cycle over energy groups + for( int g = 0; g < I->n_egroups; g++) + { + // load total cross section + float sigT = QSR->sigT[g]; + + // define source parameters + float q0, q1, q2; + + // calculate source components + if( fine_id == 0 ) + { + // load neighboring sources + float y2 = QSR->fine_source[fine_id][g]; + float y3 = QSR->fine_source[fine_id+1][g]; + + // do linear "fitting" + float c0 = y2; + float c1 = (y3 - y2) / dz; + + // calculate q0, q1, q2 + q0 = c0 + c1*zin; + q1 = c1; + q2 = 0; + } + else if( fine_id == I->fai - 1 ) + { + // load neighboring sources + float y1 = QSR->fine_source[fine_id-1][g]; + float y2 = QSR->fine_source[fine_id][g]; + + // do linear "fitting" + float c0 = y2; + float c1 = (y2 - y1) / dz; + + // calculate q0, q1, q2 + q0 = c0 + c1*zin; + q1 = c1; + q2 = 0; + } + else + { + // load neighboring sources + float y1 = QSR->fine_source[fine_id-1][g]; + float y2 = QSR->fine_source[fine_id][g]; + float y3 = QSR->fine_source[fine_id+1][g]; + + // do quadratic "fitting" + float c0 = y2; + float c1 = (y1 - y3) / (2*dz); + float c2 = (y1 - 2*y2 + y3) / (2*dz*dz); + + // calculate q0, q1, q2 + q0 = c0 + c1*zin + c2*zin*zin; + q1 = c1 + 2*c2*zin; + q2 = c2; + } + + // calculate common values for efficiency + float tau = sigT * ds; + float sigT2 = sigT * sigT; + + // compute exponential ( 1 - exp(-x) ) using table lookup + float expVal = interpolateTable( params->expTable, tau ); + + // load correct angular flux vector + float * psi; + if(forward) + psi = track->f_psi; + else + psi = track->b_psi; + + // add contribution to new source flux + float flux_integral = (q0 * tau + (sigT * psi[g] - q0) * expVal) + / sigT2 + + q1 * mu * (tau * (tau - 2) + 2 * expVal) + / (sigT * sigT2) + + q2 * mu2 * (tau * (tau * (tau - 3) + 6) - 6 * expVal) + / (3 * sigT2 * sigT2); + + #pragma omp atomic + FSR_flux[g] += weight * flux_integral; + + // update angular flux + psi[g] = psi[g] * (1.0 - expVal) + q0 * expVal / sigT + + q1 * mu * (tau - expVal) / sigT2 + q2 * mu2 * + (tau * (tau - 2) + 2 * expVal) / (sigT2 * sigT); + } +} + +/* Determines the change in angular flux along a particular track across a fine + * axial region and tallies the contribution to the scalar flux in the fine + * axial region. This function assumes a constant source. */ +void attenuate_FSR_fluxes( Track * track, bool forward, Source * FSR, Input * I, + Params * params_in, float ds, float mu, float az_weight, + AttenuateVars *A) +{ + // upack attenuate vars struct + float * restrict tally = A->tally; + float * restrict expVal = A->expVal; + float * restrict sigT = A->sigT; + float * restrict tau = A->tau; + + Params params = * params_in; + + // compute fine axial interval spacing + float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai); + + // compute z height in cell + float zin = track->z_height - dz * + ( (int)( track->z_height / dz ) + 0.5f ); + + // compute fine axial region ID + int fine_id = (int) ( track->z_height / dz ) % I->fai; + + // compute weight (azimuthal * polar) + // NOTE: real app would also have volume weight component + float weight = track->p_weight * az_weight * mu; + + // load fine source region flux vector + float * FSR_flux = FSR -> fine_flux[fine_id]; + + // cycle over energy groups + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I->n_egroups; g++) + { + // load total cross section + sigT[g] = FSR->sigT[g]; + tau[g] = sigT[g] * ds; + } + + // compute exponential ( 1 - exp(-x) ) using table lookup + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for(int g = 0; g < I->n_egroups; g++) + { + expVal[g] = interpolateTable( params.expTable, tau[g] ); + } + + float * psi; + if(forward) + psi = track->f_psi; + else + psi = track->b_psi; + + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I->n_egroups; g++) + { + // compute angular flux attenuation + float q = FSR->fine_source[fine_id][g] / sigT[g]; + float delta_psi = (psi[g] - q) * expVal[g]; + + // add contribution to new source flux + tally[g] = weight * delta_psi; + + // update angular flux + psi[g] -= delta_psi; + } + + + + #ifdef OPENMP + omp_set_lock(&FSR->locks[fine_id]); + #endif + + #ifdef INTEL + #pragma simd + #elif defined IBM + #pragma simd_level(10) + #endif + for( int g = 0; g < I->n_egroups; g++) + { + FSR_flux[g] += tally[g]; + } + + #ifdef OPENMP + omp_unset_lock(&FSR->locks[fine_id]); + #endif + + +} + +/* Renormalizes scalar and angular flux for next transport sweep iteration. + * Calculation requires multiple pair-wise sums and a reduction accross all + * nodes. */ +void renormalize_flux( Params params, Input I, CommGrid grid ) +{ + if( I.mype == 0 ) printf("Renormalizing Flux...\n"); + float node_fission_rate = 0; + #ifdef OPENMP + #pragma omp parallel default(none) shared(params, I, grid) \ + reduction(+ : node_fission_rate) + { + #endif + // tally total fission rate (pair-wise sum) + float * fission_rates = malloc( I.n_source_regions_per_node + * sizeof(float) ); + + float * fine_fission_rates = malloc( I.fai * sizeof(float) ); + float * g_fission_rates = malloc( I.n_egroups * sizeof(float) ); + + // accumulate total fission rate on node domain + #pragma omp for schedule(dynamic) + for( int i = 0; i < I.n_source_regions_per_node; i++) + { + Source src = params.sources[i]; + for( int j = 0; j < I.fai; j++) + { + for( int g = 0; g < I.n_egroups; g++) + g_fission_rates[g] = src.fine_flux[j][g] * src.vol + * src.XS[g][0]; + fine_fission_rates[j] = pairwise_sum( g_fission_rates, + I.n_egroups ); + } + fission_rates[i] = pairwise_sum( fine_fission_rates, I.fai ); + } + node_fission_rate = pairwise_sum(fission_rates, + I.n_source_regions_per_node); + + // free allocated memory + free(fission_rates); + free(fine_fission_rates); + free(g_fission_rates); + + #ifdef OPENMP + } + #endif + + #ifdef MPI + // accumulate total fission rate by MPI Allreduce + float total_fission_rate = 0; + MPI_Barrier(grid.cart_comm_3d); + MPI_Allreduce( &node_fission_rate, // Send Buffer + &total_fission_rate, // Receive Buffer + 1, // Element Count + MPI_FLOAT, // Element Type + MPI_SUM, // Reduciton Operation Type + grid.cart_comm_3d ); // MPI Communicator + MPI_Barrier(grid.cart_comm_3d); + #else + float total_fission_rate = node_fission_rate; + #endif + + + // normalize fluxes by fission reaction rate + float norm_factor = 1.0 / total_fission_rate; + + #pragma omp parallel for default(none) \ + shared(I, params) private(norm_factor) schedule(dynamic) + for( int i = 0; i < I.n_source_regions_per_node; i++) + { + Source * src = ¶ms.sources[i]; + float adjust = norm_factor * 4 * M_PI * I.fai / src->vol; + for( int k = 0; k < I.fai; k++) + for( int g = 0; g < I.n_egroups; g++) + src->fine_flux[k][g] *= adjust; + } + + // normalize boundary fluxes by same factor + #pragma omp parallel for default(none) \ + shared(I, params) private(norm_factor) schedule(dynamic) + for( long i = 0; i < I.ntracks_2D; i++) + for( int j = 0; j < I.n_polar_angles; j++) + for( int k = 0; k < I.z_stacked; k++) + for( int g = 0; g < I.n_egroups; g++) + { + params.tracks[i][j][k].f_psi[g] *= norm_factor; + params.tracks[i][j][k].b_psi[g] *= norm_factor; + } + + if( I.mype == 0 ) printf("Renormalizing Flux Complete.\n"); + return; +} + +/* Updates sources for next iteration by computing scattering and fission + * components. Calculation includes multiple pair-wise sums and reductions + * accross all nodes */ +float update_sources( Params params, Input I, float keff ) +{ + // source residual + float residual; + + // calculate inverse multiplication facotr for efficiency + float inverse_k = 1.0 / keff; + + // allocate residual arrays + float * group_res = (float *) malloc(I.n_egroups * sizeof(float)); + float * fine_res = (float *) malloc(I.fai * sizeof(float)); + float * residuals = (float *) malloc(I.n_source_regions_per_node + * sizeof(float)); + + // allocate arrays for summation + float * fission_rates = malloc(I.n_egroups * sizeof(float)); + float * scatter_rates = malloc(I.n_egroups * sizeof(float)); + + // cycle through all coarse axial intervals to update source + for( long i = 0; i < I.n_source_regions_per_node; i++) + { + Source src = params.sources[i]; + + // cycle thorugh all fine axial regions to calculate new source + for( int j = 0; j < I.fai; j++) + { + // calculate total fission source and scattering source + float fission_source; + float scatter_source; + + // compute total fission source + for( int g = 0; g < I.n_egroups; g++ ) + fission_rates[g] = src.fine_flux[j][g] * src.XS[g][0]; + fission_source = pairwise_sum( fission_rates, (long) I.n_egroups); + + // normalize fission source by multiplication factor + fission_source *= inverse_k; + + // compute scattering and new total source for each group + for( int g = 0; g < I.n_egroups; g++ ) + { + for( int g2 = 0; g2 < I.n_egroups; g2++ ) + { + // compute scatter source originating from g2 -> g + scatter_rates[g2] = src.scattering_matrix[g][g2] * + src.fine_flux[j][g2]; + } + scatter_source = pairwise_sum(scatter_rates, + (long) I.n_egroups); + + // compuate new total source + float chi = src.XS[g][2]; + + // calculate new fine source + float newSrc = (fission_source * chi + scatter_source) + / (4.0 * M_PI); + + // calculate residual + float oldSrc = src.fine_source[j][g]; + group_res[g] = (newSrc - oldSrc) * (newSrc - oldSrc) + / (oldSrc * oldSrc); + + /* calculate new source in fine axial interval assuming + * isotropic source components */ + src.fine_source[j][g] = newSrc; + } + fine_res[j] = pairwise_sum(group_res, (long) I.n_egroups); + } + residuals[i] = pairwise_sum(fine_res, (long) I.fai); + } + + // calculate source residual + residual = pairwise_sum(residuals, I.n_source_regions_per_node); + + // free memory + free(fission_rates); + free(scatter_rates); + free(group_res); + free(fine_res); + free(residuals); + + + // NOTE: See code around line 600 of CPUSolver.cpp in ClosedMOC/ OpenMOC + + return residual; +} + +/* Computes globall k-effective using multiple pair-wise summations and finally + * a reduction accross all nodes */ +float compute_keff(Params params, Input I, CommGrid grid) +{ + // allocate temporary memory + float * sigma = malloc( I.n_egroups * sizeof(float) ); + float * group_rates = malloc( I.n_egroups * sizeof(float) ); + float * fine_rates = malloc( I.fai * sizeof(float) ); + float * QSR_rates = malloc( I.n_source_regions_per_node * sizeof(float) ); + + /////////////////////////////////////////////////////////////////////////// + + // compute total absorption rate, looping over source regions + for( long i = 0; i < I.n_source_regions_per_node; i++) + { + // load absorption XS data + Source src = params.sources[i]; + for( int g = 0; g < I.n_egroups; g++) + sigma[g] = src.XS[g][1]; + + for( int j = 0; j < I.fai; j++ ) + { + // calculate absorption rates + float * fine_flux = src.fine_flux[j]; + for( int g = 0; g < I.n_egroups; g++) + group_rates[g] = sigma[g] * fine_flux[g]; + + // sum absorption over all energy groups + fine_rates[j] = pairwise_sum( group_rates, (long) I.n_egroups ); + } + // sum absorption over all fine axial intervals + QSR_rates[i] = pairwise_sum( fine_rates, (long) I.fai ); + } + // sum absorption over all source regions in a node + float node_abs = pairwise_sum( QSR_rates, I.n_source_regions_per_node); + + /////////////////////////////////////////////////////////////////////////// + + // compute total absorption rate, looping over source regions + for( long i = 0; i < I.n_source_regions_per_node; i++) + { + // load nuSigmaF XS data + Source src = params.sources[i]; + for( int g = 0; g < I.n_egroups; g++) + sigma[g] = src.XS[g][0]; + + for( int j = 0; j < I.fai; j++ ) + { + // calculate absorption rates + float * fine_flux = src.fine_flux[j]; + for( int g = 0; g < I.n_egroups; g++) + group_rates[g] = sigma[g] * fine_flux[g]; + + // sum fission over all energy groups + fine_rates[j] = pairwise_sum( group_rates, (long) I.n_egroups ); + } + // sum fission over all fine axial intervals + QSR_rates[i] = pairwise_sum( fine_rates, (long) I.fai ); + } + // sum fission over all source regions in a node + float node_fission = pairwise_sum( QSR_rates, I.n_source_regions_per_node); + + /////////////////////////////////////////////////////////////////////////// + + // MPi Reduction + float tot_abs = 0; + float tot_fission = 0; + float leakage = 0; + + #ifdef MPI + + // Total Absorption Reduction + MPI_Reduce( &node_abs, // Send Buffer + &tot_abs, // Receive Buffer + 1, // Element Count + MPI_FLOAT, // Element Type + MPI_SUM, // Reduciton Operation Type + 0, // Master Rank + grid.cart_comm_3d ); // MPI Communicator + + // Total Fission Reduction + MPI_Reduce( &node_fission, // Send Buffer + &tot_fission, // Receive Buffer + 1, // Element Count + MPI_FLOAT, // Element Type + MPI_SUM, // Reduciton Operation Type + 0, // Master Rank + grid.cart_comm_3d ); // MPI Communicator + + // Total Leakage Reduction + MPI_Reduce( params.leakage, // Send Buffer + &leakage, // Receive Buffer + 1, // Element Count + MPI_FLOAT, // Element Type + MPI_SUM, // Reduciton Operation Type + 0, // Master Rank + grid.cart_comm_3d ); // MPI Communicator + + MPI_Barrier(grid.cart_comm_3d); + + // calculate keff + float keff = tot_fission/ (tot_abs + leakage); + #else + float keff = node_fission / (node_abs + *params.leakage); + #endif + + /////////////////////////////////////////////////////////////////////////// + + // free memory + free(sigma); + free(group_rates); + free(fine_rates); + free(QSR_rates); + + return keff; +} + +/* Interpolates a formed exponential table to compute ( 1- exp(-x) ) + * at the desired x value */ +float interpolateTable( Table table, float x) +{ + // check to ensure value is in domain + if( x > table.maxVal ) + return 1.0f; + else + { + int interval = (int) ( x / table.dx + 0.5f * table.dx ); + /* + if( interval >= table.N || interval < 0) + { + printf( "Interval = %d\n", interval); + printf( "N = %d\n", table.N); + printf( "x = %f\n", x); + printf( "dx = %f\n", table.dx); + exit(1); + } + */ + float slope = table.values[ 2 * interval ]; + float intercept = table.values[ 2 * interval + 1 ]; + float val = slope * x + intercept; + return val; + } +} + Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c @@ -0,0 +1,232 @@ +#include"SimpleMOC_header.h" + +// intitialize source structures and intitialize XS values to random numbers +Source * initialize_sources( Input I, size_t * nbytes ) +{ + // Allocate space + Source * sources = (Source *) malloc( I.n_source_regions_per_node + * sizeof(Source) ); + *nbytes += I.n_source_regions_per_node * sizeof(Source); + + // determine number of cross section and coarse axial regions + long n_xs_regions = I.n_source_regions_per_node / 8; + + /////////////////////////////////////////////////////////////////////////// + + // Allocate scattering matrix matrix ptrs + float *** s_matrices = (float ***) malloc( n_xs_regions + * sizeof(float**) ); + *nbytes += n_xs_regions * sizeof(float **); + + // Allocate space for ALL scattering matrix ptrs + float ** s_matrix_ptrs = (float **) malloc( n_xs_regions * I.n_egroups + * sizeof(float *)); + *nbytes += n_xs_regions * sizeof(float **); + + // Allocate space for ALL scattering data +#ifdef PRINT_MEM_SIZES + if(I.mype==0) + printf("Scattering data requires %zu MB of data...\n", + n_xs_regions * I.n_egroups * I.n_egroups + * sizeof(float) / 1024 / 1024); +#endif + float * s_matrix_data = (float *) malloc( n_xs_regions * I.n_egroups + * I.n_egroups * sizeof(float)); + *nbytes += n_xs_regions * I.n_egroups * I.n_egroups * sizeof(float); + + // Stitch allocation ptrs together + for( long i = 0; i < n_xs_regions; i++ ) + s_matrices[i] = &s_matrix_ptrs[i * I.n_egroups]; + + for( long i = 0; i < n_xs_regions; i++ ) + for( long j = 0; j < I.n_egroups; j++ ) + s_matrices[i][j] = &s_matrix_data[i * I.n_egroups * I.n_egroups + + j * I.n_egroups]; + + // Iniitalize Scattering Matrix Values + for( long i = 0; i < n_xs_regions; i++ ) + for( long j = 0; j < I.n_egroups; j++ ) + for( long k = 0; k < I.n_egroups; k++ ) + s_matrices[i][j][k] = urand(); + + /////////////////////////////////////////////////////////////////////////// + + /* + * Create data scrtucture for storing XS data (and chi) as follows: + * An array is created which stores in contigious memory as + * [ ... , nu*SigmaF, SigmaA, Chi, ...] + */ + + if(I.mype==0) printf("Beginning XS Allocation...\n"); + + // Allocate space for XS ptrs (by region) + float *** XS = (float ***) malloc( n_xs_regions * sizeof(float **) ); + *nbytes += n_xs_regions * sizeof(float **); + + // Allocate space for each XS type of interest (total, nu*SigmaF, and chi) + float ** XS_arrays = (float **) malloc (n_xs_regions * I.n_egroups + * sizeof(float *) ); + *nbytes += n_xs_regions * I.n_egroups * sizeof(float *); + + // Allocate space for total XS data + float * XS_data = (float *) malloc( n_xs_regions * I.n_egroups * 3 + * sizeof(float) ); + *nbytes += n_xs_regions * I.n_egroups * 3 * sizeof(float); + + // stitch allocation ptrs together for XS data + for( long i = 0; i < n_xs_regions; i++) + XS[i] = &XS_arrays[i * I.n_egroups]; + + for( long i = 0; i < n_xs_regions; i++) + for(long j = 0; j < I.n_egroups; j++) + XS[i][j] = &XS_data[i * I.n_egroups * 3 + j * 3]; + + // Initialize XS data + for( long i = 0; i < n_xs_regions; i++) + for( int j = 0; j < I.n_egroups; j++) + for( int k = 0; k < 3; k++) + XS[i][j][k] = urand(); + + /////////////////////////////////////////////////////////////////////////// + + /* Here I allocate useful source parameters and fluxes for use in the + * attenuate_fluxes function in solver.c - the parameters are arranged + * contigiously for efficiency */ + + if(I.mype==0) printf("Beginning Source and Flux Parameter Allocation...\n"); + + // Allocate space for source parameters (quadratic axially) + float *** fineSource = (float ***) malloc( I.n_source_regions_per_node + * sizeof(float **) ); + *nbytes += I.n_source_regions_per_node * sizeof(float **); + + // Allocate space for array pointers to parameters + float ** fineSourcePtrs = (float **) malloc ( I.n_source_regions_per_node + * I.fai * sizeof(float *) ); + *nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *); + + // Allocate space for flux parameters (quadratic axially) + float *** fineFlux = (float ***) malloc( I.n_source_regions_per_node + * sizeof(float **) ); + *nbytes += I.n_source_regions_per_node * sizeof(float **); + + // Allocate space for array pointers to parameters + float ** fineFluxPtrs = (float **) malloc ( I.n_source_regions_per_node + * I.fai * sizeof(float *) ); + *nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *); + + // Allocate space for cross section pointers + float ** sigT = (float **) malloc( I.n_source_regions_per_node + * sizeof(float *) ); + + // Allocate space for parameter data + float * data = (float *) malloc( (2 * I.fai + 1) * + I.n_source_regions_per_node * I.n_egroups * sizeof(float) ); + + *nbytes += I.n_source_regions_per_node * I.fai * I.n_egroups + * sizeof(float); + + /////////////////////////////////////////////////////////////////////////// + + // stitch pointers together + + // stitch allocation ptrs together for source data + for( long i = 0; i < I.n_source_regions_per_node; i++) + fineSource[i] = &fineSourcePtrs[i * I.fai]; + + for( long i = 0; i < I.n_source_regions_per_node; i++) + for(long j = 0; j < I.fai; j++) + fineSource[i][j] = &data[i * I.fai * I.n_egroups + + j * I.n_egroups]; + + // stitch allocation ptrs together for flux data + for( long i = 0; i < I.n_source_regions_per_node; i++) + fineFlux[i] = &fineFluxPtrs[i * I.fai]; + + for( long i = 0; i < I.n_source_regions_per_node; i++) + for(int j = 0; j < I.fai; j++) + fineFlux[i][j] = &data[I.n_egroups * I.fai * + (I.n_source_regions_per_node + i) + j * I.n_egroups]; + + // stitch allocation ptrs together for total XS + for( long i = 0; i < I.n_source_regions_per_node; i++) + sigT[i] = &data[ 2 * I.n_source_regions_per_node * I.fai * I.n_egroups + + i * I.n_egroups]; + + /////////////////////////////////////////////////////////////////////////// + + // Initialize source to random numbers + for( long i = 0; i < I.n_source_regions_per_node; i++) + for( int j = 0; j < I.fai; j++) + for( int k = 0; k < I.n_egroups; k++) + fineSource[i][j][k] = urand(); + + // Initialize fine flux to zeros + for( long i = 0; i < I.n_source_regions_per_node; i++) + for( int j = 0; j < I.fai; j++) + for( int k = 0; k < I.n_egroups; k++) + fineFlux[i][j][k] = 0; + + + // Initialize total XS to random numbers + for( long i = 0; i < I.n_source_regions_per_node; i++) + for( int k = 0; k < I.n_egroups; k++) + sigT[i][k] = urand(); + + /////////////////////////////////////////////////////////////////////////// + // + + #ifdef OPENMP + omp_lock_t * locks = init_locks(I); + long lock_idx = 0; + #endif + + // Assign to source regions + for( long i = 0; i < I.n_source_regions_per_node; i++ ) + { + long idx; + if( i == 0 ) + idx = 0; + else + idx = rand() % n_xs_regions; + + sources[i].scattering_matrix = s_matrices[idx]; + sources[i].XS = XS[idx]; + sources[i].fine_flux = fineFlux[i]; + sources[i].fine_source = fineSource[i]; + sources[i].sigT = sigT[i]; + + // initialize FSR volume + sources[i].vol = urand(); + + #ifdef OPENMP + sources[i].locks = &locks[lock_idx]; + lock_idx += I.fai; + #endif + } + + // free memory of temporary variables + free( s_matrices ); + free( XS ); + free( fineFlux ); + free( fineSource); + free( sigT); + + return sources; +} + +void free_sources( Input I, Source * sources ) +{ + // Free XS's + free( sources[0].XS[0] ); + free( sources[0].XS ); + // Free Flux's + free( sources[0].fine_flux[0] ); + free( sources[0].fine_flux ); + // Free scattering matrices + free( sources[0].scattering_matrix[0] ); + free( sources[0].scattering_matrix ); + // Free source values + free( sources[0].fine_source[0] ); + free( sources[0].fine_source ); +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c @@ -0,0 +1,71 @@ +#include"SimpleMOC_header.h" + +// prints input parameters to screen +void print_Input_struct( Input I ) +{ + printf("INPUT STRUCT\n"); + printf("%d\n", I.x_assemblies); /* Number of assemblies in the + x-axis of the reactor */ + printf("%d\n", I.y_assemblies); /* Number of assemblies in the + y-axis of the reactor */ + printf("%d\n", I.cai); // Number of course axial intervals + printf("%d\n", I.fai); /* Number of fine axial intervals + per coarse axial interval */ + printf("%d\n", I.axial_exp); // Axial source expansion order + printf("%lf\n", I.radial_ray_sep); // Radial ray separation + printf("%lf\n", I.axial_z_sep); // Axial stacked z-ray separation + printf("%d\n", I.n_azimuthal); // Number of azimuthal angles + printf("%d\n", I.n_polar_angles); // Number of polar angles + printf("%d\n", I.n_egroups); // Number of energy groups + printf("%d\n", I.decomp_assemblies_ax); /* Number of assemblies per + sub-domain (axially) */ + printf("%ld\n", I.segments_per_track); /* Average number of segments per + track */ + printf("%lf\n", I.assembly_width); // Width of an assembly + printf("%lf\n", I.height); // Height of the reactor + printf("%lf\n", I.domain_height); // Z Height of a domain + printf("%lf\n", I.precision); // precision for source convergence + printf("%ld\n", I.mype); // MPI Rank + printf("%ld\n", I.ntracks_2D); // Number of 2D tracks (derived) + printf("%d\n", I.z_stacked); // Number of z rays (derived) + printf("%ld\n", I.ntracks); /* Total number of 3D tracks per + assembly (derived) */ + printf("%d\n", I.nthreads); // Number of OpenMP Threads + printf("%d\n", I.papi_event_set); /* PAPI event set: + 0 - FLOPS + 1 - Bandwidth + 2 - CPU Stall reason + */ + + // Source regions per assembly + printf("%ld\n", I.n_2D_source_regions_per_assembly); + + // Source regions per node (derived) + printf("%ld\n", I.n_source_regions_per_node); + printf("END INPUT STRUCT\n"); + +} + +// generates normally distributed random numbers +void gen_norm_pts(float mean, float sigma, int n_pts) +{ + // generate output file + FILE * out; + out = fopen("gen_points.txt","w"); + fprintf(out, "Random numbers generated for a normal distribution\n"); + fprintf(out, "Mean = %f\n", mean); + fprintf(out, "Standard deviation = %f\n", sigma); + fprintf(out, "Generated points:\n"); + + // generate gaussian random points + for(int i = 0; i < n_pts; i++) + { + float random = nrand(mean,sigma); + fprintf(out, "%f\n", random); + } + + // close stream + fclose(out); + + return; +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c @@ -0,0 +1,327 @@ +#include"SimpleMOC_header.h" + +// Allocates and initializes an array of 2D tracks +Track2D * generate_2D_tracks( Input I, size_t * nbytes ) +{ + // Allocate space for 2D tracks + Track2D * tracks = (Track2D *) malloc( I.ntracks_2D * sizeof(Track2D)); + *nbytes += I.ntracks_2D * sizeof(Track2D); + + // Fill weights with randomized data + for( int i = 0; i < I.ntracks_2D; i++ ) + tracks[i].az_weight = urand(); + + // Allocate and randomize segments + generate_2D_segments( I, tracks, nbytes ); + + return tracks; +} + +// Allocates and initializes all segments +void generate_2D_segments( + Input I, + Track2D * tracks, + size_t * nbytes ) +{ + /* Randomize Number of segments per track, and accumulate total 2D + * segments in assembly */ + long total_segments = 0; + for( long i = 0; i < I.ntracks_2D; i++ ) + { + tracks[i].n_segments = segments_per_2D_track_distribution( I ); + total_segments += tracks[i].n_segments; + } + + // Allocate contiguous space for segments + Segment * contiguous_segments = (Segment *) malloc( total_segments + * sizeof(Segment)); + + *nbytes += total_segments * sizeof(Segment); + + // Set segments arrays to correct locations within contiguous allocation + long idx = 0; + for( long i = 0; i < I.ntracks_2D; i++ ) + { + tracks[i].segments = &contiguous_segments[idx]; + idx += tracks[i].n_segments; + } + + // Initialize segments to randomized values + for( long i = 0; i < I.ntracks_2D; i++ ) + { + for( long j = 0; j < tracks[i].n_segments; j++ ) + { + tracks[i].segments[j].length = urand() * I.assembly_width + / tracks[i].n_segments; + // source ID to be calculated on the fly + } + } +} + +// generate segments per 2D track based on a normal distribution +long segments_per_2D_track_distribution( Input I ) +{ + return nrand(I.segments_per_track, sqrt(I.segments_per_track)); +} + +// free memory associated with 2D tracks +void free_2D_tracks( Track2D * tracks ) +{ + free(tracks[0].segments); + free(tracks); +} + +// allocate memory for tracks (primarily angular fluxes) +Track *** generate_tracks(Input I, Track2D * tracks_2D, size_t * nbytes) +{ + // Allocate space for tracks (3D) + Track *** tracks = (Track ***) malloc( I.ntracks_2D * sizeof(Track **)); + *nbytes += I.ntracks_2D * sizeof(Track **); + + // Allocate pointers for tracks associated with a 2D track + Track ** tracks_in_track2D = (Track **) malloc( I.ntracks_2D * + I.n_polar_angles * sizeof(Track *)); + *nbytes += I.ntracks_2D * I.n_polar_angles * sizeof(Track *); + + // Allocate complete array of track data + Track * track_data = (Track *) malloc( I.ntracks * sizeof(Track) ); + *nbytes += I.ntracks * sizeof(Track); +#ifdef PRINT_MEM_SIZES + if(I.mype==0) printf("3D track data requires %zu MB of data...\n", + I.ntracks * sizeof(Track) / 1024 / 1024 ); +#endif + + // stitch pointers together + for( long i = 0; i < I.ntracks_2D; i++ ) + tracks[i] = &tracks_in_track2D[ i * I.n_polar_angles ]; + + for( long i = 0; i < I.ntracks_2D; i++ ) + { + for( int j = 0; j < I.n_polar_angles; j++ ) + { + tracks[i][j] = &track_data[ + (i * I.n_polar_angles + j) * I.z_stacked ]; + } + } + + // Allocate space for Flux Arrays + size_t flux_bytes_needed = 2 * I.ntracks * I.n_egroups * sizeof(float); + +#ifdef PRINT_MEM_SIZES + if(I.mype==0) printf("Flux Arrays Require %zu MB of data...\n", + flux_bytes_needed / 1024 / 1024); +#endif + + float * flux_space = (float *) malloc( flux_bytes_needed ); + *nbytes += flux_bytes_needed; + size_t flux_idx = 0; + + long offset = I.ntracks * I.n_egroups; + + for( long i = 0; i < I.ntracks_2D; i++ ) + { + for( int j = 0; j < I.n_polar_angles; j++ ) + { + for( int k = 0; k < I.z_stacked; k++ ) + { + /* bottom z heights should only have upward directed polar + * angles and similarly top should only have downward directed + * polar angles */ + if( j < I.n_polar_angles/2 ) + tracks[i][j][k].z_height = I.axial_z_sep * k; + else + tracks[i][j][k].z_height = I.axial_z_sep * ( k + 1 ); + + // set polar weight, NOTE: this is the same for same polar angle + tracks[i][j][k].p_weight = urand(); + + // assign angular flux array memory + tracks[i][j][k].f_psi = &flux_space[flux_idx]; + flux_idx += I.n_egroups; + tracks[i][j][k].b_psi = &flux_space[flux_idx]; + flux_idx += I.n_egroups; + + // set incoming flux to 0 (guess 0 for inner assemblies) + for( int g = 0; g < I.n_egroups; g++) + { + tracks[i][j][k].f_psi[g] = 0; + tracks[i][j][k].b_psi[g] = 0; + } + } + } + } + return tracks; +} + +// free memory associated with tracks +void free_tracks( Track *** tracks ) +{ + free(tracks); +} + +// assign polar angles +float * generate_polar_angles( Input I ) +{ + float * polar_angles = (float *) malloc( I.n_polar_angles * + sizeof(float) ); + + for( int i = 0; i < I.n_polar_angles; i++) + polar_angles[i] = M_PI * (i + 0.5) / I.n_polar_angles; + + return polar_angles; +} + +Track2D * load_OpenMOC_tracks(char* fname, bool CMFD_obj, Input* I, size_t* nbytes) +{ + int ret; + FILE* in; + in = fopen(fname, "r"); + printf("Reading track data from:\n%s\n",fname); + int string_length; + + /* Import Geometry metadata from the Track file */ + ret = fread(&string_length, sizeof(int), 1, in); + char geometry_to_string[string_length]; + ret = fread(geometry_to_string, sizeof(char)*string_length, 1, in); + printf("Importing ray tracing data from file...\n"); + + /* Import ray tracing metadata from the Track file */ + double spacing; + ret = fread(&I->n_azimuthal, sizeof(int), 1, in); + ret = fread(&spacing, sizeof(double), 1, in); + I->radial_ray_sep = (float) spacing; + + /* Initialize data structures for Tracks */ + int num_tracks[I->n_azimuthal]; + int num_x[I->n_azimuthal]; + int num_y[I->n_azimuthal]; + double azim_weights[I->n_azimuthal]; + + ret = fread(num_tracks, sizeof(int), I->n_azimuthal, in); + ret = fread(num_x, sizeof(int), I->n_azimuthal, in); + ret = fread(num_y, sizeof(int), I->n_azimuthal, in); + ret = fread(azim_weights, sizeof(double), I->n_azimuthal, in); + + /* Import azimuthal angle quadrature weights from Track file */ + double x0, y0, x1, y1; + double phi; + int azim_angle_index; + int num_segments; + double length; + int material_id; + int region_id; + int mesh_surface_fwd; + int mesh_surface_bwd; + long num_2D_tracks; + + /* Calculate the total number of 2D Tracks */ + I->ntracks_2D = 0; + for (int i=0; i < I->n_azimuthal; i++) + I->ntracks_2D += (long) num_tracks[i]; + + /* Allocate memory for 2D tracks */ + Track2D * tracks = (Track2D *) malloc( I->ntracks_2D * sizeof(Track2D)); + *nbytes += I->ntracks_2D * sizeof(Track2D); + + // Allocate memory for the number of segments per Track array + long tot_num_segments = 0; + + fpos_t checkpoint; + fgetpos(in, &checkpoint); + + // Loop over tracks to determine total number of segments + for (int i=0; i < I->n_azimuthal; i++) + { + for (int j=0; j < num_tracks[i]; j++) + { + // Skip to segment number information + fseek( in, 5*sizeof(double) + sizeof(int), SEEK_CUR); + + // Load number of segments + ret = fread(&num_segments, sizeof(int), 1, in); + tot_num_segments += (long) num_segments; + + // Loop over all segments in this Track + for (int s=0; s < num_segments; s++) + { + // Skip ahead again + fseek( in, 2*sizeof(int) + sizeof(double), SEEK_CUR); + if (CMFD_obj) + fseek( in, 2*sizeof(int), SEEK_CUR); + } + } + } + + // Allocate contiguous space for segments + Segment * contiguous_segments = (Segment *) malloc( tot_num_segments + * sizeof(Segment)); + *nbytes += tot_num_segments * sizeof(Segment); + + // Reset file handle + fsetpos(in, &checkpoint); + + /* Initialize 2D track unique identifer (uid) end semgnet index (idx)*/ + int uid = 0; + int idx = 0; + + /* Loop over Tracks */ + for (int i=0; i < I->n_azimuthal; i++) + { + //TODO: recored actual azimuthal angles + for (int j=0; j < num_tracks[i]; j++) + { + /* Import data for this Track from Track file */ + ret = fread(&x0, sizeof(double), 1, in); + ret = fread(&y0, sizeof(double), 1, in); + ret = fread(&x1, sizeof(double), 1, in); + ret = fread(&y1, sizeof(double), 1, in); + ret = fread(&phi, sizeof(double), 1, in); + ret = fread(&azim_angle_index, sizeof(int), 1, in); + + /* Load number of segments */ + ret = fread(&num_segments, sizeof(int), 1, in); + tracks[uid].n_segments = (long) num_segments; + + /* Allocate memory for segments */ + tracks[uid].segments = &contiguous_segments[idx]; + idx += tracks[uid].n_segments; + + // TODO: set real azimuthal weight + tracks[uid].az_weight = urand(); + + /* Loop over all segments in this Track */ + for (int s=0; s < num_segments; s++) + { + /* Import data for this segment from Track file */ + ret = fread(&length, sizeof(double), 1, in); + ret = fread(&material_id, sizeof(int), 1, in); + ret = fread(®ion_id, sizeof(int), 1, in); + + /* record track length */ + tracks[uid].segments[s].length = (float) length; + + /* record region_id (not used at the moment) */ + tracks[uid].segments[s].source_id = (long) region_id; + + /* Import CMFD-related data if needed */ + if (CMFD_obj) + { + ret = fread(&mesh_surface_fwd, sizeof(int), 1, in); + ret = fread(&mesh_surface_bwd, sizeof(int), 1, in); + } + } + uid++; + } + } + // recalculate average number of segments per track + I->segments_per_track = tot_num_segments / I->ntracks_2D; + + printf("Number of 2D tracks = %ld\n", I->ntracks_2D); + I->ntracks = I->ntracks_2D * I->n_polar_angles * I->z_stacked; + printf("Number of 3D tracks = %ld\n", I->ntracks); + printf("Number of segments = %ld\n", tot_num_segments); + + /* Close the Track file */ + fclose(in); + return tracks; +} Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c =================================================================== --- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c +++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c @@ -0,0 +1,155 @@ +#include"SimpleMOC_header.h" + +// generates a random number between 0 and 1 +float urand(void) +{ + return (float)rand() / (float)RAND_MAX; +} + +/* generates a random number from a normal distribution of a + * given mean and standard deviation (Box-Muller) */ +float nrand(float mean, float sigma) +{ + // generate two random numbers + float rand1 = urand(); + float rand2 = urand(); + + /* use first for "exponential part" and second for + * "azimuthal part" to create a standard normal random number */ + float x = sqrt( -2 * log(rand1) ) * cos( 2 * M_PI * rand2); + + // NOTE: We can generate a second random number if neeeded + // y = sqrt(-2*log(rand1)) * sin( 2 * M_PI * rand2); + + // shift random number to appropriate normal distribution and return + return x * sigma + mean; +} + +// pairwise summation for long arrays +float pairwise_sum( float * vector, long size ){ + float sum = 0; + + // Base case: perform summation if size <= 16 + if( size <= 16) + for( int i = 0; i < size; i++ ) + sum += vector[i]; + + else + { + // otherwise, split + sum = pairwise_sum( &vector[0], size/2 ) + + pairwise_sum( &vector[size/2], size - size/2 ); + } + + return sum; +} + +// Builds a table of exponential values for linear interpolation +Table buildExponentialTable( float precision, float maxVal ) +{ + // define table + Table table; + + // compute number of arry values + int N = (int) ( maxVal * sqrt(1.0 / ( 8.0 * precision * 0.01 ) ) ); + + // compute spacing + float dx = maxVal / (float) N; + + // allocate an array to store information + float * tableVals = malloc( (2 * N ) * sizeof(float) ); + + // store linear segment information (slope and y-intercept) + for( int n = 0; n < N; n++ ) + { + // compute slope and y-intercept for ( 1 - exp(-x) ) + float exponential = exp( - n * dx ); + tableVals[ 2*n ] = - exponential; + tableVals[ 2*n + 1 ] = 1 + ( n * dx - 1 ) * exponential; + } + + // assign data to table + table.dx = dx; + table.values = tableVals; + table.maxVal = maxVal - table.dx; + table.N = N; + + return table; +} + +// Timer function. Depends on if compiled with MPI, openmp, or vanilla +double get_time(void) +{ + #ifdef MPI + return MPI_Wtime(); + #endif + + #ifdef OPENMP + return omp_get_wtime(); + #endif + + time_t time; + time = clock(); + + return (double) time / (double) CLOCKS_PER_SEC; +} + +// Memory Usage Estimator +size_t est_mem_usage( Input I ) +{ + size_t nbytes = 0; + + int z_stacked = (int) ( I.height / (I.axial_z_sep * + I.decomp_assemblies_ax) ); + + long n_xs_regions = I.n_source_regions_per_node / 8; + + nbytes += I.ntracks_2D * sizeof(Track2D); + nbytes += I.segments_per_track * I.ntracks_2D * sizeof(Segment); + nbytes += I.ntracks_2D * sizeof(Track **); + nbytes += I.ntracks_2D * I.n_polar_angles * sizeof(Track *); + nbytes += I.ntracks * sizeof(Track); + nbytes += I.ntracks_2D * I.n_polar_angles * z_stacked + * I.n_egroups * sizeof(float) * 2; + nbytes += I.n_source_regions_per_node * sizeof(Source); + nbytes += n_xs_regions * sizeof(float **); + nbytes += n_xs_regions * sizeof(float **); + nbytes += n_xs_regions * I.n_egroups * I.n_egroups * sizeof(float); + nbytes += n_xs_regions * sizeof(float **); + nbytes += n_xs_regions * I.n_egroups * sizeof(float *); + nbytes += n_xs_regions * I.n_egroups * 3 * sizeof(float); + nbytes += I.n_source_regions_per_node * sizeof(float **); + nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *); + nbytes += I.n_source_regions_per_node * sizeof(float **); + nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *); + nbytes += I.n_source_regions_per_node * I.fai * I.n_egroups + * sizeof(float); + + // 2way tracking memory + nbytes += I.nthreads * I.z_stacked * sizeof(double *); + nbytes += I.nthreads * I.z_stacked * sizeof(Source**); + nbytes += I.nthreads * I.z_stacked * sizeof(int); + nbytes += I.nthreads * I.z_stacked * sizeof(int); + nbytes += I.nthreads * I.z_stacked * 2 * I.segments_per_track + * sizeof(double); + nbytes += I.nthreads * I.z_stacked * 2 * I.segments_per_track + * sizeof(Source *); + + // MPI Buffers + #ifdef MPI + nbytes += 6 * I.n_egroups * 10000 * sizeof(float); + #endif + + return nbytes; +} + +// Calculates Time per Intersection +double time_per_intersection( Input I, double time ) +{ + /* This was the old estimate - new way uses actual tracking data + double tpi = time / (double) I.ntracks / + (double) I.segments_per_track / (double) I.n_egroups / 1e-9 / 2; + */ + double tpi = time / (double) I.segments_processed * 1.0e9 / (double) I.n_egroups; + return tpi; +}