Index: test-suite/trunk/LICENSE.TXT
===================================================================
--- test-suite/trunk/LICENSE.TXT
+++ test-suite/trunk/LICENSE.TXT
@@ -88,6 +88,7 @@
 XSBench:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench
 miniGMG:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG
 RSBench:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/RSBench
+SimpleMOC:          llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC
 CLAMR:              llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR
 HPCCG:              llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG
 PENNANT:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
@@ -3,3 +3,4 @@
 add_subdirectory(miniAMR)
 add_subdirectory(miniGMG)
 add_subdirectory(RSBench)
+add_subdirectory(SimpleMOC)
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile
@@ -5,6 +5,7 @@
                 miniAMR    \
                 Pathfinder \
                 miniGMG    \
-                RSBench
+                RSBench    \
+                SimpleMOC
 
 include $(LEVEL)/Makefile.programs
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(PROG SimpleMOC)
+list(APPEND LDFLAGS -lm)
+list(APPEND CFLAGS -std=gnu99)
+set(RUN_OPTIONS -i ${CMAKE_CURRENT_SOURCE_DIR}/default.in)
+llvm_multisource()
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 John Tramm
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile
@@ -0,0 +1,7 @@
+LEVEL = ../../../..
+
+PROG = SimpleMOC
+LDFLAGS = -lm
+CFLAGS = -std=gnu99
+RUN_OPTIONS = -i $(PROJ_SRC_DIR)/default.in
+include $(LEVEL)/MultiSource/Makefile.multisrc
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output
@@ -0,0 +1,12 @@
+                                 INITIALIZATION
+================================================================================
+Initializing 2D tracks...
+Initializing 3D tracks...
+Initializing flat source regions...
+Beginning XS Allocation...
+Beginning Source and Flux Parameter Allocation...
+================================================================================
+Starting transport sweep ...
+Renormalizing Flux...
+Renormalizing Flux Complete.
+exit 0
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h
@@ -0,0 +1,262 @@
+#ifndef __SimpleMOC_header
+#define __SimpleMOC_header
+
+#include<stdio.h>
+#include<stdlib.h>
+#include<math.h>
+#include<string.h>
+#include<time.h>
+#include<stdbool.h>
+#include<limits.h>
+#include<assert.h>
+#include<pthread.h>
+#include<unistd.h>
+
+#ifdef MPI
+#include<mpi.h>
+#endif
+
+#ifdef OPENMP
+#include<omp.h>
+#endif
+
+#ifdef PAPI
+#include<papi.h>
+#endif
+
+#define TIMIING_INFO 0
+
+#include "glibc_compat_rand.h"
+#ifndef NO_GLIBC_COMPAT_RAND
+#define rand glibc_compat_rand
+#define srand glibc_compat_srand
+#endif
+
+// User inputs
+typedef struct{
+	int x_assemblies;          	/* Number of assemblies in the x-axis of the 
+								   reactor */
+	int y_assemblies;           /* Number of assemblies in the y-axis of the 
+								   reactor */
+	int cai;                    // Number of course axial intervals
+	int fai;                    /* Number of fine axial intervals per coarse 
+								   axial interval */
+	int axial_exp;             	// Axial source expansion order
+	float radial_ray_sep;     	// Radial ray separation
+	float axial_z_sep;        	// Axial stacked z-ray separation
+	int n_azimuthal;           	// Number of azimuthal angles
+	int n_polar_angles;        	// Number of polar angles
+	int n_egroups;             	// Number of energy groups
+	bool decompose;            	// Turn decomposition on/off
+	int decomp_assemblies_ax;  	// Number of sub-domains per assembly (axially)
+	long segments_per_track;   	// Average number of segments per track
+	float assembly_width;     	// Width of an assembly - 1.26 x 17 cm
+	float height;             	// Height of the reactor - 400 cm
+	float domain_height;      	// Z Height of a domain
+	float precision;		   	// precision for source convergence
+	long mype;                 	// MPI Rank
+	long ntracks_2D;           	// Number of 2D tracks (derived)
+	int z_stacked;             	// Number of z rays (derived)
+	long ntracks;              	/* Total number of 3D tracks per assembly
+								   (derived) */
+	int nthreads;              	// Number of OpenMP Threads
+	int papi_event_set;         // PAPI event set
+	// 0 - FLOPS   1 - Bandwidth   2 - CPU Stall reason
+	
+	// Source regions per assembly (3M estimate)
+	long n_2D_source_regions_per_assembly;
+
+	// Source regions per node (derived)	
+	long n_source_regions_per_node; 
+
+    #ifdef PAPI
+    // String for command line PAPI event
+    char event_name[PAPI_MAX_STR_LEN]; 
+    // Array to accumulate PAPI counts across all threads
+    long long *vals_accum;
+    #endif
+
+	bool load_tracks;		// Turn on/off loading 2D tracks from file
+	char* track_file;		// Name/address of tracking file to load
+
+	long segments_processed;    // Total number of segments processed per node
+
+} Input;
+
+// Localized geometrical region ID
+typedef struct{
+	long assembly;             // Assembly ID
+	long pin;                  // Pin Cell ID
+	long zone;                 // Zone (inside pin cell) ID
+} RegionID;
+
+// Segment Structure
+typedef struct{
+	float length;
+	long source_id;
+} Segment;
+
+// Track2D Structure
+typedef struct{
+	float az_weight;          	// Azimuthal Quadrature Weight (rand)
+	long n_segments;           	// Number of Segments (gaussian)
+	Segment * segments;        	// Array of Segments
+	int n_3D_segments;			// Number of intersections in 3D tracks
+} Track2D;
+
+// Track Structure
+typedef struct{
+	float p_weight;				// Polar Quadrature Weight     (rand)
+	float z_height;           	// Z-height
+	long rank_in;              	// MPI rank to receive from
+	long rank_out;             	// MPI rank to send to
+	float * f_psi;			   	// Forward angular flux along track
+	float * b_psi;				// Backward angular flux along track
+} Track;
+
+// Source Region Structure
+typedef struct{
+	float ** fine_flux;
+	float ** fine_source;
+	float vol;
+	float * sigT;
+	float ** XS;
+	float ** scattering_matrix;
+	#ifdef OPENMP
+	omp_lock_t * locks;
+	#endif
+} Source;
+
+// Table structure for computing exponential
+typedef struct{
+	float * values;
+	float dx;
+	float maxVal;
+	int N;
+} Table;
+
+// Params Structure for easier data pointer passing
+typedef struct{
+	Track2D * tracks_2D;
+	Track *** tracks;
+	Source * sources;
+   	float * polar_angles;
+	float * leakage;
+	Table expTable;
+} Params;
+
+// MPI 3D Grid info
+typedef struct{
+	#ifdef MPI
+	MPI_Comm cart_comm_3d;
+	MPI_Datatype Flux_Array;
+	#endif
+	int x_pos_src;
+	int x_pos_dest;
+	int x_neg_src;
+	int x_neg_dest;
+	int y_pos_src;
+	int y_pos_dest;
+	int y_neg_src;
+	int y_neg_dest;
+	int z_pos_src;
+	int z_pos_dest;
+	int z_neg_src;
+	int z_neg_dest;
+} CommGrid;
+
+// Attenuation Arrays
+typedef struct{
+	float * q0;
+	float * q1;
+	float * q2;
+	float * sigT;
+	float * tau;
+	float * sigT2;
+	float * expVal;
+	float * reuse;
+	float * flux_integral;
+	float * tally;
+	float * t1;
+	float * t2;
+	float * t3;
+	float * t4;
+} AttenuateVars;
+
+
+// init.c
+Input set_default_input( void );
+void set_small_input( Input * I );
+Params build_tracks( Input* I );
+CommGrid init_mpi_grid( Input I );
+void calculate_derived_inputs( Input * I );
+#ifdef OPENMP
+omp_lock_t * init_locks( Input I );
+#endif
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int( int a );
+void print_input_summary(Input input);
+void read_CLI( int argc, char * argv[], Input * input );
+void print_CLI_error(void);
+void read_input_file( Input * I, char * fname);
+
+// tracks.c
+Track2D * generate_2D_tracks( Input input, size_t * nbytes );
+void generate_2D_segments( Input input, Track2D * tracks,
+	   	size_t * nbytes );
+void free_2D_tracks( Track2D * tracks );
+Track *** generate_tracks(Input input, Track2D * tracks_2D, size_t * nbytes);
+void free_tracks( Track *** tracks );
+long segments_per_2D_track_distribution( Input I );
+float * generate_polar_angles( Input I );
+Track2D * load_OpenMOC_tracks(char* fname, bool CMFD_obj, Input* I, size_t* nbytes);
+
+// utils.c
+float urand(void);
+float nrand(float mean, float sigma);
+float pairwise_sum(float * vector, long size);
+Table buildExponentialTable( float precision, float maxVal );
+float interpolateTable( Table table, float x);
+double get_time(void);
+size_t est_mem_usage( Input I );
+double time_per_intersection( Input I, double time );
+
+// source.c
+Source * initialize_sources( Input I, size_t * nbytes );
+void free_sources( Input I, Source * sources );
+
+// solver.c
+void transport_sweep( Params * params, Input * I );
+int get_pos_interval( float z, float dz);
+int get_neg_interval( float z, float dz);
+int get_alt_neg_interval( float z, float dz);
+void attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I, 
+		Params * params, float ds, float mu, float az_weight, AttenuateVars * A ); 
+void attenuate_FSR_fluxes( Track * track, bool forward, Source * FSR, Input * I,
+		Params * params, float ds, float mu, float az_weight, AttenuateVars * A );
+void alt_attenuate_fluxes( Track * track, bool forward, Source * FSR, Input * I,
+		Params * params, float ds, float mu, float az_weight );
+void renormalize_flux( Params params, Input I, CommGrid grid );
+float update_sources( Params params, Input I, float keff );
+float compute_keff( Params params, Input I, CommGrid grid);
+
+// test.c
+void gen_norm_pts(float mean, float sigma, int n_pts);
+void print_Input_struct( Input I );
+
+// papi.c
+void papi_serial_init(void);
+void counter_init( int *eventset, int *num_papi_events, Input * I );
+void counter_stop( int * eventset, int num_papi_events, Input * I );
+
+// comms.c
+#ifdef MPI
+void fast_transfer_boundary_fluxes( Params params, Input I, CommGrid grid);
+void transfer_boundary_fluxes( Params params, Input I, CommGrid grid);
+#endif
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c
@@ -0,0 +1,199 @@
+#include"SimpleMOC_header.h"
+
+#ifdef MPI
+// Transfer information between nodes (angular fluxes)
+void fast_transfer_boundary_fluxes( Params params, Input I, CommGrid grid)
+{
+	MPI_Barrier(grid.cart_comm_3d);
+	if(I.mype==0) printf("Beginning Inter-Node Border Flux Transfer...\n");
+
+	int tracks_per_msg = 10000;
+
+	float h = I.domain_height;
+	float x = I.assembly_width;
+
+	// calculate number of tracks to each surface
+	long ntracks_per_axial_direction  = I.ntracks * x / (2*x + 4*h);
+	long ntracks_per_radial_direction = I.ntracks * h / (2*x + 4*h);
+
+	// correct so that all tracks are used and are symmetric
+	long remaining_tracks = I.ntracks - 2 * ntracks_per_axial_direction
+	   - 4 * ntracks_per_radial_direction;
+
+	long add_radial = remaining_tracks * ( 4*h / (2*x + 4*h) );
+	add_radial = 4 * (add_radial / 4);
+	ntracks_per_radial_direction += add_radial / 4;
+	
+	long add_axial = remaining_tracks - add_radial;
+	ntracks_per_axial_direction += add_axial / 2;
+
+	// Calculate all requests needed
+	long max_requests = ntracks_per_radial_direction / tracks_per_msg;
+	max_requests *= 4;
+	max_requests += 2 * (ntracks_per_axial_direction / tracks_per_msg );
+
+	// One for send, one for receive
+	max_requests *= 2;
+
+	long send_idx = 0;
+	MPI_Status stat;
+
+	// Computer Message Size
+	size_t bytes = I.n_egroups * sizeof(float) * tracks_per_msg;
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) printf("MPI Message Size: %.2lf (MB)\n",
+			bytes / 1024. / 1024. );
+#endif
+	
+
+	// Use knowledge of underlying flux structure for efficiency
+	float * flux_array = params.tracks[0][0][0].f_psi;
+
+	// TODO: Send reverse direction as well!!!
+
+	// make an array of radial sending destinations
+	int send_dest[6] = 
+	{
+		grid.x_pos_dest,
+		grid.x_neg_dest,
+		grid.y_pos_dest,
+		grid.y_neg_dest,
+		grid.z_pos_dest,
+		grid.z_neg_dest
+	};
+	// make an array of radial receiving sources
+	int rec_sources[6] = 
+	{
+		grid.x_pos_src,
+		grid.x_neg_src,
+		grid.y_pos_src,
+		grid.y_neg_src,
+		grid.z_pos_src,
+		grid.z_neg_src
+	};
+	
+	// make an array of number of messages
+	// NOTE: There is some rounding here, should be corrected in real app
+	long num_messages[6] =
+	{
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_axial_direction / tracks_per_msg,
+		ntracks_per_axial_direction / tracks_per_msg
+	};
+
+
+	// send_idx is now the beginning of the non-border region memory
+	// i.e., we need to actually MPI Send/Recv the rest of the data
+
+	// calculate the maximum number of messages sent in any direction
+	long max_msgs_per_dir = 0;
+	for( int i = 0; i < 6; i++ )
+		if( num_messages[i] > max_msgs_per_dir )
+		   max_msgs_per_dir = num_messages[i];
+
+	// Flux Memory is assumed to be laid out as follows:
+	// (Border Flux) --- (Send_MSG_Dir_0) (Recv_MSG_Dir_0) (Send_MSG_Dir_1) (Recv_MSG_Dir_1)....
+
+	// New Comms
+	float ** buffer = (float **) malloc(6 * sizeof(float*));
+	float * _buffer = (float *) malloc( 6 * tracks_per_msg * I.n_egroups * sizeof(float));
+	for( int i = 0; i < 6; i++ )
+		buffer[i] = &_buffer[i * tracks_per_msg * I.n_egroups];
+
+	long idx = 0;
+	for( long i = 0; i < max_msgs_per_dir; i++ )
+	{
+		MPI_Request request[12];
+		int active[6] = {0};
+		int mpi_send[6] = {0};
+		int mpi_recv[6] = {0};
+		long bookmark = idx;
+		for( int j = 0; j < 6; j++ )
+		{
+			if( i >= num_messages[j] )
+				continue;
+			
+			// check if border assembly
+			else if( send_dest[j] == -1 )
+			{
+				* params.leakage += pairwise_sum( &flux_array[idx],
+						I.n_egroups * tracks_per_msg );
+				idx += (long) I.n_egroups * tracks_per_msg;
+			}
+			else
+			{
+				MPI_Isend(
+					&flux_array[idx], 	// Send Buffer
+					tracks_per_msg,     // Number of Elements
+					grid.Flux_Array,  	/* Type of element 
+											(all energy group array) */
+					send_dest[j],	  	// Destination MPI rank
+					j,                	// Message ID
+					grid.cart_comm_3d,	// MPI Communicator
+					&request[j] );   	/* MPI Request (to monitor 
+											when call finishes) */
+				idx += (long) I.n_egroups * tracks_per_msg;
+				mpi_send[j] = 1;
+			}
+		}
+		for( int j = 0; j < 6; j++ )
+		{
+			if( i >= num_messages[j] )
+				continue;
+			
+			// Check if Border Case
+			else if( rec_sources[j] == -1)
+				for( long k =0; k < I.n_egroups * tracks_per_msg; k++)
+					buffer[j][k] = 0;
+			else
+			{
+				MPI_Irecv(
+						buffer[j],   		// Recv Buffer
+						tracks_per_msg,   	// Number of Elements
+						grid.Flux_Array,  	/* Type of element 
+												(all energy group array) */
+						rec_sources[j],		// MPI rank to Receive From
+						j,              	// Message ID
+						grid.cart_comm_3d,	// MPI Communicator
+						&request[6+j] ); 		/* MPI Request (to monitor 
+												when call finishes) */
+				mpi_recv[j] = 1;
+			}
+			active[j] = 1;
+		}
+
+		// Block for Comm Round to complete & copy received data out of buffer
+		for( int j = 0; j < 6; j++ )
+		{
+			if( mpi_send[j] == 1 )
+			{
+				MPI_Wait( &request[j], &stat );
+			}
+			if( mpi_recv[j] == 1 )
+			{
+				MPI_Wait( &request[6+j], &stat );
+			}
+			if( active[j] == 1 )
+			{
+				memcpy(&flux_array[bookmark], buffer[j],
+						I.n_egroups * tracks_per_msg * sizeof(float));
+				bookmark += (long) I.n_egroups*tracks_per_msg;
+			}
+		}
+
+
+	}
+
+	free(&buffer[0][0]);
+	free(buffer);
+
+
+	MPI_Barrier( grid.cart_comm_3d );
+	MPI_Barrier( MPI_COMM_WORLD);
+
+	if(I.mype==0) printf("Finished Inter-Node Border Flux Transfer.\n");
+}
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in
@@ -0,0 +1,20 @@
+10 - x_assemblies - Number of assemblies in the x-axis of the reactor
+10 - y_assemblies - Number of assemblies in the y-axis of the reactor
+9 - cai - Number of coarse axial intervals
+5 - fai - Number of fine axial intervals per coarse  axial interval
+2 - axial_exp - Axial source expansion order
+0.05 - radial_ray_sep -  Radial ray separation
+0.25 - axial_z_sep -  Axial stacked z-ray separation
+6 - n_azimuthal  -  Number of azimuthal angles (should be 32)
+10 - n_polar_angles -  Number of polar angles
+10 - n_egroups -  Number of energy groups
+1 - decompose - Turn decomposition on/off (true = 1, false = 0) 
+20 - decomp_assemblies_ax - Number of assemblies per sub-domain (axially) 
+20 - segments_per_track - Average number of segments per track
+21.42 - assembly_width - Width of an assembly - 1.26 x 17 cm
+50.0 - height - Height of the reactor
+0.01 - precision - precision for source convergence
+1000 - n_2D_source_regions_per_assembly - 2D src regions per assembly 
+0 - papi_event_set - PAPI Event Set Choice
+0 - Turn on/off load tracks from OpenMOC (true = 1, false = 0)
+file_name - name of OpenMOC tracking file to load
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h
@@ -0,0 +1,16 @@
+/*===------------- glibc_compat_rand.h- glibc rand emulation --------------===*\
+ * |*
+ * |*                     The LLVM Compiler Infrastructure
+ * |*
+ * |* This file is distributed under the University of Illinois Open Source
+ * |* License. See LICENSE.TXT for details.
+ * |*
+ * \*===----------------------------------------------------------------------===*/
+
+#ifndef GLIBC_COMPAT_RAND_H
+#define GLIBC_COMPAT_RAND_H
+
+int glibc_compat_rand(void);
+void glibc_compat_srand(unsigned int seed);
+
+#endif /* GLIBC_COMPAT_RAND_H */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c
@@ -0,0 +1,60 @@
+/*===------------ glibc_compat_rand.c - glibc rand emulation --------------===*\
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#include "glibc_compat_rand.h"
+
+/**
+ * This rand implementation is designed to emulate the implementation of
+ * rand/srand in recent versions of glibc. This is used for programs which
+ * require this specific rand implementation in order to pass verification
+ * tests.
+ *
+ * For more information, see: http://www.mathstat.dal.ca/~selinger/random/
+ **/
+
+#define TABLE_SIZE 344
+static unsigned int table[TABLE_SIZE];
+static int next;
+
+int glibc_compat_rand(void) {
+  /* Calculate the indices i-3 and i-31 in the circular vector. */
+  int i3 = (next < 3) ? (TABLE_SIZE + next - 3) : (next - 3);
+  int i31 = (next < 31) ? (TABLE_SIZE + next - 31) : (next - 31);
+
+  table[next] = table[i3] + table[i31];
+  unsigned int r = table[next] >> 1;
+
+  ++next;
+  if (next >= TABLE_SIZE)
+    next = 0;
+
+  return r;
+}
+
+void glibc_compat_srand(unsigned int seed) {
+  if (seed == 0)
+    seed = 1;
+
+  table[0] = seed;
+
+  for (int i = 1; i < 31; i++) {
+    int r = (16807ll * table[i - 1]) % 2147483647;
+    if (r < 0)
+      r += 2147483647;
+
+    table[i] = r;
+  }
+
+  for (int i = 31; i < 34; i++)
+    table[i] = table[i - 31];
+  for (int i = 34; i < TABLE_SIZE; i++)
+    table[i] = table[i - 31] + table[i - 3];
+
+  next = 0;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c
@@ -0,0 +1,249 @@
+#include"SimpleMOC_header.h"
+
+// Calculate Derived Inputs
+void calculate_derived_inputs( Input * I )
+{
+	
+	#ifdef MPI
+	int mype;
+	MPI_Comm_rank(MPI_COMM_WORLD, &mype);
+	I->mype = mype;
+	#endif
+
+	/* Divide number of azimuthal angles by 2 to accound for forward/backward
+	 *  tracking */
+	I->n_azimuthal /= 2;
+
+	// calculate number of 2D tracks, enforcing divisible by 2
+	I->ntracks_2D = I->n_azimuthal * 
+			(I->assembly_width * sqrt(2) / I->radial_ray_sep);
+
+	I->ntracks_2D = 2 * ( I->ntracks_2D / 2 );
+
+	I->z_stacked = (int) ( I->height / (I->axial_z_sep * I->decomp_assemblies_ax));
+	I->ntracks = I->ntracks_2D * I->n_polar_angles * I->z_stacked;  
+	I->domain_height = I->height / I->decomp_assemblies_ax;
+
+	I->n_source_regions_per_node = I->n_2D_source_regions_per_assembly *
+		I->cai / I->decomp_assemblies_ax;
+
+}
+
+// Gets I from user and sets defaults
+Input set_default_input( void )
+{
+	Input I;
+
+	I.x_assemblies = 17;      	/* Number of assemblies in the x-axis 
+								   of the reactor */
+	I.y_assemblies = 17;        /* Number of assemblies in the y-axis 
+								   of the reactor */
+	I.cai = 27;                	// Number of coarse axial intervals
+	I.fai = 5;                  /* Number of fine axial intervals per coarse 
+								   axial interval */
+	I.axial_exp = 2;            // Axial source expansion order
+	I.radial_ray_sep = 0.05;    // Radial ray separation
+	I.axial_z_sep = 0.25;       // Axial stacked z-ray separation
+	I.n_azimuthal = 64;         // Number of azimuthal angles (should be 32)
+	I.n_polar_angles = 10;      // Number of polar angles
+	I.n_egroups = 104;        	// Number of energy groups
+	I.decompose = true;      	/* Turn decomposition on/off (true = on, 
+								   flase = off) */
+	I.decomp_assemblies_ax =20; /* Number of subdomains per assembly
+								   (axially) */
+	I.segments_per_track = 120;  // Average number of segments per track (123)
+	I.assembly_width = 21.42;   // Width of an assembly - 1.26 x 17 cm
+	I.height = 400.0;           // Height of the reactor - 400 cm
+	I.precision = 0.01;			// precision for source convergence
+	I.mype = 0;                 // MPI Rank
+
+	// source regions per assembly (estimate)
+	I.n_2D_source_regions_per_assembly = 5000; 
+
+	#ifdef PAPI
+	I.papi_event_set = 4;
+	#endif
+	
+	#ifdef OPENMP
+	I.nthreads = omp_get_max_threads();
+	#endif
+
+	I.load_tracks = false; 
+
+	return I;
+}
+
+// Changes defaults to small problem size
+void set_small_input( Input * I )
+{
+	I->x_assemblies = 15;      		/* Number of assemblies in the x-axis 
+								   		of the reactor */
+	I->y_assemblies = 15;        	/* Number of assemblies in the y-axis 
+								    	of the reactor */
+	I->cai = 5;                	 	// Number of coarse axial intervals
+	I->fai = 3;                  	/* Number of fine axial intervals per 
+									   coarse axial interval */
+	I->axial_exp = 2;            	// Axial source expansion order
+	I->radial_ray_sep = 0.5;     	// Radial ray separation
+	I->axial_z_sep = 0.2;        	// Axial stacked z-ray separation
+	I->n_azimuthal = 5;         	// Number of azimuthal angles
+	I->n_polar_angles = 5;      	// Number of polar angles
+	I->n_egroups = 104;        		// Number of energy groups
+	I->decompose = false;      	 	/* Turn decomposition on/off (true = on, 
+								   		flase = off) */
+	I->decomp_assemblies_ax = 1; 	/* Number of sub-domains per assembly
+								   		(axially) */
+	I->segments_per_track = 120;  	// Average number of segments per track
+	I->assembly_width = 1.26*17; 	// Width of an assembly - 1.26 x 17 cm
+	I->height = 400.0;           	// Height of the reactor - 400 cm
+	I->precision = 0.01;			// precision for source convergence
+
+	// source regions per assembly (estimate)
+	I->n_2D_source_regions_per_assembly = 3000; 
+}
+
+// Initializes all track data
+Params build_tracks( Input* input )
+{
+	Input I = *input;
+	size_t nbytes = 0;
+	Params params;
+
+	if(I.mype == 0)
+	{
+		center_print("INITIALIZATION", 79);
+		border_print();
+		printf("Initializing 2D tracks...\n");
+	}
+
+	if(I.load_tracks)
+	{
+		params.tracks_2D = load_OpenMOC_tracks(
+				I.track_file,false, input, &nbytes);
+		I = *input;
+	}
+	else
+    	params.tracks_2D = generate_2D_tracks(I, &nbytes); 
+
+	if(I.mype == 0)
+	{
+#ifdef PRINT_MEM_SIZES
+		printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 );
+#endif
+		printf("Initializing 3D tracks...\n");
+	}
+
+	params.tracks = generate_tracks(I, params.tracks_2D, &nbytes);
+	params.polar_angles = generate_polar_angles( I ); 
+
+	if(I.mype == 0)
+	{
+#ifdef PRINT_MEM_SIZES
+		printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 );
+#endif
+		printf("Initializing flat source regions...\n");
+	}
+
+	params.sources = initialize_sources(I, &nbytes); 
+
+	if(I.mype == 0)
+	{
+#ifdef PRINT_MEM_SIZES
+		printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 );
+#endif
+		border_print();
+	}
+
+	// initialize to zero leakage
+	float * leakage = calloc( 1, sizeof(float) );
+	params.leakage = leakage;
+
+	// build exponential table for interpolation
+	params.expTable = buildExponentialTable( I.precision, 10.0 ); 
+
+	return params;
+}
+
+// Initializes 3D Cartesian Communication Grid + Shift Ranks
+CommGrid init_mpi_grid( Input I )
+{
+	CommGrid grid;
+
+	#ifdef MPI
+	MPI_Comm cart_comm_3d;
+	int ndims = 3;
+	int dims[3] = {2,2,1};
+	int period[3] = {0,0,0};
+   	int reorder = 1;
+
+	MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, period, reorder, 
+			&cart_comm_3d);
+
+	// X Shifts
+	int x_pos_src;
+	int x_pos_dest;
+	int x_neg_src;
+	int x_neg_dest;
+	MPI_Cart_shift( cart_comm_3d, 0,  1, &x_pos_src, &x_pos_dest );
+	MPI_Cart_shift( cart_comm_3d, 0, -1, &x_neg_src, &x_neg_dest );
+
+	// Y Shifts
+	int y_pos_src;
+	int y_pos_dest;
+	int y_neg_src;
+	int y_neg_dest;
+	MPI_Cart_shift( cart_comm_3d, 1,  1, &y_pos_src, &y_pos_dest );
+	MPI_Cart_shift( cart_comm_3d, 1, -1, &y_neg_src, &y_neg_dest );
+
+	// Z Shifts
+	int z_pos_src;
+	int z_pos_dest;
+	int z_neg_src;
+	int z_neg_dest;
+	MPI_Cart_shift( cart_comm_3d, 2,  1, &z_pos_src, &z_pos_dest );
+	MPI_Cart_shift( cart_comm_3d, 2, -1, &z_neg_src, &z_neg_dest );
+
+	grid.cart_comm_3d = cart_comm_3d;
+	grid.x_pos_src    = x_pos_src;
+	grid.x_pos_dest   = x_pos_dest;
+	grid.x_neg_src    = x_neg_src;
+	grid.x_neg_dest   = x_neg_dest;
+	grid.y_pos_src    = y_pos_src;
+	grid.y_pos_dest   = y_pos_dest;
+	grid.y_neg_src    = y_neg_src;
+	grid.y_neg_dest   = y_neg_dest;
+	grid.z_pos_src    = z_pos_src;
+	grid.z_pos_dest   = z_pos_dest;
+	grid.z_neg_src    = z_neg_src;
+	grid.z_neg_dest   = z_neg_dest;
+
+
+	// Init flux buffer MPI type
+	MPI_Datatype flux_array;
+	MPI_Type_contiguous(I.n_egroups, MPI_FLOAT, &flux_array);
+	MPI_Type_commit(&flux_array);
+
+	grid.Flux_Array = flux_array;
+
+	#endif
+
+	return grid;
+}
+
+#ifdef OPENMP
+// Intialized OpenMP Source Region Locks
+omp_lock_t * init_locks( Input I )
+{
+	// Allocate locks array
+	long n_locks = I.n_source_regions_per_node * I.fai; 
+	omp_lock_t * locks = (omp_lock_t *) malloc( n_locks* sizeof(omp_lock_t));
+	
+	// Initialize locks array
+	for( long i = 0; i < n_locks; i++ )
+		omp_init_lock(&locks[i]);
+
+	return locks;
+}	
+#endif
+
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c
@@ -0,0 +1,276 @@
+#include "SimpleMOC_header.h"
+
+// Prints program logo
+void logo(int version)
+{
+	border_print();
+	printf(
+"              _____ _                 _      __  __  ____   _____ \n"
+"             / ____(_)               | |    |  \\/  |/ __ \\ / ____|\n"
+"            | (___  _ _ __ ___  _ __ | | ___| \\  / | |  | | |     \n"
+"             \\___ \\| | '_ ` _ \\| '_ \\| |/ _ \\ |\\/| | |  | | |     \n"
+"             ____) | | | | | | | |_) | |  __/ |  | | |__| | |____ \n"
+"            |_____/|_|_| |_| |_| .__/|_|\\___|_|  |_|\\____/ \\_____|\n"
+"                               | |                                \n"
+"                               |_|                                \n"
+	);
+	border_print();
+	printf("\n");
+	center_print("Developed at", 79);
+	center_print("The Massachusetts Institute of Technology", 79);
+	center_print("and", 79);
+	center_print("Argonne National Laboratory", 79);
+	printf("\n");
+	char v[100];
+	sprintf(v, "Version: %d", version);
+	center_print(v, 79);
+	printf("\n");
+	border_print();
+}
+
+// Prints Section titles in center of 80 char terminal
+void center_print(const char *s, int width)
+{
+	int length = strlen(s);
+	int i;
+	for (i=0; i<=(width-length)/2; i++) {
+		fputs(" ", stdout);
+	}
+	fputs(s, stdout);
+	fputs("\n", stdout);
+}
+
+// Prints a border
+void border_print(void)
+{
+	printf(
+	"==================================================================="
+	"=============\n");
+}
+
+// Prints comma separated integers - for ease of reading
+void fancy_int( int a )
+{
+    if( a < 1000 )
+        printf("%d\n",a);
+
+    else if( a >= 1000 && a < 1000000 )
+        printf("%d,%03d\n", a / 1000, a % 1000);
+
+    else if( a >= 1000000 && a < 1000000000 )
+        printf("%d,%03d,%03d\n", a / 1000000, (a % 1000000) / 1000, a % 1000 );
+
+    else if( a >= 1000000000 )
+        printf("%d,%03d,%03d,%03d\n",
+               a / 1000000000,
+               (a % 1000000000) / 1000000,
+               (a % 1000000) / 1000,
+               a % 1000 );
+    else
+        printf("%d\n",a);
+}
+
+// Prints out the summary of User input
+void print_input_summary(Input I)
+{
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	#ifdef MPI
+	int nranks;
+	MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+	printf("%-35s%d\n", "MPI Ranks:", nranks); 
+	#endif
+	#ifdef OPENMP
+	printf("%-35s%d\n", "Number of Threads:", I.nthreads);
+	#endif
+	printf("%-35s%d\n", "x-axis assemblies:", I.x_assemblies);
+	printf("%-35s%d\n", "y-axis assemblies:", I.y_assemblies);
+	printf("%-35s%d\n", "coarse axial intervals:", I.cai);
+	printf("%-35s%d\n", "fine axial intervals:", I.fai);
+	printf("%-35s%d\n", "axial source expansion order:", I.axial_exp);
+	printf("%-35s%.2lf\n", "radial ray separation:", I.radial_ray_sep);
+	printf("%-35s%.2lf\n", "axial z-ray separation:", I.axial_z_sep);
+	printf("%-35s%d\n", "azimuthal angles:", I.n_azimuthal);
+	printf("%-35s%d\n", "polar angles:", I.n_polar_angles);
+	printf("%-35s%d\n", "energy groups:", I.n_egroups);
+	printf("%-35s%d\n", "assemblies per axial sub-domain:", 
+			I.decomp_assemblies_ax);
+	printf("%-35s%ld\n", "avg segments per track:", I.segments_per_track);
+	printf("%-35s%.2lf\n", "assembly width:", I.assembly_width);
+	printf("%-35s%.2lf\n", "reactor height:", I.height);
+	printf("%-35s%ld\n", "2D Src regions per assembly:", 
+			I.n_2D_source_regions_per_assembly);
+	printf("%-35s%ld\n", "2D Tracks:", I.ntracks_2D);
+	printf("%-35s", "3D Tracks:"); fancy_int(I.ntracks);
+#ifdef PRINT_MEM_SIZES
+	printf("%-35s%zu (MB)\n", "Estimated Memory Usage:",
+			est_mem_usage(I) / 1024 / 1024);
+#endif
+	#ifdef PAPI
+    if( I.papi_event_set == -1)
+        printf("%-35s%s\n", "PAPI event to count:", I.event_name);
+	#endif
+	border_print();
+}
+
+// reads command line inputs and applies options
+void read_CLI( int argc, char * argv[], Input * input )
+{
+	// defaults to max threads on the system	
+	#ifdef OPENMP
+	input->nthreads = omp_get_num_procs();
+	#else
+	input->nthreads = 1;
+	#endif
+	
+	// Collect Raw Input
+	for( int i = 1; i < argc; i++ )
+	{
+		char * arg = argv[i];
+
+		// nthreads (-t)
+		if( strcmp(arg, "-t") == 0 )
+		{
+			if( ++i < argc )
+				input->nthreads = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// input file (-i)
+		else if( strcmp(arg, "-i") == 0 )
+		{
+			if( ++i < argc )
+				read_input_file( input, argv[i]);
+			else
+				print_CLI_error();
+		}
+		// set input for small problem (-s)
+		else if( strcmp(arg, "-s") == 0)
+			set_small_input( input );
+
+        #ifdef PAPI
+        // Add single PAPI event
+        else if( strcmp(arg, "-p") == 0 )
+        {
+            if( ++i < argc ){
+                input->papi_event_set = -1;
+                strcpy(input->event_name, argv[i]);
+            }
+            else
+                print_CLI_error();
+        }
+        #endif
+		// Load OpenMOC track data file
+		else if( strcmp(arg,"-d") == 0)
+		{
+			if( ++i < argc )
+			{
+				input->track_file = argv[i];
+				input->load_tracks = true;
+			}
+			else
+				print_CLI_error();
+		}
+		else
+			print_CLI_error();
+	}
+
+	// Validate Input
+
+	// Validate nthreads
+	if( input->nthreads < 1 )
+		print_CLI_error();
+}
+
+// print error to screen, inform program options
+void print_CLI_error(void)
+{
+	printf("Usage: ./SimpleMOC <options>\n");
+	printf("Options include:\n");
+	printf("  -t <threads>     Number of OpenMP threads to run\n");
+	printf("  -i <filename>    Input file name\n");
+    printf("  -p <PAPI event>  PAPI event name to count (1 only) \n");
+    printf("  -s               Small problem flag \n");
+	printf("  -d <filename>    OpenMOC tracking file\n");
+	printf("See readme for full description of default run values\n");
+	exit(1);
+}
+
+// read input file describing problem parameters
+void read_input_file( Input * I, char * fname)
+{
+	#if TIMING_INFO | 0
+		printf("Attempting to open FIle: %s\n", fname);
+	#endif
+    	FILE * fp = fopen(fname, "r");
+	#if TIMING_INFO | 0
+		printf("Opened FIle: %s\n", fname);
+	#endif
+	if( fp == NULL )
+		printf("FIle Open FAILED\n");
+    char c[255];
+
+	char * stat;
+	int err;
+
+    err = fscanf(fp, "%d", &I->x_assemblies);
+    stat = fgets(c, 255, fp);
+
+    err = fscanf(fp, "%d", &I->y_assemblies);
+    stat = fgets(c, 255, fp);
+    
+	err = fscanf(fp, "%d", &I->cai);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->fai);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->axial_exp);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->radial_ray_sep);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->axial_z_sep);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->n_azimuthal);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->n_polar_angles);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->n_egroups);
+    stat = fgets(c, 255, fp);
+
+	int decompose;	
+	err = fscanf(fp, "%d", &decompose);
+    stat = fgets(c, 255, fp);
+	if(decompose == 0)
+		I->decompose = false;
+	else
+		I->decompose = true;
+	
+	err = fscanf(fp, "%d", &I->decomp_assemblies_ax);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%ld", &I->segments_per_track);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->assembly_width);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->height);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->precision);
+    stat = fgets(c, 255, fp);
+
+	err = fscanf(fp, "%ld", &I->n_2D_source_regions_per_assembly);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->papi_event_set);
+    stat = fgets(c, 255, fp);
+
+	fclose(fp);
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c
@@ -0,0 +1,147 @@
+#include"SimpleMOC_header.h"
+
+int main( int argc, char * argv[] )
+{
+	int version = 3;
+	int mype = 1;
+
+	#ifdef MPI
+	int nranks;
+	MPI_Status stat;
+	MPI_Init(&argc, &argv);
+	MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+	MPI_Comm_rank(MPI_COMM_WORLD, &mype);
+	#endif
+
+	#ifdef PAPI
+	papi_serial_init();
+	#endif
+
+	srand(time(NULL) * (mype+1));
+
+	Input input = set_default_input();
+	read_CLI( argc, argv, &input );
+	calculate_derived_inputs( &input );
+
+	if( mype == 0 )
+		logo(version);
+
+	#ifdef OPENMP
+	omp_set_num_threads(input.nthreads); 
+	#endif
+
+	Params params = build_tracks( &input );
+	CommGrid grid = init_mpi_grid( input );
+
+	if( mype == 0 )
+		print_input_summary(input);
+
+	float res;
+	float keff = 1.0;
+	int num_iters = 1;
+
+
+	double time_transport = 0;
+	double time_flux_exchange = 0;
+	double time_renormalize_flux = 0;
+	double time_update_sources = 0;
+	double time_compute_keff = 0;
+	double start, stop;
+
+	if(mype==0)
+	{
+		center_print("SIMULATION", 79);
+		border_print();
+	}
+
+	for( int i = 0; i < num_iters; i++)
+	{
+		// Transport Sweep
+		start = get_time();
+		transport_sweep(&params, &input);
+		stop = get_time();
+		time_transport += stop-start;
+
+		// Domain Boundary Flux Exchange (MPI)
+		#ifdef MPI
+		start = get_time();
+		fast_transfer_boundary_fluxes(params, input, grid);
+		stop = get_time();
+		time_flux_exchange += stop-start;
+		#endif
+
+		// Flux Renormalization
+		start = get_time();
+		renormalize_flux(params,input, grid);
+		stop = get_time();
+		time_renormalize_flux += stop-start;
+
+		// Update Source Regions
+		start = get_time();
+		res = update_sources(params, input, keff);
+		stop = get_time();
+		time_update_sources += stop-start;
+
+		// Calculate K-Effective
+		start = get_time();
+		keff = compute_keff(params, input, grid);
+		stop = get_time();
+		time_compute_keff += stop-start;
+		if( mype == 0 )
+			printf("keff = %f\n", keff);
+	}
+
+	double time_total = time_transport + time_flux_exchange 
+		+ time_renormalize_flux + time_update_sources + time_compute_keff;
+
+	if( mype == 0 )
+	{
+		border_print();
+		center_print("RESULTS SUMMARY", 79);
+		border_print();
+
+		printf("Transport Sweep Time:         %6.2lf sec   (%4.1lf%%)\n", 
+				time_transport, 100*time_transport/time_total);
+		printf("Domain Flux Exchange Time:    %6.2lf sec   (%4.1lf%%)\n", 
+				time_flux_exchange, 100*time_flux_exchange/time_total);
+		printf("Flux Renormalization Time:    %6.2lf sec   (%4.1lf%%)\n", 
+				time_renormalize_flux, 100*time_renormalize_flux/time_total);
+		printf("Update Source Time:           %6.2lf sec   (%4.1lf%%)\n", 
+				time_update_sources, 100*time_update_sources/time_total);
+		printf("K-Effective Calc Time:        %6.2lf sec   (%4.1lf%%)\n", 
+				time_compute_keff, 100*time_compute_keff/time_total);
+		printf("Total Time:                   %6.2lf sec\n", time_total);
+	}
+
+	long tracks_per_second = 2 * input.ntracks/time_transport;
+
+	#ifdef MPI
+	MPI_Barrier(grid.cart_comm_3d);
+	long global_tps = 0;
+	MPI_Reduce( &tracks_per_second, // Send Buffer
+			&global_tps,            // Receive Buffer
+			1,                    	// Element Count
+			MPI_LONG,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			0,                    	// Master Rank
+			grid.cart_comm_3d );  	// MPI Communicator
+	MPI_Barrier(grid.cart_comm_3d);
+	tracks_per_second = global_tps;
+	#endif
+
+	if( mype == 0 )
+	{
+		printf("Time per Intersection:          ");
+		printf("%.2lf ns\n", time_per_intersection( input, time_transport ));
+		border_print();
+	}
+
+	free_2D_tracks( params.tracks_2D );
+	free_tracks( params.tracks );
+
+	#ifdef MPI
+	MPI_Finalize();
+	#endif
+
+	return 0;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c
@@ -0,0 +1,493 @@
+#include"SimpleMOC_header.h"
+
+#ifdef PAPI
+
+// initialize papi with one thread first
+void papi_serial_init(void)
+{
+	if ( PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT){
+		fprintf(stderr, "PAPI library init error!\n");
+		exit(1);
+	}
+	if (( PAPI_thread_init((long unsigned int (*)(void))
+					pthread_self )) != PAPI_OK){
+		PAPI_perror("PAPI_thread_init");
+		exit(1);
+	}
+}
+
+void counter_init( int *eventset, int *num_papi_events, Input * I )
+{
+	char error_str[PAPI_MAX_STR_LEN];
+	int stat;
+
+	int * events;
+    
+    // Command line event
+    if( I->papi_event_set == -1){
+        *num_papi_events = 1;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+        PAPI_event_name_to_code( I->event_name, &events[0]);
+    }
+	// FLOPS
+	if( I->papi_event_set == 0 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_SP_OPS;
+		events[1] = PAPI_TOT_CYC;
+	}
+	// Bandwidth
+	if( I->papi_event_set == 1 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_L3_TCM;
+		events[1] = PAPI_TOT_CYC;
+	}
+	// CPU Stall Reason
+	if( I->papi_event_set == 2 )
+	{
+		*num_papi_events = 4;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "RESOURCE_STALLS:ANY";
+		char * event2 = "RESOURCE_STALLS:SB";
+		char * event3 = "RESOURCE_STALLS:RS";
+		char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+		PAPI_event_name_to_code( event3, &EventCode );
+		events[2] = EventCode;	
+		PAPI_event_name_to_code( event4, &EventCode );
+		events[3] = EventCode;	
+	}
+	// CPU Stall Percentage
+	if( I->papi_event_set == 3 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "RESOURCE_STALLS:ANY";
+		char * event2 = "PAPI_TOT_CYC";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+	}
+	// Memory Loads
+	if( I->papi_event_set == 4 )
+	{
+		*num_papi_events = 4;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "MEM_LOAD_UOPS_RETIRED";
+		char * event2 = "MEM_LOAD_UOPS_RETIRED:L1_HIT";
+		char * event3 = "MEM_LOAD_UOPS_RETIRED:L2_HIT";
+		char * event4 = "MEM_LOAD_UOPS_RETIRED:L3_HIT";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+		PAPI_event_name_to_code( event3, &EventCode );
+		events[2] = EventCode;	
+		PAPI_event_name_to_code( event4, &EventCode );
+		events[3] = EventCode;	
+	}
+	// LLC Miss Rate
+	if( I->papi_event_set == 5 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_L3_TCM;
+		events[1] = PAPI_L3_TCA;
+	}
+	// Branch MisPrediction
+	if( I->papi_event_set == 6 )
+	{
+		*num_papi_events = 3;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_BR_MSP;
+		events[1] = PAPI_BR_CN;
+		events[2] = PAPI_BR_PRC;
+	}
+	// TLB Misses
+	if( I->papi_event_set == 7 )
+	{
+		*num_papi_events = 4;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "perf::DTLB-LOADS";
+		char * event2 = "perf::DTLB-LOAD-MISSES";
+		char * event3 = "perf::DTLB-STORES";
+		char * event4 = "perf::DTLB-STORE-MISSES";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+		PAPI_event_name_to_code( event3, &EventCode );
+		events[2] = EventCode;	
+		PAPI_event_name_to_code( event4, &EventCode );
+		events[3] = EventCode;	
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	//                        PAPI EVENT SELECTION
+	/////////////////////////////////////////////////////////////////////////
+	// User can comment/uncomment blocks as they see fit within this seciton
+
+	// Some Standard Events
+	//int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS};
+
+	// Bandwidth Used
+	// ((PAPI_Lx_TCM * Lx_linesize) / PAPI_TOT_CYC) * Clock(MHz)
+	//int events[] = {PAPI_L3_TCM, PAPI_TOT_CYC};
+
+	// L3 Total Cache Miss Ratio
+	// PAPI_L3_TCM / PAPI_L3_TCA
+	// (On Xeon dual octo -  65%, not dependent on # of threads)
+	//int events[] = {PAPI_L3_TCM, PAPI_L3_TCA};
+
+	// % Cycles with no instruction use
+	// PAPI_STL_ICY / PAPI_TOT_CYC
+	//int events[] = { PAPI_STL_ICY, PAPI_TOT_CYC };
+
+	// % Branch instructions Mispredicted
+	// PAPI_BR_MSP / PAPI_BR_CN
+	//int events[] = { PAPI_BR_MSP, PAPI_BR_CN, PAPI_BR_PRC };
+
+	// TLB Misses
+	//int events[] = { PAPI_TLB_DM };
+
+	// MFlops
+	// (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz)
+	//int events[] = { PAPI_FP_INS, PAPI_TOT_CYC };
+
+	// MFlops (Alternate?)
+	// (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz)
+	//int events[] = { PAPI_DP_OPS, PAPI_TOT_CYC };
+
+
+	// TLB misses (Using native counters)
+	/*
+	   int events[2];
+	   int EventCode;
+	   char * event1 = "perf::DTLB-LOADS";
+	   char * event2 = "perf::DTLB-LOAD-MISSES";
+	   PAPI_event_name_to_code( event1, &EventCode );
+	   events[0] = EventCode;	
+	   PAPI_event_name_to_code( event2, &EventCode );
+	   events[1] = EventCode;	
+	   */
+
+	/*	
+	// Stalled Cycles, front v back (Using native counters)
+	int events[3];
+	int EventCode;
+	char * event1 = "perf::STALLED-CYCLES-FRONTEND";
+	char * event2 = "perf::STALLED-CYCLES-BACKEND";
+	char * event3 = "perf::PERF_COUNT_HW_CPU_CYCLES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	PAPI_event_name_to_code( event3, &EventCode );
+	events[2] = EventCode;	
+	*/	
+	/*
+	// LLC Cache Misses (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "ix86arch::LLC_REFERENCES";
+	char * event2 = "ix86arch::LLC_MISSES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/
+
+	/*
+	// Node Prefetch Misses (Using native counters)
+	int events[1];
+	int EventCode;
+	//char * event1 = "perf::NODE-PREFETCHES";
+	//char * event2 = "perf::NODE-PREFETCH-MISSES";
+	char * event1 = "perf::NODE-PREFETCHES";
+	char * event2 = "perf::NODE-LOAD-MISSES:COUNT";
+	//PAPI_event_name_to_code( event1, &EventCode );
+	//events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[0] = EventCode;	
+	*/
+
+	/*
+	// CPU Stalls Due to lack of Load Buffers (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "RESOURCE_STALLS:LB";
+	char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/	
+	/*
+	// CPU Stalls Due to ANY Resource (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "PAPI_TOT_CYC";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/
+
+	/*
+	// CPU Stalls at Reservation Station (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "RESOURCE_STALLS:RS";
+	char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/
+
+	/*
+	// CPU Stall Reason Breakdown (Using native counters)
+	int events[4];
+	int EventCode;
+
+	// Set 1
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS:LB";
+	char * event3 = "RESOURCE_STALLS:RS";
+	char * event4 = "RESOURCE_STALLS:SB";
+	// Set 1
+
+	// Set 2
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS:ROB";
+	char * event3 = "RESOURCE_STALLS:MEM_RS";
+	char * event4 = "RESOURCE_STALLS2:ALL_FL_EMPTY";
+	// Set 2
+	// Set 3
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS2:ALL_PRF_CONTROL";
+	char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate
+	char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
+	// Set 3
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS:SB";
+	char * event3 = "RESOURCE_STALLS:RS"; // duplicate
+	char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
+
+
+	// Events that don't need to be counted
+	// Don't bother measuring these
+	//char * event1 = "RESOURCE_STALLS:FCSW"; // Always 0, don't measure
+	//char * event1 = "RESOURCE_STALLS:MXCSR"; // Always 0, don't measure
+	//char * event3 = "RESOURCE_STALLS2:BOB_FULL"; // Always trivial
+	//char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate
+
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	PAPI_event_name_to_code( event3, &EventCode );
+	events[2] = EventCode;	
+	PAPI_event_name_to_code( event4, &EventCode );
+	events[3] = EventCode;	
+	*/
+
+	/////////////////////////////////////////////////////////////////////////
+	//                        PAPI EVENT LOADING
+	/////////////////////////////////////////////////////////////////////////
+	// Users should not need to alter anything within this section
+
+	int thread = omp_get_thread_num();
+
+	if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK)
+	{
+		PAPI_perror("PAPI_create_eventset");
+		exit(1);
+	}
+
+	for( int i = 0; i < *num_papi_events; i++ )
+	{
+		if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK)
+		{
+			PAPI_perror("PAPI_add_event");
+			exit(1);
+		}
+	}
+
+	if ((stat=PAPI_start(*eventset)) != PAPI_OK)
+	{
+		PAPI_perror("PAPI_start");
+		exit(1);
+	}
+}
+
+
+
+
+/*
+   void counter_init( int *eventset, int *num_papi_events )
+   {
+   char error_str[PAPI_MAX_STR_LEN];
+//  int events[] = {PAPI_TOT_INS,PAPI_BR_INS,PAPI_SR_INS};
+int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS};
+int events[] = {ix86arch::LLC_REFERENCES, 
+int stat;
+
+int thread = omp_get_thread_num();
+if( thread == 0 )
+printf("Initializing PAPI counters...\n");
+
+ *num_papi_events = sizeof(events) / sizeof(int);
+
+ if ((stat = PAPI_thread_init((long unsigned int (*)(void)) omp_get_thread_num)) != PAPI_OK){
+ PAPI_perror("PAPI_thread_init");
+ exit(1);
+ }
+
+ if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK){
+ PAPI_perror("PAPI_create_eventset");
+ exit(1);
+ }
+
+ for( int i = 0; i < *num_papi_events; i++ ){
+ if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK){
+ PAPI_perror("PAPI_add_event");
+ exit(1);
+ }
+ }
+
+ if ((stat=PAPI_start(*eventset)) != PAPI_OK){
+ PAPI_perror("PAPI_start");
+ exit(1);
+ }
+ }
+ */
+
+// Stops the papi counters and prints results
+void counter_stop( int * eventset, int num_papi_events, Input * I )
+{
+	int * events = malloc(num_papi_events * sizeof(int));
+	int n = num_papi_events;
+	PAPI_list_events( *eventset, events, &n );
+	PAPI_event_info_t info;
+
+	long_long * values = malloc( num_papi_events * sizeof(long_long));
+	PAPI_stop(*eventset, values);
+	int thread = omp_get_thread_num();
+	int nthreads = omp_get_num_threads();
+
+	static long LLC_cache_miss = 0;
+	static long total_cycles = 0;
+	static long FLOPS = 0;
+	static long stall_any = 0;
+	static long stall_SB = 0;
+	static long stall_RS = 0;
+	static long stall_OO = 0;
+	static long tlb_load = 0;
+	static long tlb_load_m = 0;
+	static long tlb_store = 0;
+	static long tlb_store_m = 0;
+
+    #pragma omp master
+    {
+        I->vals_accum = malloc( num_papi_events * sizeof(long long));
+        for(int i=0; i < num_papi_events ; i ++)
+            I->vals_accum[i] = 0;
+    }
+    #pragma omp barrier
+
+	#pragma omp critical (papi)
+	{
+		printf("Thread %d\n", thread);
+		for( int i = 0; i < num_papi_events; i++ )
+		{
+            I->vals_accum[i] += values[i];
+			PAPI_get_event_info(events[i], &info);
+			printf("%-15lld\t%s\t%s\n", values[i],info.symbol,info.long_descr);
+			if( strcmp(info.symbol, "PAPI_L3_TCM") == 0 )
+				LLC_cache_miss += values[i];
+			if( strcmp(info.symbol, "PAPI_TOT_CYC") == 0 )
+				total_cycles += values[i];
+			if( strcmp(info.symbol, "PAPI_SP_OPS") == 0 )
+				FLOPS += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS:ANY") == 0 )
+				stall_any += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS:SB") == 0 )
+				stall_SB += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS:RS") == 0 )
+				stall_RS += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS2:OOO_RSRC") == 0 )
+				stall_OO += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-LOADS") == 0 )
+				tlb_load += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-LOAD-MISSES") == 0 )
+				tlb_load_m += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-STORES") == 0 )
+				tlb_store += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-STORE-MISSES") == 0 )
+				tlb_store_m += values[i];
+		}
+		free(values);	
+	}
+	{
+		#pragma omp barrier
+	}
+	#pragma omp master
+	{
+        if( omp_get_num_threads() > 1){
+            printf("Thread Totals:\n");
+            for( int i = 0; i < num_papi_events; i++ )
+            {
+                PAPI_get_event_info(events[i], &info);
+                printf("%-15lld\t%s\t%s\n", I->vals_accum[i],info.symbol,info.long_descr);
+            }
+        }
+        free( I->vals_accum );
+
+		border_print();
+		center_print("PERFORMANCE SUMMARY", 79);
+		border_print();
+		long cycles = (long) (total_cycles / (double) nthreads);
+		double bw = LLC_cache_miss*64./cycles*2.8e9/1024./1024./1024.;
+		if( I->papi_event_set == 0 )
+			printf("GFLOPs: %.3lf\n", FLOPS / (double) cycles * 2.8  );
+		if( I->papi_event_set == 1 )
+			printf("Bandwidth: %.3lf (GB/s)\n", bw);
+		if( I->papi_event_set == 2 )
+		{
+			printf("%-30s %.2lf%%\n", "Store Buffer Full:",
+					stall_SB / (double) stall_any * 100.);
+			printf("%-30s %.2lf%%\n", "Reservation Station Full:",
+					stall_RS / (double) stall_any * 100.);
+			printf("%-30s %.2lf%%\n", "OO Pipeline Full:",
+					stall_OO / (double) stall_any * 100.);
+		}
+		if( I->papi_event_set == 3 )
+			printf("CPU Stalled Cycles: %.2lf%%\n",
+					stall_any / (double) total_cycles * 100.);	
+		if( I->papi_event_set == 7 )
+		{
+			printf("%-30s %.2lf%%\n", "Data TLB Load Miss Rate: ",
+					tlb_load_m / (double) tlb_load * 100 );
+			printf("%-30s %.2lf%%\n", "Data TLB Store Miss Rate: ",
+					tlb_store_m / (double) tlb_store * 100 );
+		}
+
+		border_print();
+	}
+    free(events);
+}
+
+#endif
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c
@@ -0,0 +1,1469 @@
+#include"SimpleMOC_header.h"
+
+/* Efficient version of attenuate fluxes which determines the change in angular
+ * flux along a particular track across a fine axial region and tallies the 
+ * contribution to the scalar flux in the fine axial region. This function
+ * assumes a quadratic source, which is calculated on the fly using neighboring
+ * source values. 
+ * 
+ * This version decomposes the work into many for loops for efficient SIMD 
+ * instructions and to reduce register pressure. For a more descriptive 
+ * (but less effiient) version of the code in terms of the underlying physics, 
+ * see alt_attenuate_fluxes which solves the problem in a more naive, 
+ * straightforward manner. */
+void attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I_in,
+		Params * params_in, float ds, float mu, float az_weight, 
+		AttenuateVars * A ) 
+{
+	Input I = *I_in;
+	Params params = *params_in;
+
+	// unload attenuate vars
+	float * restrict q0 = A->q0;
+	float *  restrict q1 = A->q1;
+	float *  restrict q2 = A->q2;
+	float *  restrict sigT = A->sigT;
+	float *  restrict tau = A->tau;
+	float *  restrict sigT2 = A->sigT2;
+	float *  restrict expVal = A->expVal;
+	float *  restrict reuse = A->reuse;
+	float *  restrict flux_integral = A->flux_integral;
+	float *  restrict tally = A->tally;
+	float *  restrict t1 = A->t1;
+	float *  restrict t2 = A->t2;
+	float *  restrict t3 = A->t3;
+	float *  restrict t4 = A->t4;
+
+	// compute fine axial interval spacing
+	float dz = I.height / (I.fai * I.decomp_assemblies_ax * I.cai);
+
+	// compute z height in cell
+	float zin = track->z_height - dz * 
+		( (int)( track->z_height / dz ) + 0.5f );
+
+	// compute fine axial region ID
+	int fine_id = (int) ( track->z_height / dz ) % I.fai;
+
+	// compute weight (azimuthal * polar)
+	// NOTE: real app would also have volume weight component
+	float weight = track->p_weight * az_weight;
+	float mu2 = mu * mu;
+
+	// load fine source region flux vector
+	float * FSR_flux = QSR -> fine_flux[fine_id];
+
+	if( fine_id == 0 )
+	{
+		// adjust z height to account for edge
+		zin -= dz;
+
+		// cycle over energy groups
+		#ifdef INTEL
+		#pragma simd
+		#elif defined IBM
+		#pragma simd_level(10)
+		#endif
+		for( int g = 0; g < I.n_egroups; g++)
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id][g];
+			float y2 = QSR->fine_source[fine_id+1][g];
+			float y3 = QSR->fine_source[fine_id+2][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2.f*dz);
+			float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz);
+			
+			// calculate q0, q1, q2
+			q0[g] = c0 + c1*zin + c2*zin*zin;
+			q1[g] = c1 + 2.f*c2*zin;
+			q2[g] = c2;
+		}
+	}
+	else if ( fine_id == I.fai - 1 )
+	{
+		// adjust z height to account for edge
+		zin += dz;
+		
+		// cycle over energy groups
+		#ifdef INTEL
+		#pragma simd
+		#elif defined IBM
+		#pragma simd_level(10)
+		#endif
+		for( int g = 0; g < I.n_egroups; g++)
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-2][g];
+			float y2 = QSR->fine_source[fine_id-1][g];
+			float y3 = QSR->fine_source[fine_id][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2.f*dz);
+			float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz);
+
+			// calculate q0, q1, q2
+			q0[g] = c0 + c1*zin + c2*zin*zin;
+			q1[g] = c1 + 2.f*c2*zin;
+			q2[g] = c2;
+		}
+	}
+	else
+	{
+		// cycle over energy groups
+		#ifdef INTEL
+		#pragma simd
+		#elif defined IBM
+		#pragma simd_level(10)
+		#endif
+		for( int g = 0; g < I.n_egroups; g++)
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-1][g];
+			float y2 = QSR->fine_source[fine_id][g];
+			float y3 = QSR->fine_source[fine_id+1][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2.f*dz);
+			float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz);
+
+			// calculate q0, q1, q2
+			q0[g] = c0 + c1*zin + c2*zin*zin;
+			q1[g] = c1 + 2.f*c2*zin;
+			q2[g] = c2;
+		}
+	}
+
+	// cycle over energy groups
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		// load total cross section
+		sigT[g] = QSR->sigT[g];
+
+		// calculate common values for efficiency
+		tau[g] = sigT[g] * ds;
+		sigT2[g] = sigT[g] * sigT[g];
+	}
+
+	// cycle over energy groups
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+		expVal[g] = interpolateTable( params.expTable, tau[g] );  
+
+	// Flux Integral
+
+	// Re-used Term
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		reuse[g] = tau[g] * (tau[g] - 2.f) + 2.f * expVal[g] 
+			/ (sigT[g] * sigT2[g]); 
+	}
+
+
+	float * psi;
+	if(forward)
+		psi = track->f_psi;
+	else
+		psi = track->b_psi;
+
+	//#pragma vector nontemporal
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		// add contribution to new source flux
+		flux_integral[g] = (q0[g] * tau[g] + (sigT[g] * psi[g] - q0[g])
+			* expVal[g]) / sigT2[g] + q1[g] * mu * reuse[g] + q2[g] * mu2 
+			* (tau[g] * (tau[g] * (tau[g] - 3.f) + 6.f) - 6.f * expVal[g]) 
+			/ (3.f * sigT2[g] * sigT2[g]);
+	}
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		// Prepare tally
+		tally[g] = weight * flux_integral[g];
+	}
+
+	#ifdef OPENMP
+	omp_set_lock(QSR->locks + fine_id);
+	#endif
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		FSR_flux[g] += tally[g];
+	}
+
+	#ifdef OPENMP
+	omp_unset_lock(QSR->locks + fine_id);
+	#endif
+
+	// Term 1
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t1[g] = q0[g] * expVal[g] / sigT[g];  
+	}
+	// Term 2
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t2[g] = q1[g] * mu * (tau[g] - expVal[g]) / sigT2[g]; 
+	}
+	// Term 3
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t3[g] =	q2[g] * mu2 * reuse[g];
+	}
+	// Term 4
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t4[g] = psi[g] * (1.f - expVal[g]);
+	}
+	// Total psi
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		psi[g] = t1[g] + t2[g] + t3[g] + t4[g];
+	}
+}	
+
+// single direction transport sweep
+void transport_sweep( Params * params, Input * I )
+{
+	if(I->mype==0) printf("Starting transport sweep ...\n");
+
+	// calculate the height of a node's domain and of each FSR
+	double node_delta_z = I->height / I->decomp_assemblies_ax;
+	double fine_delta_z = node_delta_z / (I->cai * I->fai);
+
+	/* loop over tracks (implicitly azimuthal angles, tracks in azimuthal 
+	 * angles, polar angles, and z stacked rays) */
+
+	//print_Input_struct( I );
+	long segments_processed = 0;
+
+	#pragma omp parallel default(none) \
+	shared( I, params, node_delta_z, fine_delta_z ) \
+	reduction(+ : segments_processed )
+	{
+		#ifdef OPENMP
+		int thread = omp_get_thread_num();
+		int nthreads = omp_get_num_threads();
+		unsigned int seed = time(NULL) * (thread+1);
+		#endif
+		//print_Input_struct( I );
+
+		#ifdef PAPI
+		int eventset = PAPI_NULL;
+		int num_papi_events;
+		#pragma omp critical
+		{
+			counter_init(&eventset, &num_papi_events, I);
+		}
+		#endif
+
+		AttenuateVars A;
+		float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float));
+		A.q0 = ptr;
+		ptr += I->n_egroups;
+		A.q1 = ptr;
+		ptr += I->n_egroups;
+		A.q2 = ptr;
+		ptr += I->n_egroups;
+		A.sigT = ptr;
+		ptr += I->n_egroups;
+		A.tau = ptr;
+		ptr += I->n_egroups;
+		A.sigT2 = ptr;
+		ptr += I->n_egroups;
+		A.expVal = ptr;
+		ptr += I->n_egroups;
+		A.reuse = ptr;
+		ptr += I->n_egroups;
+		A.flux_integral = ptr;
+		ptr += I->n_egroups;
+		A.tally = ptr;
+		ptr += I->n_egroups;
+		A.t1 = ptr;
+		ptr += I->n_egroups;
+		A.t2 = ptr;
+		ptr += I->n_egroups;
+		A.t3 = ptr;
+		ptr += I->n_egroups;
+		A.t4 = ptr;
+
+		#pragma omp for schedule( dynamic ) 
+		for (long i = 0; i < I->ntracks_2D; i++)
+		{
+			#if TIMING_INFO | 0
+				// print progress
+				#ifdef OPENMP
+				if(I->mype==0 && thread == 0)
+				{
+					printf("\rAttenuating Tracks... (%.0lf%% completed)",
+							(i / ( (double)I->ntracks_2D / (double) nthreads ))
+							/ (double) nthreads * 100.0);
+				}
+				#else
+				if( i % 50 == 0)
+					if(I->mype==0)
+						printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", 
+								I->ntracks_2D );
+				#endif
+			#endif
+
+
+			// treat positive-z traveling rays first
+			bool pos_z_dir = true;
+			for( int j = 0; j < I->n_polar_angles; j++)
+			{
+				if( j == I->n_polar_angles / 2 )
+					pos_z_dir = false;
+				float p_angle = params->polar_angles[j];
+				float mu = cos(p_angle);
+
+				// start with all z stacked rays
+				int begin_stacked = 0;
+				int end_stacked = I->z_stacked;
+
+				for( int n = 0; n < params->tracks_2D[i].n_segments; n++)
+				{
+					// calculate distance traveled in cell if segment completed
+					float s_full = params->tracks_2D[i].segments[n].length 
+						/ sin(p_angle);
+
+					// allocate varaible for distance traveled in an FSR
+					float ds = 0;
+
+					// loop over remaining z-stacked rays
+					for( int k = begin_stacked; k < end_stacked; k++)
+					{
+						// initialize s to full length
+						float s = s_full;
+
+						// select current track
+						Track * track = &params->tracks[i][j][k];
+
+						// set flag for completeion of segment
+						bool seg_complete = false;
+
+						// calculate interval
+						int curr_interval;
+						if( pos_z_dir)
+							curr_interval = get_pos_interval(track->z_height, 
+									fine_delta_z);
+						else
+							curr_interval = get_neg_interval(track->z_height, 
+									fine_delta_z);
+
+						while( !seg_complete )
+						{
+							// flag to reset z position
+							bool reset = false;
+
+
+							/* calculate new height based on s 
+							 * (distance traveled in FSR) */
+							float z = track->z_height + s * cos(p_angle);
+
+							// check if still in same FSR (fine axial interval)
+							int new_interval;
+							if( pos_z_dir )
+								new_interval = get_pos_interval(z, 
+										fine_delta_z);
+							else
+								new_interval = get_neg_interval(z,
+										fine_delta_z);
+
+							if( new_interval == curr_interval )
+							{
+								seg_complete = true;
+								ds = s;
+							}
+
+							// otherwise, we need to recalculate distances
+							else
+							{
+								// correct z
+								if( pos_z_dir )
+								{
+									curr_interval++;
+									z = fine_delta_z * (float) curr_interval;
+								}
+								else{
+									curr_interval--;
+									z = fine_delta_z * (float) curr_interval;
+								}
+
+								// calculate distance travelled in FSR (ds)
+								ds = (z - track->z_height) / cos(p_angle);
+
+								// update track length remaining
+								s -= ds;
+
+								/* check remaining track length to protect
+								 * against potential roundoff errors */
+								if( s <= 0 )
+									seg_complete = true;
+
+								// check if out of bounds or track complete
+								if( z <= 0 || z >= node_delta_z )
+								{
+									// mark segment as completed
+									seg_complete = true;
+
+									// remember to no longer treat this track
+									if ( pos_z_dir )
+										end_stacked--;
+									else
+										begin_stacked++;
+
+									// reset z height
+									reset = true;
+								}
+							}
+
+							// pick a random FSR (cache miss expected)
+							#ifdef OPENMP
+							long QSR_id = rand_r(&seed) % 
+								I->n_source_regions_per_node;
+							#else
+							long QSR_id = rand() % 
+								I->n_source_regions_per_node;
+							#endif
+
+							/* update sources and fluxes from attenuation 
+							 * over FSR */
+							if( I->axial_exp == 2 )
+							{
+								attenuate_fluxes( track, true, 
+										&params->sources[QSR_id], 
+										I, params, ds, mu, 
+										params->tracks_2D[i].az_weight, &A );
+
+								segments_processed++;
+							}
+
+							else if( I->axial_exp == 0 )
+							{
+								attenuate_FSR_fluxes( track, true,
+										&params->sources[QSR_id],
+										I, params, ds, mu,
+										params->tracks_2D[i].az_weight, &A );
+
+								segments_processed++;
+							}
+							else
+							{
+								printf("Error: invalid axial expansion order");
+								printf("\n Please input 0 or 2\n");
+								exit(1);
+							}
+
+							// update with new z height or reset if finished
+							if( n == params->tracks_2D[i].n_segments - 1  
+									|| reset)
+							{
+								if( pos_z_dir)
+									track->z_height = I->axial_z_sep * k;
+								else
+									track->z_height = I->axial_z_sep * (k+1);
+							}
+							else
+								track->z_height = z;
+
+						}
+					}
+				}
+			}
+		}
+		#ifdef OPENMP
+		if(thread == 0 && I->mype==0) printf("\n");
+		#endif
+
+		#ifdef PAPI
+		if( thread == 0 )
+		{
+			printf("\n");
+			border_print();
+			center_print("PAPI COUNTER RESULTS", 79);
+			border_print();
+			printf("Count          \tSmybol      \tDescription\n");
+		}
+		{
+			#pragma omp barrier
+		}
+		counter_stop(&eventset, num_papi_events, I);
+		#endif
+	}
+	I->segments_processed = segments_processed;
+
+	return;
+}
+
+
+// run one full transport sweep, return k
+void two_way_transport_sweep( Params * params, Input * I )
+{
+	if(I->mype==0) printf("Starting transport sweep ...\n");
+
+	// calculate the height of a node's domain and of each FSR
+	double node_delta_z = I->height / I->decomp_assemblies_ax;
+	int num_intervals = (I->cai * I->fai);
+	double fine_delta_z = node_delta_z / num_intervals;
+
+	/* loop over tracks (implicitly azimuthal angles, tracks in azimuthal 
+	 * angles, polar angles, and z stacked rays) */
+		long segments_processed = 0;
+
+	#pragma omp parallel default(none) \
+	shared( I, params, node_delta_z, fine_delta_z, num_intervals ) \
+	reduction(+ : segments_processed )
+	{
+		#ifdef OPENMP
+		int thread = omp_get_thread_num();
+		int nthreads = omp_get_num_threads();
+		unsigned int seed = time(NULL) * (thread+1);
+		#endif
+		//print_Input_struct( I );
+
+		#ifdef PAPI
+		int eventset = PAPI_NULL;
+		int num_papi_events;
+		#pragma omp critical
+		{
+			counter_init(&eventset, &num_papi_events, I);
+		}
+		#endif
+
+
+		AttenuateVars A;
+		float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float));
+		A.q0 = ptr;
+		ptr += I->n_egroups;
+		A.q1 = ptr;
+		ptr += I->n_egroups;
+		A.q2 = ptr;
+		ptr += I->n_egroups;
+		A.sigT = ptr;
+		ptr += I->n_egroups;
+		A.tau = ptr;
+		ptr += I->n_egroups;
+		A.sigT2 = ptr;
+		ptr += I->n_egroups;
+		A.expVal = ptr;
+		ptr += I->n_egroups;
+		A.reuse = ptr;
+		ptr += I->n_egroups;
+		A.flux_integral = ptr;
+		ptr += I->n_egroups;
+		A.tally = ptr;
+		ptr += I->n_egroups;
+		A.t1 = ptr;
+		ptr += I->n_egroups;
+		A.t2 = ptr;
+		ptr += I->n_egroups;
+		A.t3 = ptr;
+		ptr += I->n_egroups;
+		A.t4 = ptr;
+
+		#pragma omp for schedule( dynamic ) 
+		for (long i = 0; i < I->ntracks_2D; i++)
+		{
+			// print progress
+			#ifdef OPENMP
+			if(I->mype==0 && thread == 0)
+			{
+				printf("\rAttenuating Tracks... (%.0lf%% completed)",
+						(i / ( (double)I->ntracks_2D / (double) nthreads ))
+						/ (double) nthreads * 100.0);
+			}
+			#else
+			if( i % 50 == 0)
+				if(I->mype==0)
+					printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", 
+							I->ntracks_2D );
+			#endif
+
+			// allocate arrays for segment storage FIXME
+			double ** seg_dist = malloc( I->z_stacked * sizeof(double *) );
+			Source *** seg_src = malloc( I->z_stacked * sizeof(Source**) );
+			int * seg_idx = malloc( I->z_stacked * sizeof(int) );
+			int * seg_size = malloc( I->z_stacked * sizeof(int) );
+
+			// fill matrix with arrays FIXME
+			for( int k = 0; k < I->z_stacked; k++)
+			{
+				seg_size[k] = 2 * I->segments_per_track;
+				seg_dist[k] = malloc( seg_size[k] * sizeof(double) );
+				seg_src[k] = malloc( seg_size[k] * sizeof(Source *) );
+				seg_idx[k] = 0;
+			}
+
+			// treat positive-z traveling rays first
+			bool pos_z_dir = true;
+			for( int j = 0; j < I->n_polar_angles; j++)
+			{
+				if( j == I->n_polar_angles / 2 )
+					pos_z_dir = false;
+				float p_angle = params->polar_angles[j];
+				float mu = cos(p_angle);
+
+				// start with all z stacked rays
+				int begin_stacked = 0;
+				int end_stacked = I->z_stacked;
+
+				// reset semgnet indexes
+				for( int k = 0; k < I->z_stacked; k++)
+					seg_idx[k] = 0;
+
+				for( int n = 0; n < params->tracks_2D[i].n_segments; n++)
+				{
+					// calculate distance traveled in cell if segment completed
+					float s_full = params->tracks_2D[i].segments[n].length 
+						/ sin(p_angle);
+
+					// allocate varaible for distance traveled in an FSR
+					float ds = 0;
+
+					// loop over remaining z-stacked rays
+					int tracks_completed = 0;
+					for( int k = begin_stacked; k < end_stacked; k++)
+					{
+						// select current track
+						Track * track = &params->tracks[i][j][k];
+
+						// determine current axial interval
+						int interval = (int) track->z_height / fine_delta_z;
+
+						// calculate distance to domain boundary
+						float bound_dist;
+						if( pos_z_dir)
+							bound_dist = (node_delta_z - track->z_height) / mu;
+						else
+							bound_dist = -track->z_height / mu;
+
+						// determine track length
+						float s;
+						if(	s_full < bound_dist )
+							s = s_full;
+						else
+						{
+							// note completion of track
+							s = bound_dist;
+							tracks_completed++;
+						}
+
+						// set flag for completeion of segment
+						bool seg_complete = false;
+
+						while( !seg_complete )
+						{
+							// initialize tracking variables
+							long QSR_id = interval + num_intervals * n;
+							float ds;
+							float z;
+
+							// calculate z height of next fine axial interval
+							float fai_z_height;
+							if( pos_z_dir )
+								fai_z_height = (interval + 1) * fine_delta_z ;
+							else
+								fai_z_height = interval * fine_delta_z;
+
+							// calculate z distance to next fine axial interval
+							float z_dist_to_fai = 
+								fai_z_height - track->z_height;
+
+							/* calculate total distance (s) to fine axial 
+							 * interval */
+							float s_dist_to_fai = z_dist_to_fai / mu;
+
+							// determine if a fine axial interval is crossed
+							if( s_dist_to_fai < s )
+							{
+								if( pos_z_dir )
+									interval++;
+								else
+									interval--;
+								ds = s_dist_to_fai;
+								z = track->z_height + z_dist_to_fai;
+							}
+							else
+							{
+								ds = s;
+								z = track->z_height + s * mu;
+							}	
+
+							/* shorten remaining segment length and check if
+							 * completed (accounting for potential roundoff) */
+							s -= ds;
+							if( s <= 0 || interval < 0 
+									|| interval >= num_intervals)
+								seg_complete = true;
+
+							// pick a random FSR (cache miss expected)
+							#ifdef OPENMP
+							QSR_id = rand_r(&seed) % 
+								I->n_source_regions_per_node;
+							#else
+							QSR_id = rand() % I->n_source_regions_per_node;
+							#endif
+
+							/* update sources and fluxes from attenuation 
+							 * over FSR */
+							if( I->axial_exp == 2 )
+							{
+								attenuate_fluxes( track, true,
+										&params->sources[QSR_id], 
+										I, params, ds, mu, 
+										params->tracks_2D[i].az_weight, &A );
+								segments_processed++;
+							}
+
+							else if( I->axial_exp == 0 )
+								attenuate_FSR_fluxes( track, true,
+										&params->sources[QSR_id],
+										I, params, ds, mu,
+										params->tracks_2D[i].az_weight, &A );
+							else
+							{
+								printf("Error: invalid axial expansion order");
+								printf("\n Please input 0 or 2\n");
+								exit(1);
+							}
+
+							// update track height
+							track->z_height = z;
+							
+							// save segment length and source FIXME
+							seg_dist[k][seg_idx[k]] = ds;
+							seg_src[k][seg_idx[k]] = &params->sources[QSR_id];
+							seg_idx[k]++;
+
+							// check if array needs to grow FIXME
+							if( seg_idx[k] >= seg_size[k] )
+							{
+								seg_size[k] *= 2;
+								seg_dist[k] = (double *) realloc( seg_dist[k],
+										seg_size[k] * sizeof(double) );
+								seg_src[k] = (Source **) realloc( seg_src[k],
+										seg_size[k] * sizeof(Source *) );
+							}
+						}
+					}
+					if(pos_z_dir)
+						end_stacked -= tracks_completed;
+					else
+						begin_stacked += tracks_completed;
+				}
+
+				// loop over all z stacked rays again
+				for( int k = 0; k < I->z_stacked; k++ )
+				{
+					for( int n = seg_idx[k]-1; n >= 0; n--)
+					{
+						// load distance
+						float ds = seg_dist[k][n];
+
+						// select current track
+						Track * track = &params->tracks[i][j][k];
+
+						// update sources and fluxes from attenuation over FSR
+						if( I->axial_exp == 2 )
+						{
+							attenuate_fluxes( track, false,
+									seg_src[k][n], 
+									I, params, ds, -mu, 
+									params->tracks_2D[i].az_weight, &A );
+								segments_processed++;
+						}
+
+						else if( I->axial_exp == 0 )
+							attenuate_FSR_fluxes( track, false,
+									seg_src[k][n],
+									I, params, ds, -mu,
+									params->tracks_2D[i].az_weight, &A );
+
+						// update z height
+						track->z_height -= ds * mu;
+					}
+				}
+
+
+				/* Update all tracks with correct starting z location again
+				 * NOTE: this is only here to acocunt for roundoff error */
+				for( int k = 0; k < I->z_stacked; k++)
+				{
+					Track * track = &params->tracks[i][j][k];
+					if( pos_z_dir)
+						track->z_height = I->axial_z_sep * k;
+					else
+						track->z_height = I->axial_z_sep * (k+1);
+				}
+			}
+
+			// free memory
+			for( int k = 0; k < I->z_stacked; k++)
+			{
+				free(seg_dist[k]);
+				free(seg_src[k]);
+			}
+			free(seg_dist);
+			free(seg_src);
+			free(seg_idx);
+			free(seg_size);
+
+		}
+		#ifdef OPENMP
+		if(thread == 0 && I->mype==0) printf("\n");
+		#endif
+
+		#ifdef PAPI
+		if( thread == 0 )
+		{
+			printf("\n");
+			border_print();
+			center_print("PAPI COUNTER RESULTS", 79);
+			border_print();
+			printf("Count          \tSmybol      \tDescription\n");
+		}
+		{
+			#pragma omp barrier
+		}
+		counter_stop(&eventset, num_papi_events, I);
+		#endif
+	}
+	//printf("Number of segments processed: %ld\n", segments_processed);
+	I->segments_processed = segments_processed;
+
+	return;
+}
+
+/* returns integer number for axial interval for tracks traveling in the
+ *  positive direction */
+int get_pos_interval( float z, float dz)
+{
+	int interval = (int) (z/dz);
+	return interval;
+}
+
+/* returns integer number for axial interval for tracks traveling in the 
+ * negative direction */
+int get_neg_interval( float z, float dz)
+{
+	// NOTE: a bit of trickery using floors to obtain ceils 
+	int interval = INT_MAX - (int) ( (double) INT_MAX 
+			- (double) ( z / dz ) );
+	return interval;
+}
+
+int calc_next_fai( float z, float dz, bool pos_dir)
+{
+	int interval = z/dz;
+	float lower_z = dz * (float) interval;
+	if(pos_dir)
+		return interval + 1;
+	else
+		return interval;
+}
+
+/* Determines the change in angular flux along a particular track across a fine
+ * axial region and tallies the contribution to the scalar flux in the fine 
+ * axial region. This function assumes a quadratic source, which is calculated 
+ * on the fly using neighboring source values.
+ *
+ * This legacy function is unused since it is less efficient than the current 
+ * attenuate_fluxes function. However, it provides a more straightforward 
+ * description of the underlying physical problem. */
+void alt_attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I,
+		Params * params, float ds, float mu, float az_weight ) 
+{
+	// compute fine axial interval spacing
+	float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai);
+
+	// compute z height in cell
+	float zin = track->z_height - dz * ( (int)( track->z_height / dz ) + 0.5 );
+
+	// compute fine axial region ID
+	int fine_id = (int) ( track->z_height / dz ) % I->fai;
+
+	// compute weight (azimuthal * polar)
+	// NOTE: real app would also have volume weight component
+	float weight = track->p_weight * az_weight;
+	float mu2 = mu * mu;
+
+	// load fine source region flux vector
+	float * FSR_flux = QSR -> fine_flux[fine_id];
+
+	// cycle over energy groups
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		// load total cross section
+		float sigT = QSR->sigT[g];
+
+		// define source parameters
+		float q0, q1, q2;
+
+		// calculate source components
+		if( fine_id == 0 )
+		{
+			// load neighboring sources
+			float y2 = QSR->fine_source[fine_id][g];
+			float y3 = QSR->fine_source[fine_id+1][g];
+
+			// do linear "fitting"
+			float c0 = y2;
+			float c1 = (y3 - y2) / dz;
+
+			// calculate q0, q1, q2
+			q0 = c0 + c1*zin;
+			q1 = c1;
+			q2 = 0;
+		}
+		else if( fine_id == I->fai - 1 )
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-1][g];
+			float y2 = QSR->fine_source[fine_id][g];
+
+			// do linear "fitting"
+			float c0 = y2;
+			float c1 = (y2 - y1) / dz;
+
+			// calculate q0, q1, q2
+			q0 = c0 + c1*zin;
+			q1 = c1;
+			q2 = 0;
+		}		
+		else
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-1][g];
+			float y2 = QSR->fine_source[fine_id][g];
+			float y3 = QSR->fine_source[fine_id+1][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2*dz);
+			float c2 = (y1 - 2*y2 + y3) / (2*dz*dz);
+
+			// calculate q0, q1, q2
+			q0 = c0 + c1*zin + c2*zin*zin;
+			q1 = c1 + 2*c2*zin;
+			q2 = c2;
+		}
+
+		// calculate common values for efficiency
+		float tau = sigT * ds;
+		float sigT2 = sigT * sigT;
+
+		// compute exponential ( 1 - exp(-x) ) using table lookup
+		float expVal = interpolateTable( params->expTable, tau );  
+
+		// load correct angular flux vector
+		float * psi;
+		if(forward)
+			psi = track->f_psi;
+		else
+			psi = track->b_psi;
+
+		// add contribution to new source flux
+		float flux_integral = (q0 * tau + (sigT * psi[g] - q0) * expVal)
+			/ sigT2
+			+ q1 * mu * (tau * (tau - 2) + 2 * expVal)
+			/ (sigT * sigT2)
+			+ q2 * mu2 * (tau * (tau * (tau - 3) + 6) - 6 * expVal)
+			/ (3 * sigT2 * sigT2);
+
+		#pragma omp atomic
+		FSR_flux[g] += weight * flux_integral;
+
+		// update angular flux
+		psi[g] = psi[g] * (1.0 - expVal) + q0 * expVal / sigT
+			+ q1 * mu * (tau - expVal) / sigT2 + q2 * mu2 *
+			(tau * (tau - 2) + 2 * expVal) / (sigT2 * sigT);
+	}
+}
+
+/* Determines the change in angular flux along a particular track across a fine
+ * axial region and tallies the contribution to the scalar flux in the fine 
+ * axial region. This function assumes a constant  source. */ 
+void attenuate_FSR_fluxes( Track * track, bool forward, Source * FSR, Input * I,
+		Params * params_in, float ds, float mu, float az_weight, 
+		AttenuateVars *A)
+{
+	// upack attenuate vars struct
+	float *  restrict tally = A->tally;
+	float *  restrict expVal = A->expVal;
+	float *  restrict sigT = A->sigT;
+	float *  restrict tau = A->tau;
+
+	Params params = * params_in;
+
+	// compute fine axial interval spacing
+	float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai);
+
+	// compute z height in cell
+	float zin = track->z_height - dz * 
+		( (int)( track->z_height / dz ) + 0.5f );
+
+	// compute fine axial region ID
+	int fine_id = (int) ( track->z_height / dz ) % I->fai;
+
+	// compute weight (azimuthal * polar)
+	// NOTE: real app would also have volume weight component
+	float weight = track->p_weight * az_weight * mu;
+
+	// load fine source region flux vector
+	float * FSR_flux = FSR -> fine_flux[fine_id];
+
+	// cycle over energy groups
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		// load total cross section
+		sigT[g] = FSR->sigT[g];
+		tau[g] = sigT[g] * ds;
+	}
+
+	// compute exponential ( 1 - exp(-x) ) using table lookup
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for(int g = 0; g < I->n_egroups; g++)
+	{
+		expVal[g] = interpolateTable( params.expTable, tau[g] );
+	}
+
+	float * psi;
+	if(forward)
+		psi = track->f_psi;
+	else
+		psi = track->b_psi;
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		// compute angular flux attenuation
+		float q = FSR->fine_source[fine_id][g] / sigT[g];
+		float delta_psi = (psi[g] - q) * expVal[g];
+
+		// add contribution to new source flux
+		tally[g] = weight * delta_psi;
+
+		// update angular flux
+		psi[g] -= delta_psi;
+	}
+
+
+
+	#ifdef OPENMP
+	omp_set_lock(&FSR->locks[fine_id]);
+	#endif
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		FSR_flux[g] += tally[g];
+	}
+
+	#ifdef OPENMP
+	omp_unset_lock(&FSR->locks[fine_id]);
+	#endif
+
+
+}
+
+/* Renormalizes scalar and angular flux for next transport sweep iteration.
+ * Calculation requires multiple pair-wise sums and a reduction accross all 
+ * nodes. */
+void renormalize_flux( Params params, Input I, CommGrid grid )
+{
+	if( I.mype == 0 ) printf("Renormalizing Flux...\n");
+	float node_fission_rate = 0;
+	#ifdef OPENMP
+	#pragma omp parallel default(none) shared(params, I, grid) \
+	reduction(+ : node_fission_rate)
+	{
+	#endif
+		// tally total fission rate (pair-wise sum)
+		float * fission_rates = malloc( I.n_source_regions_per_node 
+				* sizeof(float) );
+
+		float * fine_fission_rates = malloc( I.fai * sizeof(float) );
+		float * g_fission_rates = malloc( I.n_egroups * sizeof(float) );
+
+		// accumulate total fission rate on node domain
+		#pragma omp for schedule(dynamic)
+		for( int i = 0; i < I.n_source_regions_per_node; i++)
+		{
+			Source src = params.sources[i];
+			for( int j = 0; j < I.fai; j++)
+			{
+				for( int g = 0; g < I.n_egroups; g++)
+					g_fission_rates[g] = src.fine_flux[j][g] * src.vol 
+						* src.XS[g][0];
+				fine_fission_rates[j] = pairwise_sum( g_fission_rates, 
+						I.n_egroups );
+			}
+			fission_rates[i] = pairwise_sum( fine_fission_rates, I.fai );
+		}
+		node_fission_rate = pairwise_sum(fission_rates, 
+				I.n_source_regions_per_node);
+		
+		// free allocated memory
+		free(fission_rates);
+		free(fine_fission_rates);
+		free(g_fission_rates);
+	
+	#ifdef OPENMP
+	}
+	#endif
+
+	#ifdef MPI	
+	// accumulate total fission rate by MPI Allreduce
+	float total_fission_rate = 0;
+	MPI_Barrier(grid.cart_comm_3d);
+	MPI_Allreduce( &node_fission_rate, // Send Buffer
+			&total_fission_rate,    // Receive Buffer
+			1,                    	// Element Count
+			MPI_FLOAT,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			grid.cart_comm_3d );  	// MPI Communicator
+	MPI_Barrier(grid.cart_comm_3d);
+	#else
+	float total_fission_rate = node_fission_rate;
+	#endif
+
+
+	// normalize fluxes by fission reaction rate
+	float norm_factor = 1.0 / total_fission_rate;
+
+	#pragma omp parallel for default(none) \
+	shared(I, params) private(norm_factor) schedule(dynamic)
+	for( int i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		Source * src = &params.sources[i];
+		float adjust = norm_factor * 4 * M_PI * I.fai / src->vol;
+		for( int k = 0; k < I.fai; k++)
+			for( int g = 0; g < I.n_egroups; g++)
+				src->fine_flux[k][g] *= adjust;
+	}
+
+	// normalize boundary fluxes by same factor
+	#pragma omp parallel for default(none) \
+	shared(I, params) private(norm_factor) schedule(dynamic)
+	for( long i = 0; i < I.ntracks_2D; i++)
+		for( int j = 0; j < I.n_polar_angles; j++)
+			for( int k = 0; k < I.z_stacked; k++)
+				for( int g = 0; g < I.n_egroups; g++)
+				{
+					params.tracks[i][j][k].f_psi[g] *= norm_factor;
+					params.tracks[i][j][k].b_psi[g] *= norm_factor;
+				}
+
+	if( I.mype == 0 ) printf("Renormalizing Flux Complete.\n");
+	return;
+}
+
+/* Updates sources for next iteration by computing scattering and fission
+ * components. Calculation includes multiple pair-wise sums and reductions
+ * accross all nodes */
+float update_sources( Params params, Input I, float keff )
+{
+	// source residual
+	float residual;
+
+	// calculate inverse multiplication facotr for efficiency
+	float inverse_k = 1.0 / keff;
+
+	// allocate residual arrays
+	float * group_res = (float *) malloc(I.n_egroups * sizeof(float));
+	float * fine_res = (float *) malloc(I.fai * sizeof(float));
+	float * residuals = (float *) malloc(I.n_source_regions_per_node 
+			* sizeof(float));
+
+	// allocate arrays for summation
+	float * fission_rates = malloc(I.n_egroups * sizeof(float));
+	float * scatter_rates = malloc(I.n_egroups * sizeof(float));
+
+	// cycle through all coarse axial intervals to update source
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		Source src = params.sources[i];
+
+		// cycle thorugh all fine axial regions to calculate new source
+		for( int j = 0; j < I.fai; j++)
+		{
+			// calculate total fission source and scattering source
+			float fission_source;
+			float scatter_source;
+
+			// compute total fission source
+			for( int g = 0; g < I.n_egroups; g++ )
+				fission_rates[g] = src.fine_flux[j][g] * src.XS[g][0];
+			fission_source = pairwise_sum( fission_rates, (long) I.n_egroups);
+
+			// normalize fission source by multiplication factor
+			fission_source *= inverse_k;
+
+			// compute scattering and new total source for each group
+			for( int g = 0; g < I.n_egroups; g++ )
+			{
+				for( int g2 = 0; g2 < I.n_egroups; g2++ )
+				{
+					// compute scatter source originating from g2 -> g
+					scatter_rates[g2] = src.scattering_matrix[g][g2] * 
+						src.fine_flux[j][g2];
+				}
+				scatter_source = pairwise_sum(scatter_rates, 
+						(long) I.n_egroups);
+
+				// compuate new total source
+				float chi = src.XS[g][2];
+
+				// calculate new fine source
+				float newSrc = (fission_source * chi + scatter_source) 
+					/ (4.0 * M_PI);
+
+				// calculate residual
+				float oldSrc = src.fine_source[j][g];
+				group_res[g] = (newSrc - oldSrc) * (newSrc - oldSrc)
+					/ (oldSrc * oldSrc);
+
+				/* calculate new source in fine axial interval assuming 
+				 * isotropic source components */
+				src.fine_source[j][g] = newSrc;
+			}
+			fine_res[j] = pairwise_sum(group_res, (long) I.n_egroups);
+		}
+		residuals[i] = pairwise_sum(fine_res, (long) I.fai);
+	}
+
+	// calculate source residual
+	residual = pairwise_sum(residuals, I.n_source_regions_per_node);
+
+	// free memory
+	free(fission_rates);
+	free(scatter_rates);
+	free(group_res);
+	free(fine_res);
+	free(residuals);
+
+
+	// NOTE: See code around line 600 of CPUSolver.cpp in ClosedMOC/ OpenMOC
+
+	return residual;
+}
+
+/* Computes globall k-effective using multiple pair-wise summations and finally
+ * a reduction accross all nodes */
+float compute_keff(Params params, Input I, CommGrid grid)
+{
+	// allocate temporary memory
+	float * sigma = malloc( I.n_egroups * sizeof(float) );
+	float * group_rates = malloc( I.n_egroups * sizeof(float) );
+	float * fine_rates = malloc( I.fai * sizeof(float) );
+	float * QSR_rates = malloc( I.n_source_regions_per_node * sizeof(float) );
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// compute total absorption rate, looping over source regions
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		// load absorption XS data
+		Source src = params.sources[i];
+		for( int g = 0; g < I.n_egroups; g++)	
+			sigma[g] = src.XS[g][1];
+
+		for( int j = 0; j < I.fai; j++ )
+		{
+			// calculate absorption rates
+			float * fine_flux = src.fine_flux[j];
+			for( int g = 0; g < I.n_egroups; g++)
+				group_rates[g] = sigma[g] * fine_flux[g];
+
+			// sum absorption over all energy groups
+			fine_rates[j] = pairwise_sum( group_rates, (long) I.n_egroups );
+		}
+		// sum absorption over all fine axial intervals
+		QSR_rates[i] = pairwise_sum( fine_rates, (long) I.fai );
+	}
+	// sum absorption over all source regions in a node
+	float node_abs = pairwise_sum( QSR_rates, I.n_source_regions_per_node);
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// compute total absorption rate, looping over source regions
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		// load nuSigmaF XS data
+		Source src = params.sources[i];
+		for( int g = 0; g < I.n_egroups; g++)	
+			sigma[g] = src.XS[g][0];
+
+		for( int j = 0; j < I.fai; j++ )
+		{
+			// calculate absorption rates
+			float * fine_flux = src.fine_flux[j];
+			for( int g = 0; g < I.n_egroups; g++)
+				group_rates[g] = sigma[g] * fine_flux[g];
+
+			// sum fission over all energy groups
+			fine_rates[j] = pairwise_sum( group_rates, (long) I.n_egroups );
+		}
+		// sum fission over all fine axial intervals
+		QSR_rates[i] = pairwise_sum( fine_rates, (long) I.fai );
+	}
+	// sum fission over all source regions in a node
+	float node_fission = pairwise_sum( QSR_rates, I.n_source_regions_per_node);
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// MPi Reduction
+	float tot_abs = 0;
+	float tot_fission = 0;
+	float leakage = 0;
+
+	#ifdef MPI 
+
+	// Total Absorption Reduction
+	MPI_Reduce( &node_abs,    		// Send Buffer
+			&tot_abs,      			// Receive Buffer
+			1,                  	// Element Count
+			MPI_FLOAT,          	// Element Type
+			MPI_SUM,            	// Reduciton Operation Type
+			0,                  	// Master Rank
+			grid.cart_comm_3d );	// MPI Communicator
+
+	// Total Fission Reduction
+	MPI_Reduce( &node_fission,     	// Send Buffer
+			&tot_fission,  			// Receive Buffer
+			1,                    	// Element Count
+			MPI_FLOAT,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			0,                    	// Master Rank
+			grid.cart_comm_3d );  	// MPI Communicator
+
+	// Total Leakage Reduction
+	MPI_Reduce( params.leakage,  	// Send Buffer
+			&leakage,      			// Receive Buffer
+			1,                    	// Element Count
+			MPI_FLOAT,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			0,                    	// Master Rank
+			grid.cart_comm_3d );  	// MPI Communicator
+
+	MPI_Barrier(grid.cart_comm_3d);
+
+	// calculate keff
+	float keff = tot_fission/ (tot_abs + leakage);
+	#else
+	float keff = node_fission / (node_abs + *params.leakage);
+	#endif
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// free memory
+	free(sigma);
+	free(group_rates);
+	free(fine_rates);
+	free(QSR_rates);
+
+	return keff;
+}
+
+/* Interpolates a formed exponential table to compute ( 1- exp(-x) )
+ *  at the desired x value */
+float interpolateTable( Table table, float x)
+{
+	// check to ensure value is in domain
+	if( x > table.maxVal )
+		return 1.0f;
+	else
+	{
+		int interval = (int) ( x / table.dx + 0.5f * table.dx );
+		/*
+		   if( interval >= table.N || interval < 0)
+		   {
+		   printf( "Interval = %d\n", interval);
+		   printf( "N = %d\n", table.N);
+		   printf( "x = %f\n", x);
+		   printf( "dx = %f\n", table.dx);
+		   exit(1);
+		   }
+		   */
+		float slope = table.values[ 2 * interval ];
+		float intercept = table.values[ 2 * interval + 1 ];
+		float val = slope * x + intercept;
+		return val;
+	}
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c
@@ -0,0 +1,232 @@
+#include"SimpleMOC_header.h"
+
+// intitialize source structures and intitialize XS values to random numbers
+Source * initialize_sources( Input I, size_t * nbytes )
+{
+	// Allocate space
+	Source * sources = (Source *) malloc( I.n_source_regions_per_node 
+			* sizeof(Source) );
+	*nbytes += I.n_source_regions_per_node * sizeof(Source);
+
+	// determine number of cross section and coarse axial regions
+	long n_xs_regions = I.n_source_regions_per_node / 8;
+	
+	///////////////////////////////////////////////////////////////////////////
+	
+	// Allocate scattering matrix matrix ptrs
+	float *** s_matrices = (float ***) malloc( n_xs_regions 
+			* sizeof(float**) );
+	*nbytes += n_xs_regions * sizeof(float **);
+
+	// Allocate space for ALL scattering matrix ptrs
+	float ** s_matrix_ptrs = (float **) malloc( n_xs_regions * I.n_egroups 
+			* sizeof(float *));
+	*nbytes += n_xs_regions * sizeof(float **);
+
+	// Allocate space for ALL scattering data
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) 
+		printf("Scattering data requires %zu MB of data...\n", 
+				n_xs_regions * I.n_egroups * I.n_egroups 
+				* sizeof(float) / 1024 / 1024);
+#endif
+	float * s_matrix_data = (float *) malloc( n_xs_regions * I.n_egroups
+		   	* I.n_egroups * sizeof(float));
+	*nbytes += n_xs_regions * I.n_egroups * I.n_egroups * sizeof(float);
+
+	// Stitch allocation ptrs together
+	for( long i = 0; i < n_xs_regions; i++ )
+		s_matrices[i] = &s_matrix_ptrs[i * I.n_egroups];
+
+	for( long i = 0; i < n_xs_regions; i++ )
+		for( long j = 0; j < I.n_egroups; j++ )
+			s_matrices[i][j] = &s_matrix_data[i * I.n_egroups * I.n_egroups 
+				+ j * I.n_egroups];
+
+	// Iniitalize Scattering Matrix Values
+	for( long i = 0; i < n_xs_regions; i++ )
+		for( long j = 0; j < I.n_egroups; j++ )
+			for( long k = 0; k < I.n_egroups; k++ )
+				s_matrices[i][j][k] = urand();
+
+	///////////////////////////////////////////////////////////////////////////
+	
+	/*
+	 * Create data scrtucture for storing XS data (and chi) as follows:
+	 * An array is created which stores in contigious memory as
+	 * [ ... , nu*SigmaF, SigmaA, Chi, ...]
+	 */
+	
+	if(I.mype==0) printf("Beginning XS Allocation...\n");
+
+	// Allocate space for XS ptrs (by region)
+	float *** XS = (float ***) malloc( n_xs_regions * sizeof(float **) );
+	*nbytes += n_xs_regions * sizeof(float **);
+
+	// Allocate space for each XS type of interest (total, nu*SigmaF, and chi)
+	float ** XS_arrays = (float **) malloc (n_xs_regions * I.n_egroups
+		   	* sizeof(float *) );
+	*nbytes += n_xs_regions * I.n_egroups * sizeof(float *);
+
+	// Allocate space for total XS data
+	float * XS_data = (float *) malloc( n_xs_regions * I.n_egroups * 3 
+			* sizeof(float) );
+	*nbytes += n_xs_regions * I.n_egroups * 3 * sizeof(float);
+
+	// stitch allocation ptrs together for XS data
+	for( long i = 0; i < n_xs_regions; i++)
+		XS[i] = &XS_arrays[i * I.n_egroups];
+
+	for( long i = 0; i < n_xs_regions; i++)
+		for(long j = 0; j < I.n_egroups; j++)
+			XS[i][j] = &XS_data[i * I.n_egroups * 3 + j * 3];
+
+	// Initialize XS data
+	for( long i = 0; i < n_xs_regions; i++)
+		for( int j = 0; j < I.n_egroups; j++)
+			for( int k = 0; k < 3; k++)
+				XS[i][j][k] = urand();
+
+	///////////////////////////////////////////////////////////////////////////
+
+	/* Here I allocate useful source parameters and fluxes for use in the
+	 * attenuate_fluxes function in solver.c - the parameters are arranged
+	 * contigiously for efficiency */
+
+	if(I.mype==0) printf("Beginning Source and Flux Parameter Allocation...\n");
+
+	// Allocate space for source parameters (quadratic axially)
+	float *** fineSource = (float ***) malloc( I.n_source_regions_per_node
+		   	* sizeof(float **) );
+	*nbytes += I.n_source_regions_per_node * sizeof(float **);
+
+	// Allocate space for array pointers to parameters
+	float ** fineSourcePtrs = (float **) malloc ( I.n_source_regions_per_node
+		   	* I.fai * sizeof(float *) );
+	*nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+
+	// Allocate space for flux parameters (quadratic axially)
+	float *** fineFlux = (float ***) malloc( I.n_source_regions_per_node
+		   	* sizeof(float **) );
+	*nbytes += I.n_source_regions_per_node * sizeof(float **);
+
+	// Allocate space for array pointers to parameters
+	float ** fineFluxPtrs = (float **) malloc ( I.n_source_regions_per_node
+		   	* I.fai * sizeof(float *) );
+	*nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+
+	// Allocate space for cross section pointers
+	float ** sigT = (float **) malloc( I.n_source_regions_per_node
+			* sizeof(float *) );
+
+	// Allocate space for parameter data
+	float * data = (float *) malloc( (2 * I.fai + 1) * 
+			I.n_source_regions_per_node * I.n_egroups * sizeof(float) );
+
+	*nbytes += I.n_source_regions_per_node * I.fai * I.n_egroups 
+		* sizeof(float);
+
+	///////////////////////////////////////////////////////////////////////////
+	
+	// stitch pointers together
+
+	// stitch allocation ptrs together for source data
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		fineSource[i] = &fineSourcePtrs[i * I.fai];
+
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for(long j = 0; j < I.fai; j++)
+			fineSource[i][j] = &data[i * I.fai * I.n_egroups
+			   	+ j * I.n_egroups];
+
+	// stitch allocation ptrs together for flux data
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		fineFlux[i] = &fineFluxPtrs[i * I.fai];
+
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for(int j = 0; j < I.fai; j++)
+			fineFlux[i][j] = &data[I.n_egroups * I.fai * 
+				(I.n_source_regions_per_node + i) + j * I.n_egroups];
+	
+	// stitch allocation ptrs together for total XS
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		sigT[i] = &data[ 2 * I.n_source_regions_per_node * I.fai * I.n_egroups 
+			+ i * I.n_egroups];
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// Initialize source to random numbers
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for( int j = 0; j < I.fai; j++)
+			for( int k = 0; k < I.n_egroups; k++)
+				fineSource[i][j][k] = urand();
+		
+	// Initialize fine flux to zeros
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for( int j = 0; j < I.fai; j++)
+			for( int k = 0; k < I.n_egroups; k++)
+				fineFlux[i][j][k] = 0;
+	
+
+	// Initialize total XS to random numbers
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for( int k = 0; k < I.n_egroups; k++)
+			sigT[i][k] = urand();
+
+	///////////////////////////////////////////////////////////////////////////
+	//
+	
+	#ifdef OPENMP
+	omp_lock_t * locks = init_locks(I);
+	long lock_idx = 0;
+	#endif
+
+	// Assign to source regions
+	for( long i = 0; i < I.n_source_regions_per_node; i++ )
+	{
+		long idx;
+		if( i == 0 )
+			idx = 0;
+		else
+			idx = rand() % n_xs_regions;
+
+		sources[i].scattering_matrix = s_matrices[idx];
+		sources[i].XS = XS[idx];
+		sources[i].fine_flux = fineFlux[i];
+		sources[i].fine_source = fineSource[i]; 
+		sources[i].sigT = sigT[i];
+
+		// initialize FSR volume
+		sources[i].vol = urand();
+
+		#ifdef OPENMP
+		sources[i].locks = &locks[lock_idx];
+		lock_idx += I.fai;
+		#endif
+	}
+
+	// free memory of temporary variables
+	free( s_matrices );
+	free( XS );
+	free( fineFlux );
+	free( fineSource);
+	free( sigT);
+
+	return sources;
+}
+
+void free_sources( Input I, Source * sources )
+{
+	// Free XS's
+	free( sources[0].XS[0] );
+	free( sources[0].XS );
+	// Free Flux's
+	free( sources[0].fine_flux[0] );
+	free( sources[0].fine_flux );
+	// Free scattering matrices
+	free( sources[0].scattering_matrix[0] );
+	free( sources[0].scattering_matrix );
+	// Free source values
+	free( sources[0].fine_source[0] );
+	free( sources[0].fine_source );
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c
@@ -0,0 +1,71 @@
+#include"SimpleMOC_header.h"
+
+// prints input parameters to screen
+void print_Input_struct( Input I )
+{
+	printf("INPUT STRUCT\n");
+	printf("%d\n", I.x_assemblies);    		/* Number of assemblies in the 
+												x-axis of the reactor */
+	printf("%d\n", I.y_assemblies);       	/* Number of assemblies in the 
+												y-axis of the reactor */
+	printf("%d\n", I.cai);                 	// Number of course axial intervals
+	printf("%d\n", I.fai);                 	/* Number of fine axial intervals 
+											   per coarse axial interval */
+	printf("%d\n", I.axial_exp);          	// Axial source expansion order
+	printf("%lf\n", I.radial_ray_sep);    	// Radial ray separation
+	printf("%lf\n", I.axial_z_sep);       	// Axial stacked z-ray separation
+	printf("%d\n", I.n_azimuthal);         	// Number of azimuthal angles
+	printf("%d\n", I.n_polar_angles);      	// Number of polar angles
+	printf("%d\n", I.n_egroups);           	// Number of energy groups
+	printf("%d\n", I.decomp_assemblies_ax);	/* Number of assemblies per 
+												sub-domain (axially) */
+	printf("%ld\n", I.segments_per_track); 	/* Average number of segments per 
+												track */
+	printf("%lf\n", I.assembly_width);    	// Width of an assembly
+	printf("%lf\n", I.height);            	// Height of the reactor
+	printf("%lf\n", I.domain_height);     	// Z Height of a domain
+	printf("%lf\n", I.precision);		  	// precision for source convergence
+	printf("%ld\n", I.mype);               	// MPI Rank
+	printf("%ld\n", I.ntracks_2D);        	// Number of 2D tracks (derived)
+	printf("%d\n", I.z_stacked);          	// Number of z rays (derived)
+	printf("%ld\n", I.ntracks);           	/* Total number of 3D tracks per 
+											   assembly (derived) */
+	printf("%d\n", I.nthreads);           	// Number of OpenMP Threads
+	printf("%d\n", I.papi_event_set);     	/* PAPI event set:
+												0 - FLOPS   
+												1 - Bandwidth   
+												2 - CPU Stall reason
+											*/
+	
+	// Source regions per assembly
+	printf("%ld\n", I.n_2D_source_regions_per_assembly);
+
+	// Source regions per node (derived)	
+	printf("%ld\n", I.n_source_regions_per_node); 
+	printf("END INPUT STRUCT\n");
+	
+}
+
+// generates normally distributed random numbers
+void gen_norm_pts(float mean, float sigma, int n_pts)
+{
+	// generate output file
+	FILE * out;
+	out = fopen("gen_points.txt","w");
+	fprintf(out, "Random numbers generated for a normal distribution\n");
+	fprintf(out, "Mean = %f\n", mean);
+	fprintf(out, "Standard deviation = %f\n", sigma);
+	fprintf(out, "Generated points:\n");
+
+	// generate gaussian random points
+	for(int i = 0; i < n_pts; i++)
+	{
+		float random = nrand(mean,sigma);	
+		fprintf(out, "%f\n", random);
+	}
+
+	// close stream
+	fclose(out);
+
+	return;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c
@@ -0,0 +1,327 @@
+#include"SimpleMOC_header.h"
+
+// Allocates and initializes an array of 2D tracks
+Track2D * generate_2D_tracks( Input I, size_t * nbytes )
+{
+	// Allocate space for 2D tracks
+	Track2D * tracks = (Track2D *) malloc( I.ntracks_2D * sizeof(Track2D));
+	*nbytes += I.ntracks_2D * sizeof(Track2D);
+
+	// Fill weights with randomized data
+	for( int i = 0; i < I.ntracks_2D; i++ )
+		tracks[i].az_weight = urand();
+
+	// Allocate and randomize segments
+	generate_2D_segments( I, tracks, nbytes );
+
+	return tracks;
+}
+
+// Allocates and initializes all segments
+void generate_2D_segments( 
+		Input I, 
+		Track2D * tracks, 
+		size_t * nbytes )
+{
+	/* Randomize Number of segments per track, and accumulate total 2D 
+	 * segments in assembly */
+	long total_segments = 0;
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		tracks[i].n_segments = segments_per_2D_track_distribution( I );
+		total_segments += tracks[i].n_segments;
+	}
+	
+	// Allocate contiguous space for segments
+	Segment * contiguous_segments = (Segment *) malloc( total_segments 
+			* sizeof(Segment));
+	
+	*nbytes += total_segments * sizeof(Segment);
+
+	// Set segments arrays to correct locations within contiguous allocation
+	long idx = 0;
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		tracks[i].segments = &contiguous_segments[idx];
+		idx += tracks[i].n_segments;
+	}
+
+	// Initialize segments to randomized values
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		for( long j = 0; j < tracks[i].n_segments; j++ )
+		{
+			tracks[i].segments[j].length  = urand() * I.assembly_width
+				/ tracks[i].n_segments;
+			// source ID to be calculated on the fly
+		}
+	}
+}
+
+// generate segments per 2D track based on a normal distribution
+long segments_per_2D_track_distribution( Input I )
+{
+	return nrand(I.segments_per_track, sqrt(I.segments_per_track));
+}
+
+// free memory associated with 2D tracks
+void free_2D_tracks( Track2D * tracks )
+{
+	free(tracks[0].segments);
+	free(tracks);
+}
+
+// allocate memory for tracks (primarily angular fluxes)
+Track *** generate_tracks(Input I, Track2D * tracks_2D, size_t * nbytes)
+{
+	// Allocate space for tracks (3D)
+	Track *** tracks = (Track ***) malloc( I.ntracks_2D * sizeof(Track **));
+	*nbytes += I.ntracks_2D * sizeof(Track **);
+
+	// Allocate pointers for tracks associated with a 2D track
+	Track ** tracks_in_track2D = (Track **) malloc( I.ntracks_2D *
+		   	I.n_polar_angles * sizeof(Track *));
+	*nbytes += I.ntracks_2D * I.n_polar_angles * sizeof(Track *);
+
+	// Allocate complete array of track data
+	Track * track_data = (Track *) malloc( I.ntracks * sizeof(Track) );
+	*nbytes += I.ntracks * sizeof(Track);
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) printf("3D track data requires %zu MB of data...\n", 
+			I.ntracks * sizeof(Track) / 1024 / 1024 );
+#endif
+
+	// stitch pointers together
+	for( long i = 0; i < I.ntracks_2D; i++ )
+		tracks[i] = &tracks_in_track2D[ i * I.n_polar_angles ];
+
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		for( int j = 0; j < I.n_polar_angles; j++ )
+		{
+			tracks[i][j] = &track_data[ 
+				(i * I.n_polar_angles + j) * I.z_stacked ];
+		}
+	}
+
+	// Allocate space for Flux Arrays
+	size_t flux_bytes_needed = 2 * I.ntracks * I.n_egroups * sizeof(float);
+	
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) printf("Flux Arrays Require %zu MB of data...\n", 
+			flux_bytes_needed / 1024 / 1024);
+#endif
+
+	float * flux_space = (float *) malloc( flux_bytes_needed );
+	*nbytes += flux_bytes_needed;
+	size_t flux_idx = 0;
+
+	long offset = I.ntracks * I.n_egroups;
+
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		for( int j = 0; j < I.n_polar_angles; j++ )
+		{
+			for( int k = 0; k < I.z_stacked; k++ )
+			{
+				/* bottom z heights should only have upward directed polar
+				 * angles and  similarly top should only have downward directed
+				 * polar angles */
+				if( j < I.n_polar_angles/2 )
+					tracks[i][j][k].z_height = I.axial_z_sep * k;
+				else
+					tracks[i][j][k].z_height = I.axial_z_sep * ( k + 1 );
+				
+				// set polar weight, NOTE: this is the same for same polar angle
+				tracks[i][j][k].p_weight = urand();
+
+				// assign angular flux array memory
+				tracks[i][j][k].f_psi = &flux_space[flux_idx];
+				flux_idx += I.n_egroups;
+				tracks[i][j][k].b_psi = &flux_space[flux_idx];
+				flux_idx += I.n_egroups;
+
+				// set incoming flux to 0 (guess 0 for inner assemblies)
+				for( int g = 0; g < I.n_egroups; g++)
+				{
+					tracks[i][j][k].f_psi[g] = 0;
+					tracks[i][j][k].b_psi[g] = 0;
+				}
+			}
+		}
+	}
+	return tracks;
+}
+
+// free memory associated with tracks
+void free_tracks( Track *** tracks )
+{
+	free(tracks);
+}
+
+// assign polar angles
+float * generate_polar_angles( Input I )
+{
+	float * polar_angles = (float *) malloc( I.n_polar_angles * 
+			sizeof(float) );
+
+	for( int i = 0; i < I.n_polar_angles; i++)
+		polar_angles[i] = M_PI * (i + 0.5) / I.n_polar_angles;
+	
+	return polar_angles;
+}
+
+Track2D * load_OpenMOC_tracks(char* fname, bool CMFD_obj, Input* I, size_t* nbytes)
+{
+	int ret;
+	FILE* in;
+	in = fopen(fname, "r");
+	printf("Reading track data from:\n%s\n",fname);
+	int string_length;
+
+	/* Import Geometry metadata from the Track file */
+	ret = fread(&string_length, sizeof(int), 1, in);
+	char geometry_to_string[string_length];
+	ret = fread(geometry_to_string, sizeof(char)*string_length, 1, in);
+	printf("Importing ray tracing data from file...\n");
+	
+	/* Import ray tracing metadata from the Track file */
+	double spacing;
+	ret = fread(&I->n_azimuthal, sizeof(int), 1, in);
+	ret = fread(&spacing, sizeof(double), 1, in);
+	I->radial_ray_sep = (float) spacing;
+
+	/* Initialize data structures for Tracks */
+	int num_tracks[I->n_azimuthal];
+	int num_x[I->n_azimuthal];
+	int num_y[I->n_azimuthal];
+	double azim_weights[I->n_azimuthal];
+	
+	ret = fread(num_tracks, sizeof(int), I->n_azimuthal, in);
+	ret = fread(num_x, sizeof(int), I->n_azimuthal, in);
+	ret = fread(num_y, sizeof(int), I->n_azimuthal, in);
+	ret = fread(azim_weights, sizeof(double), I->n_azimuthal, in);
+	
+	/* Import azimuthal angle quadrature weights from Track file */
+	double x0, y0, x1, y1;
+	double phi;
+	int azim_angle_index;
+	int num_segments;
+	double length;
+	int material_id;
+	int region_id;
+	int mesh_surface_fwd;
+	int mesh_surface_bwd;
+	long num_2D_tracks;
+
+	/* Calculate the total number of 2D Tracks */
+	I->ntracks_2D = 0;
+	for (int i=0; i < I->n_azimuthal; i++)
+		I->ntracks_2D += (long) num_tracks[i];	
+	
+	/* Allocate memory for 2D tracks */
+	Track2D * tracks = (Track2D *) malloc( I->ntracks_2D * sizeof(Track2D));
+	*nbytes += I->ntracks_2D * sizeof(Track2D);
+
+	// Allocate memory for the number of segments per Track array
+	long tot_num_segments = 0;
+
+	fpos_t checkpoint;
+	fgetpos(in, &checkpoint);
+
+	// Loop over tracks to determine total number of segments
+	for (int i=0; i < I->n_azimuthal; i++)
+	{
+		for (int j=0; j < num_tracks[i]; j++)
+		{
+			// Skip to segment number information
+			fseek( in, 5*sizeof(double) + sizeof(int), SEEK_CUR);
+			
+			// Load number of segments
+			ret = fread(&num_segments, sizeof(int), 1, in);
+			tot_num_segments += (long) num_segments;
+
+			// Loop over all segments in this Track
+			for (int s=0; s < num_segments; s++)
+			{
+				// Skip ahead again 
+				fseek( in, 2*sizeof(int) + sizeof(double), SEEK_CUR);
+				if (CMFD_obj)
+					fseek( in, 2*sizeof(int), SEEK_CUR);
+			}
+		}
+	}
+
+	// Allocate contiguous space for segments 
+	Segment * contiguous_segments = (Segment *) malloc( tot_num_segments 
+			* sizeof(Segment));
+	*nbytes += tot_num_segments * sizeof(Segment);
+
+	// Reset file handle 
+	fsetpos(in, &checkpoint);
+	
+	/* Initialize 2D track unique identifer (uid) end semgnet index (idx)*/
+	int uid = 0;
+	int idx = 0;
+	
+	/* Loop over Tracks */
+	for (int i=0; i < I->n_azimuthal; i++)
+	{
+		//TODO: recored actual azimuthal angles
+		for (int j=0; j < num_tracks[i]; j++)
+		{
+			/* Import data for this Track from Track file */
+			ret = fread(&x0, sizeof(double), 1, in);
+			ret = fread(&y0, sizeof(double), 1, in);
+			ret = fread(&x1, sizeof(double), 1, in);
+			ret = fread(&y1, sizeof(double), 1, in);
+			ret = fread(&phi, sizeof(double), 1, in);
+			ret = fread(&azim_angle_index, sizeof(int), 1, in);
+			
+			/* Load number of segments */
+			ret = fread(&num_segments, sizeof(int), 1, in);
+			tracks[uid].n_segments = (long) num_segments;
+
+			/* Allocate memory for segments */
+			tracks[uid].segments = &contiguous_segments[idx];
+			idx += tracks[uid].n_segments;
+
+			// TODO: set real azimuthal weight
+			tracks[uid].az_weight = urand();
+		
+			/* Loop over all segments in this Track */
+			for (int s=0; s < num_segments; s++)
+			{
+				/* Import data for this segment from Track file */
+				ret = fread(&length, sizeof(double), 1, in);
+				ret = fread(&material_id, sizeof(int), 1, in);
+				ret = fread(&region_id, sizeof(int), 1, in);
+			
+				/* record track length */
+				tracks[uid].segments[s].length = (float) length;
+
+				/* record region_id (not used at the moment) */
+				tracks[uid].segments[s].source_id = (long) region_id;
+
+				/* Import CMFD-related data if needed */
+				if (CMFD_obj)
+				{
+					ret = fread(&mesh_surface_fwd, sizeof(int), 1, in);
+					ret = fread(&mesh_surface_bwd, sizeof(int), 1, in);
+				}
+			}
+			uid++;
+		}
+	}
+	// recalculate average number of segments per track
+	I->segments_per_track = tot_num_segments / I->ntracks_2D;	
+
+	printf("Number of 2D tracks = %ld\n", I->ntracks_2D);
+	I->ntracks = I->ntracks_2D * I->n_polar_angles * I->z_stacked; 
+	printf("Number of 3D tracks = %ld\n", I->ntracks);
+	printf("Number of segments = %ld\n", tot_num_segments);
+
+	/* Close the Track file */
+	fclose(in);
+	return tracks;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c
@@ -0,0 +1,155 @@
+#include"SimpleMOC_header.h"
+
+// generates a random number between 0 and 1
+float urand(void)
+{
+	return (float)rand() / (float)RAND_MAX;
+}
+
+/* generates a random number from a normal distribution of a 
+ * given mean and standard deviation (Box-Muller) */
+float nrand(float mean, float sigma)
+{
+	// generate two random numbers
+	float rand1 = urand();
+	float rand2 = urand();
+
+	/* use first for "exponential part" and second for
+	 *  "azimuthal part" to create a standard normal random number */
+	float x = sqrt( -2 * log(rand1) ) * cos( 2 * M_PI * rand2);
+
+	// NOTE: We can generate a second random number if neeeded
+	// y = sqrt(-2*log(rand1)) * sin( 2 * M_PI * rand2);
+	
+	// shift random number to appropriate normal distribution and return
+	return x * sigma + mean;
+}
+
+// pairwise summation for long arrays
+float pairwise_sum( float * vector, long size ){
+	float sum = 0;
+
+	// Base case: perform summation if size <= 16
+	if( size <= 16)
+		for( int i = 0; i < size; i++ )
+			sum += vector[i];
+
+	else
+	{
+		// otherwise, split
+		sum = pairwise_sum( &vector[0], size/2 ) +
+			pairwise_sum( &vector[size/2], size - size/2 );
+	}
+	
+	return sum;
+}
+
+// Builds a table of exponential values for linear interpolation
+Table buildExponentialTable( float precision, float maxVal )
+{
+	// define table
+	Table table;
+
+	// compute number of arry values
+	int N = (int) ( maxVal * sqrt(1.0 / ( 8.0 * precision * 0.01 ) ) ); 
+
+	// compute spacing
+	float dx = maxVal / (float) N;
+
+	// allocate an array to store information
+	float * tableVals = malloc( (2 * N ) * sizeof(float) );
+
+	// store linear segment information (slope and y-intercept)
+	for( int n = 0; n < N; n++ )
+	{
+		// compute slope and y-intercept for ( 1 - exp(-x) )
+		float exponential = exp( - n * dx );
+		tableVals[ 2*n ] = - exponential;
+		tableVals[ 2*n + 1 ] = 1 + ( n * dx - 1 ) * exponential;
+	}
+
+	// assign data to table
+	table.dx = dx;
+	table.values = tableVals;
+	table.maxVal = maxVal - table.dx;
+	table.N = N;
+
+	return table;
+}
+
+// Timer function. Depends on if compiled with MPI, openmp, or vanilla
+double get_time(void)
+{
+	#ifdef MPI
+	return MPI_Wtime();
+	#endif
+
+	#ifdef OPENMP
+	return omp_get_wtime();
+	#endif
+
+	time_t time;
+	time = clock();
+
+	return (double) time / (double) CLOCKS_PER_SEC;
+}
+
+// Memory Usage Estimator
+size_t est_mem_usage( Input I )
+{
+	size_t nbytes = 0;
+	
+	int z_stacked = (int) ( I.height / (I.axial_z_sep * 
+				I.decomp_assemblies_ax) );
+	
+	long n_xs_regions = I.n_source_regions_per_node / 8;
+
+	nbytes += I.ntracks_2D * sizeof(Track2D);
+	nbytes += I.segments_per_track * I.ntracks_2D * sizeof(Segment);
+	nbytes += I.ntracks_2D * sizeof(Track **);
+	nbytes += I.ntracks_2D * I.n_polar_angles * sizeof(Track *);
+	nbytes += I.ntracks * sizeof(Track);
+	nbytes += I.ntracks_2D * I.n_polar_angles * z_stacked 
+		* I.n_egroups * sizeof(float) * 2;
+	nbytes += I.n_source_regions_per_node * sizeof(Source);
+	nbytes += n_xs_regions * sizeof(float **);
+	nbytes += n_xs_regions * sizeof(float **);
+	nbytes += n_xs_regions * I.n_egroups * I.n_egroups * sizeof(float);
+	nbytes += n_xs_regions * sizeof(float **);
+	nbytes += n_xs_regions * I.n_egroups * sizeof(float *);
+	nbytes += n_xs_regions * I.n_egroups * 3 * sizeof(float);
+	nbytes += I.n_source_regions_per_node * sizeof(float **);
+	nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+	nbytes += I.n_source_regions_per_node * sizeof(float **);
+	nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+	nbytes += I.n_source_regions_per_node * I.fai * I.n_egroups
+		* sizeof(float);
+	
+	// 2way tracking memory
+	nbytes += I.nthreads * I.z_stacked * sizeof(double *);
+	nbytes += I.nthreads * I.z_stacked * sizeof(Source**);
+	nbytes += I.nthreads * I.z_stacked * sizeof(int);
+	nbytes += I.nthreads * I.z_stacked * sizeof(int);			
+	nbytes += I.nthreads * I.z_stacked * 2 * I.segments_per_track 
+		* sizeof(double);
+	nbytes += I.nthreads * I.z_stacked * 2 * I.segments_per_track 
+		* sizeof(Source *);		
+
+	// MPI Buffers
+	#ifdef MPI
+	nbytes += 6 * I.n_egroups * 10000 * sizeof(float);
+	#endif
+	
+	return nbytes;
+}
+
+// Calculates Time per Intersection
+double time_per_intersection( Input I, double time )
+{
+	/* This was the old estimate - new way uses actual tracking data
+	double tpi = time / (double) I.ntracks /
+		(double) I.segments_per_track / (double) I.n_egroups / 1e-9 / 2; 
+		*/
+	double tpi = time / (double) I.segments_processed * 1.0e9 / (double) I.n_egroups;
+	return tpi;
+}