Index: LICENSE.TXT =================================================================== --- LICENSE.TXT +++ LICENSE.TXT @@ -63,6 +63,7 @@ ------- --------- Autoconf: llvm-test/autoconf Benchmark: llvm-test/libs/benchmark-1.1.0 +LCALS: llvm-test/MicroBenchmarks/LCALS Burg: llvm-test/MultiSource/Applications/Burg Aha: llvm-test/MultiSource/Applications/aha SGEFA: llvm-test/MultiSource/Applications/sgefa Index: MicroBenchmarks/LCALS/CMakeLists.txt =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/CMakeLists.txt @@ -0,0 +1,6 @@ +add_subdirectory(SubsetARawLoops) +add_subdirectory(SubsetALambdaLoops) +add_subdirectory(SubsetBRawLoops) +add_subdirectory(SubsetBLambdaLoops) +add_subdirectory(SubsetCRawLoops) +add_subdirectory(SubsetCLambdaLoops) Index: MicroBenchmarks/LCALS/LCALSParams.hxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/LCALSParams.hxx @@ -0,0 +1,836 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Header file with macros and constants for data types, execution, +// timing options, etc. used in LCALS +// + +#ifndef LCALSParams_HXX +#define LCALSParams_HXX + + +//////////////////////////////////////////////////////////////////////////////// +// +// This file contains various parameters that control compilation and some +// aspects of execution of the loop suite. The macro constants and typedefs in +// this file provide the ability to make changes that will propagate +// throughout the LCALS code when compiled. 
Parameters in this file specify:
+//
+//  o Timing and checksum output options
+//  o Scalar data types and pointer types (e.g., restrict & alignment properties)
+//  o Loop variants that can be built by each compiler
+//  o Loop execution policies for traversal templates (used with loop bodies
+//    represented as lambda expressions or functors)
+//
+//
+// IMPORTANT: MANY OF THE MACROS CONTROLLING THESE OPTIONS
+//            ARE SET IN THE LCALS_rules.mk FILE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(LCALS_VERIFY_CHECKSUM_ABBREVIATED)
+static const int num_checksum_suite_passes = 1;
+static const int num_checksum_samples = 3;
+#endif
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Define/undefine macro constants and other parameters used to control data
+// type definitions.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+//
+// Parameterized scalar data types.
+//
+
+typedef int Index_type;
+
+#if defined(LCALS_USE_DOUBLE)
+/// Real_type is double precision when LCALS_USE_DOUBLE is defined.
+typedef double Real_type;
+
+#elif defined(LCALS_USE_FLOAT)
+/// Real_type is single precision when LCALS_USE_FLOAT is defined.
+typedef float Real_type;
+
+
+#else
+#error LCALS Real_type is undefined!
+
+#endif
+
+#include <complex>
+typedef std::complex<Real_type> Complex_type;
+
+//
+// Use volatile keyword on loop variable for sampling loops to prevent
+// compilers from potentially optimizing out loops where result is
+// identical for each sample iteration.
+//
+typedef volatile int SampIndex_type;
+//
+// Use unsigned long for loop variable used in loops to flush cache to
+// allow for large caches with size possibly bigger than what int can address.
+//
+typedef unsigned long CacheIndex_type;
+
+//
+// Floating point array data alignment value. Typically, same as
+// SIMD vector width.
+//
+const int LCALS_DATA_ALIGN = 32;
+
+
+//
+// Compiler-specific definitions for inline directives, data alignment
+// intrinsics, and SIMD vector pragmas
+//
+// Variables for compiler intrinsics, directives, typedefs
+//
+// LCALS_INLINE - macro to enforce method inlining
+//
+// LCALS_ALIGN_DATA() - macro to express alignment of data,
+//                      loop bounds, etc.
+//
+// LCALS_SIMD - macro to express SIMD vectorization pragma to force
+//              loop vectorization
+//
+
+#if defined(LCALS_COMPILER_ICC)
+//
+// Configuration options for Intel compilers
+//
+
+#define LCALS_INLINE inline __attribute__((always_inline))
+
+#if __ICC < 1300 // use alignment intrinsic
+#define LCALS_ALIGN_DATA(d) __assume_aligned(d, LCALS_DATA_ALIGN)
+#else
+#define LCALS_ALIGN_DATA(d) // TODO: Define this... (currently expands to nothing)
+#endif
+
+#define LCALS_SIMD // TODO: Define this... (currently expands to nothing)
+
+
+#elif defined(LCALS_COMPILER_GNU)
+//
+// Configuration options for GNU compilers
+//
+
+#define LCALS_INLINE inline __attribute__((always_inline))
+
+#define LCALS_ALIGN_DATA(d) __builtin_assume_aligned(d, LCALS_DATA_ALIGN)
+
+#define LCALS_SIMD // TODO: Define this... (currently expands to nothing)
+
+
+#elif defined(LCALS_COMPILER_XLC12)
+//
+// Configuration options for xlc v12 compiler (i.e., bgq/sequoia).
+//
+
+#define LCALS_INLINE inline __attribute__((always_inline))
+
+#define LCALS_ALIGN_DATA(d) __alignx(LCALS_DATA_ALIGN, d)
+
+//#define LCALS_SIMD _Pragma("simd_level(10)")
+#define LCALS_SIMD // TODO: Define this... (currently expands to nothing)
+
+
+#elif defined(LCALS_COMPILER_CLANG)
+//
+// Configuration options for clang compilers
+//
+
+#define LCALS_INLINE inline __attribute__((always_inline))
+
+#define LCALS_ALIGN_DATA(d) // TODO: Define this... (currently expands to nothing)
+
+#define LCALS_SIMD // TODO: Define this... (currently expands to nothing)
+
+
+#else
+#error LCALS compiler is undefined!
+
+#endif
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// The following items include some setup items for pointer type definitions
+// that follow.
+// +//////////////////////////////////////////////////////////////////////////////// + +#if defined(LCALS_COMPILER_ICC) +// +// alignment attribute supported for versions > 12 +// +#if __ICC >= 1300 +typedef Real_type* __restrict__ __attribute__((align_value(LCALS_DATA_ALIGN))) TDRAReal_ptr; + +typedef const Real_type* __restrict__ __attribute__((align_value(LCALS_DATA_ALIGN))) const_TDRAReal_ptr; +#endif + + +#elif defined(LCALS_COMPILER_GNU) +// +// Nothing here for now because alignment attribute is not working... +// + + +#elif defined(LCALS_COMPILER_XLC12) +extern +#ifdef __cplusplus +"builtin" +#endif +void __alignx(int n, const void* addr); + + +#elif defined(LCALS_COMPILER_CLANG) +typedef Real_type aligned_real_type __attribute__((aligned (LCALS_DATA_ALIGN))); +typedef aligned_real_type* __restrict__ TDRAReal_ptr; + +typedef const aligned_real_type* __restrict__ const_TDRAReal_ptr; + +#else +#error LCALS compiler is undefined! + +#endif + + +#if defined(LCALS_USE_PTR_CLASS) +/*! + ****************************************************************************** + * + * \brief Class representing a restricted Real_type const pointer. + * + ****************************************************************************** + */ +class ConstRestrictRealPtr +{ +public: + + /// + /// Ctors and assignment op. + /// + + ConstRestrictRealPtr() : dptr(0) { ; } + + ConstRestrictRealPtr(const Real_type* d) : dptr(d) { ; } + + ConstRestrictRealPtr& operator=(const Real_type* d) { + ConstRestrictRealPtr copy(d); + std::swap(dptr, copy.dptr); + return *this; + } + + /// + /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op. + /// + + /// + /// Implicit conversion operator to bare const pointer. + /// + operator const Real_type*() { return dptr; } + + /// + /// "Explicit conversion operator" to bare const pointer, + /// consistent with boost shared ptr. + /// + const Real_type* get() const { return dptr; } + + /// + /// Bracket operator. 
+ /// + const Real_type& operator [] (Index_type i) const + { + return( (const Real_type* __restrict__) dptr)[i]; + } + + /// + /// + operator for pointer arithmetic. + /// + const Real_type* operator+ (Index_type i) const { return dptr+i; } + +private: + const Real_type* dptr; +}; + + +/*! + ****************************************************************************** + * + * \brief Class representing a restricted Real_type (non-const) pointer. + * + ****************************************************************************** + */ +class RestrictRealPtr +{ +public: + + /// + /// Ctors and assignment op. + /// + + RestrictRealPtr() : dptr(0) { ; } + + RestrictRealPtr(Real_type* d) : dptr(d) { ; } + + RestrictRealPtr& operator=(Real_type* d) { + RestrictRealPtr copy(d); + std::swap(dptr, copy.dptr); + return *this; + } + + /// + /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op. + /// + + /// + /// Implicit conversion operator to (non-const) bare pointer. + /// + operator Real_type*() { return dptr; } + + /// + /// Implicit conversion operator to const bare pointer. + /// + operator const Real_type*() const { return dptr; } + + /// + /// "Explicit conversion operator" to (non-const) bare pointer, + /// consistent with boost shared ptr. + /// + Real_type* get() { return dptr; } + + /// + /// "Explicit conversion operator" to const bare pointer, + /// consistent with boost shared ptr. + /// + const Real_type* get() const { return dptr; } + + /// + /// Operator that enables implicit conversion from RestrictRealPtr to + /// RestrictRealConstPtr. + /// + operator ConstRestrictRealPtr () + { return ConstRestrictRealPtr(dptr); } + + + /// + /// Bracket operator. + /// + Real_type& operator [] (Index_type i) + { + return( (Real_type* __restrict__) dptr)[i]; + } + + /// + /// + operator for (non-const) pointer arithmetic. + /// + Real_type* operator+ (Index_type i) { return dptr+i; } + + /// + /// + operator for const pointer arithmetic. 
+ /// + const Real_type* operator+ (Index_type i) const { return dptr+i; } + +private: + Real_type* dptr; +}; + + +/*! + ****************************************************************************** + * + * \brief Class representing a restricted aligned Real_type const pointer. + * + ****************************************************************************** + */ +class ConstRestrictAlignedRealPtr +{ +public: + + /// + /// Ctors and assignment op. + /// + + ConstRestrictAlignedRealPtr() : dptr(0) { ; } + + ConstRestrictAlignedRealPtr(const Real_type* d) : dptr(d) { ; } + + ConstRestrictAlignedRealPtr& operator=(const Real_type* d) { + ConstRestrictAlignedRealPtr copy(d); + std::swap(dptr, copy.dptr); + return *this; + } + + /// + /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op. + /// + + /// + /// Implicit conversion operator to bare const pointer. + /// + operator const Real_type*() { return dptr; } + + /// + /// "Explicit conversion operator" to bare const pointer, + /// consistent with boost shared ptr. + /// + const Real_type* get() const { return dptr; } + + /// + /// Compiler-specific bracket operators. 
+ /// +#if defined(LCALS_COMPILER_ICC) + /// + const Real_type& operator [] (Index_type i) const + { +#if __ICC < 1300 // use alignment intrinsic + LCALS_ALIGN_DATA(dptr); + return( (const Real_type* __restrict__) dptr)[i]; +#else // use alignment attribute + return( (const_TDRAReal_ptr) dptr)[i]; +#endif + } + + +#elif defined(LCALS_COMPILER_GNU) + /// + const Real_type& operator [] (Index_type i) const + { +#if 1 // NOTE: alignment instrinsic not available for older GNU compilers + return( (const Real_type* __restrict__) LCALS_ALIGN_DATA(dptr) )[i]; +#else + return( (const Real_type* __restrict__) dptr)[i]; +#endif + } + + +#elif defined(LCALS_COMPILER_XLC12) + const Real_type& operator [] (Index_type i) const + { + LCALS_ALIGN_DATA(dptr); + return( (const Real_type* __restrict__) dptr)[i]; + } + + +#elif defined(LCALS_COMPILER_CLANG) + const Real_type& operator [] (Index_type i) const + { + return( (const_TDRAReal_ptr) dptr)[i]; + } + + +#else +#error LCALS compiler macro is undefined! + +#endif + + /// + /// + operator for pointer arithmetic. + /// + const Real_type* operator+ (Index_type i) const { return dptr+i; } + +private: + const Real_type* dptr; +}; + + +/*! + ****************************************************************************** + * + * \brief Class representing a restricted aligned Real_type (non-const) pointer. + * + ****************************************************************************** + */ +class RestrictAlignedRealPtr +{ +public: + + /// + /// Ctors and assignment op. + /// + + RestrictAlignedRealPtr() : dptr(0) { ; } + + RestrictAlignedRealPtr(Real_type* d) : dptr(d) { ; } + + RestrictAlignedRealPtr& operator=(Real_type* d) { RestrictAlignedRealPtr copy(d); + std::swap(dptr, copy.dptr); + return *this; } + + /// + /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op. + /// + + /// + /// Implicit conversion operator to (non-const) bare pointer. 
+ /// + operator Real_type*() { return dptr; } + + /// + /// Implicit conversion operator to const bare pointer. + /// + operator const Real_type*() const { return dptr; } + + /// + /// "Explicit conversion operator" to (non-const) bare pointer, + /// consistent with boost shared ptr. + /// + Real_type* get() { return dptr; } + + /// + /// "Explicit conversion operator" to const bare pointer, + /// consistent with boost shared ptr. + /// + const Real_type* get() const { return dptr; } + + /// + /// Operator that enables implicit conversion from RestrictAlignedRealPtr to + /// RestrictAlignedRealConstPtr. + /// + operator ConstRestrictAlignedRealPtr () + { return ConstRestrictAlignedRealPtr(dptr); } + + + /// + /// Compiler-specific bracket operators. + /// + +#if defined(LCALS_COMPILER_ICC) + /// + Real_type& operator [] (Index_type i) + { +#if __ICC < 1300 // use alignment intrinsic + LCALS_ALIGN_DATA(dptr); + return( (Real_type* __restrict__) dptr)[i]; +#else // use alignment attribute + return( (TDRAReal_ptr) dptr)[i]; +#endif + } + + /// + const Real_type& operator [] (Index_type i) const + { +#if __ICC < 1300 // use alignment intrinsic + LCALS_ALIGN_DATA(dptr); + return( (Real_type* __restrict__) dptr)[i]; +#else // use alignment attribute + return( (TDRAReal_ptr) dptr)[i]; +#endif + } + +#elif defined(LCALS_COMPILER_GNU) + /// + Real_type& operator [] (Index_type i) + { +#if 1 // NOTE: alignment instrinsic not available for older GNU compilers + return( (Real_type* __restrict__) LCALS_ALIGN_DATA(dptr) )[i]; +#else + return( (Real_type* __restrict__) dptr)[i]; +#endif + } + + /// + const Real_type& operator [] (Index_type i) const + { +#if 1 // NOTE: alignment instrinsic not available for older GNU compilers + return( (Real_type* __restrict__) LCALS_ALIGN_DATA(dptr) )[i]; +#else + return( (Real_type* __restrict__) dptr)[i]; +#endif + } + + +#elif defined(LCALS_COMPILER_XLC12) + /// + Real_type& operator [] (Index_type i) + { + LCALS_ALIGN_DATA(dptr); + return( 
(Real_type* __restrict__) dptr)[i]; + } + + /// + const Real_type& operator [] (Index_type i) const + { + LCALS_ALIGN_DATA(dptr); + return( (Real_type* __restrict__) dptr)[i]; + } + + +#elif defined(LCALS_COMPILER_CLANG) + /// + Real_type& operator [] (Index_type i) + { + return( (TDRAReal_ptr) dptr)[i]; + } + + /// + const Real_type& operator [] (Index_type i) const + { + return( (TDRAReal_ptr) dptr)[i]; + } + + +#else +#error LCALS compiler macro is undefined! + +#endif + + /// + /// + operator for (non-const) pointer arithmetic. + /// + Real_type* operator+ (Index_type i) { return dptr+i; } + + /// + /// + operator for const pointer arithmetic. + /// + const Real_type* operator+ (Index_type i) const { return dptr+i; } + +private: + Real_type* dptr; +}; + + +/*! + ****************************************************************************** + * + * \brief Class representing a restricted Complex_type const pointer. + * + ****************************************************************************** + */ +class ConstRestrictComplexPtr +{ +public: + + /// + /// Ctors and assignment op. + /// + + ConstRestrictComplexPtr() : dptr(0) { ; } + + ConstRestrictComplexPtr(const Complex_type* d) : dptr(d) { ; } + + ConstRestrictComplexPtr& operator=(const Complex_type* d) { + ConstRestrictComplexPtr copy(d); + std::swap(dptr, copy.dptr); + return *this; + } + + /// + /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op. + /// + + /// + /// Implicit conversion operator to bare const pointer. + /// + operator const Complex_type*() const { return dptr; } + + /// + /// "Explicit conversion operator" to bare const pointer, + /// consistent with boost shared ptr. + /// + const Complex_type* get() const { return dptr; } + + /// + /// Bracket operator. + /// + const Complex_type& operator [] (Index_type i) const + { + return( (const Complex_type* __restrict__) dptr)[i]; + } + + /// + /// + operator for pointer arithmetic. 
+ /// + const Complex_type* operator+ (Index_type i) const { return dptr+i; } + +private: + const Complex_type* dptr; +}; + + +/*! + ****************************************************************************** + * + * \brief Class representing a restricted Complex_type (non-const) pointer. + * + ****************************************************************************** + */ +class RestrictComplexPtr +{ +public: + + /// + /// Ctors and assignment op. + /// + + RestrictComplexPtr() : dptr(0) { ; } + + RestrictComplexPtr(Complex_type* d) : dptr(d) { ; } + + RestrictComplexPtr& operator=(Complex_type* d) { RestrictComplexPtr copy(d); + std::swap(dptr, copy.dptr); + return *this; } + + /// + /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op. + /// + + /// + /// Implicit conversion operator to (non-const) bare pointer. + /// + operator Complex_type*() { return dptr; } + + /// + /// Implicit conversion operator to const bare pointer. + /// + operator const Complex_type*() const { return dptr; } + + /// + /// "Explicit conversion operator" to (non-const) bare pointer, + /// consistent with boost shared ptr. + /// + Complex_type* get() { return dptr; } + + /// + /// "Explicit conversion operator" to const bare pointer, + /// consistent with boost shared ptr. + /// + const Complex_type* get() const { return dptr; } + + /// + /// Operator that enables implicit conversion from RestrictComplexPtr to + /// RestrictComplexConstPtr. + /// + operator ConstRestrictComplexPtr () + { return ConstRestrictComplexPtr(dptr); } + + /// + /// (Non-const) bracket operator. + /// + Complex_type& operator [] (Index_type i) + { + return( (Complex_type* __restrict__) dptr)[i]; + } + + /// + /// Const bracket operator. + /// + const Complex_type& operator [] (Index_type i) const + { + return( (Complex_type* __restrict__) dptr)[i]; + } + + /// + /// + operator for (non-const) pointer arithmetic. 
+ /// + Complex_type* operator+ (Index_type i) { return dptr+i; } + + /// + /// + operator for const pointer arithmetic. + /// + const Complex_type* operator+ (Index_type i) const { return dptr+i; } + +private: + Complex_type* dptr; +}; +#endif // defined(LCALS_USE_PTR_CLASS) + + +/* + ****************************************************************************** + * + * Finally, we define data pointer types based on definitions above and + * -D value given at compile time. + * + ****************************************************************************** + */ +#if defined(LCALS_USE_BARE_PTR) +typedef Real_type* Real_ptr; +typedef const Real_type* const_Real_ptr; +typedef Complex_type* Complex_ptr; +typedef const Complex_type* const_Complex_ptr; + +typedef Real_type* UnalignedReal_ptr; +typedef const Real_type* const_UnalignedReal_ptr; + + +#elif defined(LCALS_USE_RESTRICT_PTR) +typedef Real_type* __restrict__ Real_ptr; +typedef const Real_type* __restrict__ const_Real_ptr; +typedef Complex_type* __restrict__ Complex_ptr; +typedef const Complex_type* __restrict__ const_Complex_ptr; + +typedef Real_type* __restrict__ UnalignedReal_ptr; +typedef const Real_type* __restrict__ const_UnalignedReal_ptr; + + +#elif defined(LCALS_USE_RESTRICT_ALIGNED_PTR) +typedef TDRAReal_ptr Real_ptr; +typedef const_TDRAReal_ptr const_Real_ptr; +typedef Complex_type* __restrict__ Complex_ptr; +typedef const Complex_type* __restrict__ const_Complex_ptr; + +typedef Real_type* __restrict__ UnalignedReal_ptr; +typedef const Real_type* __restrict__ const_UnalignedReal_ptr; + + +#elif defined(LCALS_USE_PTR_CLASS) +typedef RestrictAlignedRealPtr Real_ptr; +typedef ConstRestrictAlignedRealPtr const_Real_ptr; +typedef RestrictComplexPtr Complex_ptr; +typedef ConstRestrictComplexPtr const_Complex_ptr; + +typedef RestrictRealPtr UnalignedReal_ptr; +typedef ConstRestrictRealPtr const_UnalignedReal_ptr; + + +#else +#error LCALS pointer type is undefined! 
+
+#endif
+
+
+//
+// By default, all loop variants defined as supported in the
+// compiler-specific sections above are turned on here (for
+// both compilation and execution).
+//
+// Loop variants can be turned off via the #define/#undef macros below.
+// It's a bit cheesy, but it's simple and it works!
+//
+// Execution of individual loop variants can also be controlled
+// by modifying the strings that get added to the run_loop_variants
+// vector in main.cxx
+//
+
+
+//
+// Execution policies applicable to "forall" loop variants.
+// Traversal method templates are defined in LCALSTraversalMethods.hxx
+// header files.
+//
+
+// Tag struct types define available forall method execution policies
+struct seq_exec {};
+struct simd_exec {};
+struct omp_parallel_for_exec {};
+struct omp_for_nowait_exec {};
+
+//
+// Execution policy used in (non-OpenMP) "forall" loop variants.
+// To use another policy in all such loops, change this typedef.
+//
+typedef simd_exec exec_policy;
+
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/LCALSStats.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSStats.hxx
@@ -0,0 +1,306 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file defining routines and structures to gather and report
+// LCALS loop suite execution information.
+//
+
+#ifndef LCALSStats_HXX
+#define LCALSStats_HXX
+
+#include "LCALSParams.hxx"
+
+#include <string>    // std::string in the report/allocation interfaces
+#include <vector>    // std::vector members of LoopStat / LoopSuiteRunInfo
+#include <map>       // LoopSuiteRunInfo::LoopStatMap
+#include <iostream>  // std::ostream used by LoopStat::print
+
+#if defined(LCALS_USE_CYCLE)
+#include "cycle.h"
+typedef ticks LoopTime;
+
+#elif defined(LCALS_USE_CLOCK)
+#include <time.h>
+typedef clock_t LoopTime;
+
+#else
+#error LCALS_TIMER_TYPE is undefined!
+
+#endif
+
+
+class LoopStat;
+
+//
+// Loop timing should follow the implementation described here.
+// (See files containing loop implementations for details.)
+//
+// 1) Execute loop (identified by integer variable "iloop"):
+//
+//       LoopTimer ltimer;
+//
+//       flushCache();
+//
+//       TIMER_START(ltimer);
+//
+//       ...CODE FOR LOOP "iloop" GOES HERE...
+//
+//       TIMER_STOP(ltimer);
+//
+// 2) At some point after loop is run, but before next loop in file is run,
+//    copy timing information to appropriate loop stat object:
+//
+//       copyTimer(loop_stat, ilength, ltimer);
+//
+
+struct LoopTimer
+{
+   LoopTime start;
+   LoopTime stop;
+   bool was_run;   // set by TIMER_STOP; copyTimer() ignores timers where it is false
+
+   LoopTimer() : start(0), stop(0), was_run(false) { ; }
+};
+
+void flushCache();
+void copyTimer(LoopStat& loop_stat, int ilength,
+               const LoopTimer& loop_timer);
+
+#if defined(LCALS_USE_CYCLE)
+// Both timer variants must set was_run, otherwise copyTimer() drops the sample.
+#define TIMER_START(lt) lt.start = getticks();
+#define TIMER_STOP(lt)  lt.stop = getticks(); \
+                        lt.was_run = true;
+#elif defined(LCALS_USE_CLOCK)
+
+#define TIMER_START(lt) lt.start = clock();
+#define TIMER_STOP(lt)  lt.stop = clock(); \
+                        lt.was_run = true;
+
+#else
+#error LCALS_TIMER_TYPE is undefined!
+
+#endif
+
+
+
+//////////////////////////////////////////////////////////////////
+//
+// Routines to set up loop data, run loops, generate output, etc.
+//
+//////////////////////////////////////////////////////////////////
+
+//
+// Forward declarations for classes defined below.
+//
+class LoopStat;
+class LoopSuiteRunInfo;
+
+//
+// Routines for accessing loop suite run info.
+//
+LoopSuiteRunInfo& getLoopSuiteRunInfo();
+
+
+//
+// Routine to allocate and setup basic structures used to run loop suite
+// and free them when done.
+//
+void allocateLoopSuiteRunInfo(const std::string& host_name,
+                              unsigned num_loops,
+                              unsigned num_loop_lengths,
+                              unsigned num_suite_passes,
+                              bool run_loop_length[],
+                              CacheIndex_type cache_size);
+void freeLoopSuiteRunInfo();
+
+
+//
+// Routine to generate loop execution timing report.
+//
+// Also write output files if non-empty directory name is given.
+//
+void generateTimingReport(const std::vector< std::string >& run_loop_variants,
+                          const std::string& output_dirname);
+
+//
+// Routine to generate report about loop checksums.
+//
+// Also write output files if non-empty directory name is given.
+//
+void generateChecksumReport(const std::vector< std::string >& run_loop_variants,
+                            const std::string& output_dirname);
+
+//
+// Routine to generate FOM report.
+//
+// Also write output files if non-empty directory name is given.
+//
+void generateFOMReport(const std::vector< std::string >& run_loop_variants,
+                       const std::string& output_dirname);
+
+
+
+//////////////////////////////////////////////////////////////////
+//
+// Structures holding parameters defining execution of loop suite
+// and loop timing statistic information.
+//
+//////////////////////////////////////////////////////////////////
+
+class LoopStat
+{
+public:
+
+   bool loop_is_run;
+
+   double loop_weight;
+
+   //
+   // The following vectors are indexed by loop length ID.
+   //
+   // The second vector index for loop_run_time
+   // is number of suite pass.
+   //
+
+   std::vector< std::vector< long double > > loop_run_time;
+   std::vector< unsigned long > loop_run_count;
+
+   std::vector< long double > mean;
+   std::vector< long double > std_dev;
+   std::vector< long double > min;
+   std::vector< long double > max;
+   std::vector< long double > harm_mean;
+   std::vector< long double > meanrel2ref;
+
+   std::vector< int > loop_length;
+   std::vector< int > samples_per_pass;
+
+   std::vector< long double > loop_chksum;
+
+   explicit LoopStat(unsigned num_loop_lengths)
+   : loop_is_run(false),
+     loop_weight(0.0),
+     loop_run_time(num_loop_lengths),
+     loop_run_count(num_loop_lengths, 0),
+     mean(num_loop_lengths, 0.0),
+     std_dev(num_loop_lengths, 0.0),
+     min(num_loop_lengths, 0.0),
+     max(num_loop_lengths, 0.0),
+     harm_mean(num_loop_lengths, 0.0),
+     meanrel2ref(num_loop_lengths, 0.0),
+     loop_length(num_loop_lengths, 0),
+     samples_per_pass(num_loop_lengths, 0),
+     loop_chksum(num_loop_lengths, 0.0)
+   { ; }
+
+   //
+   // Print routine for debugging.
+   //
+   void print(std::ostream& os) const;
+
+private:
+   //
+   // The following methods are not implemented.
+   //
+   LoopStat();
+};
+
+class LoopSuiteRunInfo
+{
+public:
+
+   std::string host_name;
+
+   //
+   // The following vectors are indexed by loop ID.
+   //
+   unsigned num_loops;
+   std::vector< std::string > loop_names;
+
+   //
+   // The following vectors are indexed by loop length ID.
+   //
+   unsigned num_loop_lengths;
+   std::vector< bool > run_loop_length;
+   std::vector< std::string > loop_length_names;
+
+   unsigned num_suite_passes;
+   double loop_samp_frac;
+
+   LoopStat ref_loop_stat;
+   //
+   // The following vectors are indexed by loop WeightGroup
+   //
+   std::vector< double > loop_weights;
+
+   //
+   // The following vectors are indexed first by loop variant
+   // (according to order in LoopStatMap, which is the same as
+   // run_loop_variants vector in main.cxx) and then by loop length.
+   // So we have NUM_LENGTHS values for each variant.
+   //
+   std::vector< std::vector< int > > num_loops_run;
+   std::vector< std::vector< long double > > tot_time;
+   std::vector< std::vector< long double > > fom_rel;
+   std::vector< std::vector< long double > > fom_rate;
+
+
+   CacheIndex_type cache_flush_data_len;
+   double* cache_flush_data;
+   long double cache_flush_data_sum;
+
+   LoopSuiteRunInfo()
+   : num_loops(0),                 // initializers follow member declaration order
+     num_loop_lengths(0),
+     num_suite_passes(0),
+     loop_samp_frac(0.0),
+     ref_loop_stat(static_cast<unsigned>(0)),
+     cache_flush_data_len(0),
+     cache_flush_data(0),
+     cache_flush_data_sum(0.0)
+   { ; }
+
+
+   typedef std::map< std::string, std::vector<LoopStat>* > LoopStatMap;
+
+   ~LoopSuiteRunInfo()
+   {
+      LoopStatMap::iterator lsi = loop_test_stats.begin();
+      for ( ; lsi != loop_test_stats.end(); ++ lsi ) {
+         delete (*lsi).second;
+      }
+   }
+
+   //
+   // Add vector of loop stats for loop test with given name.
+   //
+   void addLoopStats(const std::string& name)
+   {
+      std::vector<LoopStat>*& stat_vec = loop_test_stats[name];  // null if name is new
+      if ( stat_vec == 0 ) { stat_vec = new std::vector<LoopStat>(); }  // no leak on repeat
+   }
+
+   //
+   // Return reference to vector of loop stats for loop test with given name.
+   //
+   std::vector<LoopStat>& getLoopStats( const std::string& name )
+   {
+      LoopStatMap::iterator lsi = loop_test_stats.find(name);  // name must have been added
+      return( *( (*lsi).second ) );
+   }
+
+private:
+   //
+   // The following methods are not implemented.
+   //
+   LoopSuiteRunInfo(const LoopSuiteRunInfo&);
+   LoopSuiteRunInfo& operator=(const LoopSuiteRunInfo&);
+
+   LoopStatMap loop_test_stats;
+};
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/LCALSStats.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSStats.cxx
@@ -0,0 +1,1024 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing routines used to gather and report
+// performance data for LCALS suite
+// NOTE(review): the eight #include lines below have lost their header names; restore them.
+
+#include "LCALSStats.hxx"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+
+//
+// LoopStat print routine for debugging.
+//
+void LoopStat::print(ostream& os) const
+{
+   os << "\nLoopStat::print..." << endl;
+   os << "\tloop_is_run = " << loop_is_run << endl;
+   os << "\tnum loop lengths = " << loop_length.size() << endl;
+   for (unsigned i = 0; i < loop_length.size(); ++i) {
+      os << "\t\t ilength = " << i << " --> " << endl;
+      os << "\t\t\t loop_length = " << loop_length[i] << endl;
+      os << "\t\t\t samples_per_pass = " << samples_per_pass[i] << endl;
+      os << "\t\t\t loop_run_count = " << loop_run_count[i] << endl;
+      if ( loop_run_count[i] > 0 ) {
+         for (unsigned j = 0; j < loop_run_time[i].size(); ++j) {
+            os << "\t\t\t\t sample time = " << loop_run_time[i][j] << endl;
+         }
+         os << "\t\t\t\t mean = " << mean[i] << endl;
+         os << "\t\t\t\t std_dev = " << std_dev[i] << endl;
+         os << "\t\t\t\t min = " << min[i] << endl;
+         os << "\t\t\t\t max = " << max[i] << endl;
+         os << "\t\t\t\t harm_mean = " << harm_mean[i] << endl;
+         os << "\t\t\t\t meanrel2ref = " << meanrel2ref[i] << endl;
+         os << endl;
+         for (unsigned j = 0; j < loop_run_time[i].size(); ++j) {
+            os << "\t\t\t\t sample time = " << loop_run_time[i][j] << endl;
+         }
+      }
+   }
+   os << endl;
+}
+
+
+//
+// File scope data holding structures needed to execute and time loops.
+//
+static LoopSuiteRunInfo* s_loop_suite_run_info = 0;
+
+//
+// Accessor routine for suite run info.
+// Dereferences the file-scope pointer, so allocateLoopSuiteRunInfo() must run first.
+LoopSuiteRunInfo& getLoopSuiteRunInfo() { return *s_loop_suite_run_info; }
+
+
+//
+// Define how suite will run and initialize timing structures for loops.
+//
+void allocateLoopSuiteRunInfo(const string& host_name,
+                              unsigned num_loops,
+                              unsigned num_loop_lengths,
+                              unsigned num_suite_passes,
+                              bool run_loop_length[],
+                              CacheIndex_type cache_size)
+{
+#ifdef TESTSUITE
+   cout << "\n allocateLoopSuiteRunInfo..." << endl;
+#endif
+   if ( s_loop_suite_run_info == 0 ) {
+      s_loop_suite_run_info = new LoopSuiteRunInfo();
+   }
+
+   s_loop_suite_run_info->host_name = host_name;
+
+   s_loop_suite_run_info->num_loops = num_loops;
+   s_loop_suite_run_info->num_loop_lengths = num_loop_lengths;
+   s_loop_suite_run_info->num_suite_passes = num_suite_passes;
+   // assign() rather than push_back(): a repeated call replaces the flags
+   // from an earlier call instead of appending to them.
+   s_loop_suite_run_info->run_loop_length.assign(
+      run_loop_length, run_loop_length + num_loop_lengths);
+
+   //
+   // To make sure all data cache levels are flushed completely, we define a
+   // buffer of doubles whose byte size is twice the given cache size. A buffer
+   // left by an earlier call is released first (delete [] of null is a no-op).
+   delete [] s_loop_suite_run_info->cache_flush_data;
+   s_loop_suite_run_info->cache_flush_data_len = (cache_size*2)/sizeof(double);
+   s_loop_suite_run_info->cache_flush_data =
+      new double[s_loop_suite_run_info->cache_flush_data_len];
+   for (CacheIndex_type i = 0;
+        i < s_loop_suite_run_info->cache_flush_data_len; ++i) {
+      s_loop_suite_run_info->cache_flush_data[i] = drand48() + 0.1;
+   }
+
+}
+
+//
+// Free data structures defining loop suite execution.
+//
+void freeLoopSuiteRunInfo()
+{
+   if ( s_loop_suite_run_info ) {
+      if ( s_loop_suite_run_info->cache_flush_data ) {
+         delete [] s_loop_suite_run_info->cache_flush_data;
+      }
+      delete s_loop_suite_run_info;
+      s_loop_suite_run_info = 0;   // allows a later allocateLoopSuiteRunInfo() call
+   }
+}
+
+//////////////////////////////////////////////////////////////////
+//
+// Routines used for loop timing...
+// +////////////////////////////////////////////////////////////////// + +// +// Flush cache before each loop is run to minimize impact of one +// loop on another's execution. +// +void flushCache() +{ + for (CacheIndex_type i = 0; + i < s_loop_suite_run_info->cache_flush_data_len; ++i) { + s_loop_suite_run_info->cache_flush_data_sum += + s_loop_suite_run_info->cache_flush_data[i]; + } + s_loop_suite_run_info->cache_flush_data_sum /= + s_loop_suite_run_info->cache_flush_data_len; +} + + +// +// Copy loop run time to LoopStat. +// +void copyTimer(LoopStat& loop_stat, int ilength, + const LoopTimer& loop_timer) +{ + if ( loop_timer.was_run ) { +#if defined(LCALS_USE_CYCLE) + long double run_time = elapsed(loop_timer.stop, + loop_timer.start); +#elif defined(LCALS_USE_CLOCK) + long double run_time = + static_cast(loop_timer.stop - + loop_timer.start) / CLOCKS_PER_SEC; +#else +#error LCALS_TIMER_TYPE is undefined! + +#endif + loop_stat.loop_run_time[ilength].push_back( run_time ); + } +} + + + +// +// Compute statistics for loop run for variant with given index. +// +void computeStats( unsigned ilv, vector& loop_stats, + bool do_fom ) +{ + + // compute stats for each loop... + for ( unsigned iloop = 0; iloop < loop_stats.size(); ++iloop ) { + + LoopStat& stat = loop_stats[iloop]; + + if ( stat.loop_is_run ) { + + // compute stats for each length loop is run... 
+ for ( unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen ) { + + if ( stat.loop_run_count[ilen] > 0 ) { + + vector& time_sample = + stat.loop_run_time[ilen]; + unsigned sample_size = time_sample.size(); + + long double mean = 0.0; + long double sdev = 0.0; + long double max = -std::numeric_limits::max(); + long double min = std::numeric_limits::max(); + long double harm = 0.0; + + for (unsigned is = 0; is < sample_size; ++is) { + mean += time_sample[is]; + max = std::max(max, time_sample[is]); + min = std::min(min, time_sample[is]); + if ( time_sample[is] > 0.0 ) { + harm += 1.0/time_sample[is]; + } + } + + mean /= sample_size; + + if ( harm > 0.0 ) { harm = sample_size/harm; } + + for (unsigned is = 0; is < sample_size; ++is) { + sdev += (time_sample[is] - mean)*(time_sample[is] - mean); + } + + sdev /= sample_size; + + stat.mean[ilen] = mean; + stat.std_dev[ilen] = sdev; + stat.min[ilen] = min; + stat.max[ilen] = max; + stat.harm_mean[ilen] = harm; + + } // if loop length was run + + } // iterate over loop lengths + + } // if loop is run + + } // iterate over loops + + // + // FOM calculations (done separately for simplicity) + // + if ( do_fom ) { + + LoopSuiteRunInfo& suite_info = getLoopSuiteRunInfo(); + LoopStat& ref_loop_stat = suite_info.ref_loop_stat; + + std::vector num_loops_run(suite_info.num_loop_lengths, 0); + std::vector< long double > tot_weight(suite_info.num_loop_lengths, 0.0); + std::vector< long double > tot_time(suite_info.num_loop_lengths, 0.0); + std::vector< long double > fom_rel(suite_info.num_loop_lengths, 0.0); + std::vector< long double > fom_rate(suite_info.num_loop_lengths, 0.0); + + for ( unsigned iloop = 0; iloop < loop_stats.size(); ++iloop ) { + + LoopStat& stat = loop_stats[iloop]; + + if ( stat.loop_is_run ) { + + for ( unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen ) { + + if ( stat.loop_run_count[ilen] > 0 ) { + + num_loops_run[ilen]++; + tot_weight[ilen] += stat.loop_weight; + tot_time[ilen] += 
stat.mean[ilen]; + + // + // sum weighted loop time + // + fom_rel[ilen] += stat.loop_weight * stat.mean[ilen]; + + // + // sum weighted loop iteration rate + // + fom_rate[ilen] += (stat.loop_weight * stat.mean[ilen]) / + (stat.loop_length[ilen] * stat.samples_per_pass[ilen]); + + } // if loop length was run + + } // iterate over loop lengths + + } // if loop is run + + } // iterate over loops + + for (unsigned ilen = 0; ilen < suite_info.num_loop_lengths; ++ilen) { + suite_info.num_loops_run[ilv][ilen] = num_loops_run[ilen]; + suite_info.tot_time[ilv][ilen] = tot_time[ilen]; + + long double ref_time = ref_loop_stat.loop_run_time[ilen][0]; + + if ( num_loops_run[ilen] > 0 ) { +#if 0 // this makes 0 <= fom_rel <= 1/tot_time + suite_info.fom_rel[ilv][ilen] = + ref_time * tot_weight[ilen] / ( tot_time[ilen] * fom_rel[ilen] ); +#else // this makes 0 <= fom_rel <= 1 + suite_info.fom_rel[ilv][ilen] = + ref_time * tot_weight[ilen] / fom_rel[ilen] ; +#endif + suite_info.fom_rate[ilv][ilen] = 1.0 / fom_rate[ilen]; + } + } + + } + +} + + +// +// Forward declarations for routines that write loop reports. +// +namespace { + + void writeTimingSummaryReport( + const vector< string >& run_loop_variants, + ostream& os); + + void writeChecksumReport( + const vector< string >& run_loop_variants, + ostream& os); + + void writeFOMReport( + const vector< string >& run_loop_variants, + ostream& os); + + void writeMeanTimeReport(const string& variant_name, + const string& output_dirname); + + void writeRelativeTimeReport(const string& variant_name, + const string& output_dirname); + + std::string buildVersionInfo(); + +}; // unnamed namespace + + +// +// Routine called from main() to generate timing report(s). +// +void generateTimingReport(const vector< string >& run_loop_variants, + const string& output_dirname) +{ + if ( run_loop_variants.size() == 0 ) return; + + bool do_fom = true; + + std::string ver_info = buildVersionInfo(); + + // + // Compute statistics for all loops. 
+ // + LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo(); + const unsigned nvariants = run_loop_variants.size(); + for (unsigned ilv = 0; ilv < nvariants; ++ilv) { + computeStats( ilv, suite_run_info.getLoopStats(run_loop_variants[ilv]), + do_fom ); + } + + // + // If output directory name is given, write files in that directory. + // Else, write only summary to standard output. + // + if (!output_dirname.empty()) { + + string timing_fname(output_dirname + "/" + "timing.txt"); + ofstream file(timing_fname.c_str(), ios::out | ios::trunc); + if ( !file ) { + cout << " ERROR: Can't open output file " + << timing_fname << endl; + } + cout << "\n writeTimingSummaryReport... " << timing_fname << endl; + writeTimingSummaryReport(run_loop_variants, file); + + // + // Write mean run time file for each loop variant. + // + for (unsigned ilv = 0; ilv < nvariants; ++ilv) { + writeMeanTimeReport( run_loop_variants[ilv], output_dirname ); + } + + // + // Write relative run time file for each loop variant. + // + // NOTE: We assume variant "zero" is reference. + // + for (unsigned ilv = 1; ilv < nvariants; ++ilv) { + writeRelativeTimeReport( run_loop_variants[ilv], output_dirname ); + } + + } else { + + writeTimingSummaryReport(run_loop_variants, cout); + + } +} + +// +// Routine called from main() to generate checksum report. +// +void generateChecksumReport( + const vector< string >& run_loop_variants, + const string& output_dirname) +{ +#if defined(LCALS_VERIFY_CHECKSUM) + if ( run_loop_variants.size() == 0 ) return; + + // + // If output directory name is given, write file in that directory. + // Else, write summary to standard output. + // + if (!output_dirname.empty()) { + string checksum_fname(output_dirname + "/" + "checksum.txt"); + ofstream file(checksum_fname.c_str(), ios::out | ios::trunc); + if ( !file ) { + cout << " ERROR: Can't open output file " + << checksum_fname << endl; + } + cout << "\n writeChecksumReport... 
" << checksum_fname << endl; + writeChecksumReport(run_loop_variants, file); + } else { + writeChecksumReport(run_loop_variants, cout); + } +#endif +} + +// +// Routine called from main() to generate FOM report. +// +void generateFOMReport( + const vector< string >& run_loop_variants, + const string& output_dirname) +{ + if ( run_loop_variants.size() == 0 ) return; + + // + // If output directory name is given, write file in that directory. + // Else, write only summary to standard output. + // + if (!output_dirname.empty()) { + string fom_fname(output_dirname + "/" + "fom.txt"); + ofstream file(fom_fname.c_str(), ios::out | ios::trunc); + if ( !file ) { + cout << " ERROR: Can't open output file " + << fom_fname << endl; + } + cout << "\n writeFOMReport... " << fom_fname << endl; + writeFOMReport(run_loop_variants, file); + } else { + writeFOMReport(run_loop_variants, cout); + } +} + + +// +// Implementation of file-scope routines that write loop reports. +// +namespace { + +// +// Write report about loop execution timings to given output stream. +// +void writeTimingSummaryReport(const vector< string >& run_loop_variants, + ostream& os) +{ + LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo(); + const unsigned nvariants = run_loop_variants.size(); + + const string& ref_variant = run_loop_variants[0]; + vector& loop_names = suite_run_info.loop_names; + + // + // Define some strings used to print summary table. 
+ // + string equal_line("===========================================================================================================\n"); + string dash_line("------------------------------------------------------------------------------------------------------------\n"); + string dash_line_part("-------------------------------------------------------\n"); + string dot_line_part("............................................\n"); + vector len_id(suite_run_info.loop_length_names.size()); + for (unsigned ilen = 0; ilen < len_id.size(); ++ilen) { + len_id[ilen] = suite_run_info.loop_length_names[ilen][0]; + } + + std::string ver_info = buildVersionInfo(); + + // + // Print compilation summary information. + // + os << "\n\n\n"; + os << equal_line; + os << equal_line; + + os << "LCALS compilation summary: " << endl; + os << ver_info << endl; + + // + // Print basic run summary information. + // + os << "\n\n"; + os << equal_line; + os << equal_line; + + os << "LCALS run summary: " << endl; + os << "sizeof(Real_type) = " << sizeof(Real_type) << endl; + os << " num suite passes = " << suite_run_info.num_suite_passes << endl; + os << " loop sample fraction = " << suite_run_info.loop_samp_frac << endl; + os << " loop variants run : "; + for (unsigned ilv = 0; ilv < nvariants; ++ilv) { + string last_char; + if ( ilv+1 < run_loop_variants.size() ) last_char = string(" , "); + os << run_loop_variants[ilv] << last_char; + } + os << "\n reference variant : " << ref_variant << endl; + os << equal_line; + os << equal_line; + + // + // Set basic table formatting. 
+ // + size_t max_name_len = 0; + for (size_t iloop = 0; iloop < loop_names.size(); ++iloop) { + max_name_len = max(max_name_len, loop_names[iloop].size()); + } + + size_t max_var_name_len = 0; + for (size_t ilv = 0; ilv < nvariants; ++ilv) { + max_var_name_len = + max(max_var_name_len, run_loop_variants[ilv].size()); + } + + string var_field("Variant(length id)"); + size_t var_field_len = var_field.size(); + unsigned prec = 10; + unsigned prec_buf = prec + 8; + unsigned reldiff_prec = 6; + + // + // Print table column headers. + // + os << "Loop name(Loop ID) --> :(length, samples/pass), etc." + << endl; + os < ref_mean(ref_variant_stat.mean); + + if ( !loop_names[iloop].empty() && ref_variant_stat.loop_is_run ) { + + if ( iloop > 1 ) { // magic numbers are bad!! + os << endl << dash_line_part; + } + os < "; + + for (unsigned ilv = 0; ilv < nvariants; ++ilv) { + + LoopStat& stat = suite_run_info. + getLoopStats(run_loop_variants[ilv])[iloop]; + + if ( stat.loop_is_run ) { + + // + // Print separator line for new loop or new variant. + // + if ( ilv == 0 ) { + + for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) { + os << " " << len_id[ilen] << ":(" + << stat.loop_length[ilen] << ", " + << stat.samples_per_pass[ilen] << ")"; + } + os << endl; + + } else { + + os << dot_line_part; + + } + + // + // Print statistics for each length of loop run. 
+ // + for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) { + + if ( stat.loop_run_count[ilen] > 0 ) { + + string var_string(run_loop_variants[ilv] + + "(" + len_id[ilen] + ")"); + + os << showpoint << setprecision(prec) + < 0 ) { + // compare mean run time to reference variant + + long double rel_mean_diff = 0; + if ( ref_mean[ilen] != 0.0 ) { + rel_mean_diff = 1.0 + + (stat.mean[ilen]-ref_mean[ilen])/ref_mean[ilen]; + } + os <& run_loop_variants, + ostream& os) +{ + LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo(); + const unsigned nvariants = run_loop_variants.size(); + const string& ref_variant = run_loop_variants[0]; + vector& loop_names = suite_run_info.loop_names; + + // + // Define some strings used to print summary table. + // + string equal_line("===========================================================================================================\n"); + string dash_line("------------------------------------------------------------------------------------------------------------\n"); + string dash_line_part("-------------------------------------------------------\n"); + string dot_line_part("............................................\n"); + vector len_id(suite_run_info.loop_length_names.size()); + for (unsigned ilen = 0; ilen < len_id.size(); ++ilen) { + len_id[ilen] = suite_run_info.loop_length_names[ilen][0]; + } + + std::string ver_info = buildVersionInfo(); + + // + // Print compilation summary information. + // + os << "\n\n\n"; + os << equal_line; + os << equal_line; + + os << "LCALS compilation summary: " << endl; + os << ver_info << endl; + + // + // Print checksum information. + // + os << "\n\n"; + os << equal_line; + os << equal_line; + + // + // Set basic table formatting. 
+ // + size_t max_name_len = 0; + for (size_t iloop = 0; iloop < loop_names.size(); ++iloop) { + max_name_len = max(max_name_len, loop_names[iloop].size()); + } + + size_t max_var_name_len = 0; + for (size_t ilv = 0; ilv < nvariants; ++ilv) { + max_var_name_len = + max(max_var_name_len, run_loop_variants[ilv].size()); + } + + string var_field("Variant(length #)"); + size_t var_field_len = var_field.size(); + unsigned prec = 32; + unsigned prec_buf = prec + 8; + + // + // Print table column headers. + // + os << "Loop name -->" << endl; + os < ref_chksum(ref_variant_stat.loop_chksum); + + if ( !loop_names[iloop].empty() && ref_variant_stat.loop_is_run ) { + + if ( iloop > 1 ) { // magic numbers are bad!! + os << endl << dash_line_part; + } + os < "; + + for (unsigned ilv = 0; ilv < nvariants; ++ilv) { + + LoopStat& stat = suite_run_info. + getLoopStats(run_loop_variants[ilv])[iloop]; + + if ( stat.loop_is_run ) { + + // + // Print separator line for new loop or new variant. + // + if ( ilv == 0 ) { + os << endl; + } else { + os << dot_line_part; + } + + // + // Print checksum for each length of loop run. + // + for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) { + + if ( stat.loop_run_count[ilen] > 0 ) { + + string var_string(run_loop_variants[ilv] + + "(" + len_id[ilen] + ")"); + + os << showpoint << setprecision(prec) + < 0 ) { + // compare checksum to reference variant + long double chksum_diff = fabs( + stat.loop_chksum[ilen]-ref_chksum[ilen] ); + os <& run_loop_variants, + ostream& os) +{ + LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo(); + const unsigned nvariants = run_loop_variants.size(); + + // + // Define some strings used to print FOM summary table. 
+ // + string equal_line("===========================================================================================================\n"); + string dash_line_part("-------------------------------------------------------\n"); + string dot_line_part("............................................\n"); + + std::string ver_info = buildVersionInfo(); + + // + // Print compilation summary information. + // + os << "\n\n\n"; + os << equal_line; + os << equal_line; + + os << "LCALS compilation summary: " << endl; + os << ver_info << endl; + + // + // Print checksum information. + // + os << "\n\n"; + os << equal_line; + os << equal_line; + + os << "LCALS FOM results: " << endl; + os << equal_line; + + vector& len_name = suite_run_info.loop_length_names; + + unsigned prec = 32; + // + // Output FOM for each loop variant (and loop lengths) + // + for (unsigned ilv = 0; ilv < nvariants; ++ilv) { + + vector< int >& num_loops_run = suite_run_info.num_loops_run[ilv]; + vector< long double >& tot_time = suite_run_info.tot_time[ilv]; + vector< long double >& fom_rel = suite_run_info.fom_rel[ilv]; + vector< long double >& fom_rate = suite_run_info.fom_rate[ilv]; + + os <& loop_names = suite_run_info.loop_names; + vector& len_names = suite_run_info.loop_length_names; + + const string sepchr(" , "); + unsigned prec = 8; + + // + // Print title line. + // + file << variant_name << " Mean Run Times "; + for (unsigned i = 0; i < len_names.size(); ++i) { + file << sepchr; + } + file << endl; + + // + // Print column header line. + // + for (unsigned i = 0; i < len_names.size(); ++i) { + file << sepchr << len_names[i]; + } + file << endl; + + // + // Print row of times for each loop. + // + for (unsigned iloop = 0; iloop < loop_names.size(); ++iloop) { + + LoopStat& stat = suite_run_info. 
+ getLoopStats(variant_name)[iloop]; + + if ( !loop_names[iloop].empty() && stat.loop_is_run ) { + + file << loop_names[iloop]; + for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) { + file << sepchr << setprecision(prec) << stat.mean[ilen]; + } + file << endl; + + } + + } + + file.flush(); +} + +// +// Write relative run time report file. +// +void writeRelativeTimeReport(const string& variant_name, + const string& output_dirname) +{ + string rept_fname(output_dirname + "/"); + rept_fname += variant_name; + rept_fname += string("-reltime.txt"); + + ofstream file(rept_fname.c_str(), ios::out | ios::trunc); + if ( !file ) { + cout << " ERROR: Can't open output file " << rept_fname << endl; + } + cout << "\n writeRelativeTimeReport... " << rept_fname << endl; + + LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo(); + + vector& loop_names = suite_run_info.loop_names; + vector& len_names = suite_run_info.loop_length_names; + + const string sepchr(" , "); + unsigned prec = 6; + + // + // Print title line. + // + file << variant_name << " Relative Run Times "; + for (unsigned i = 0; i < len_names.size(); ++i) { + file << sepchr; + } + file << endl; + + // + // Print column header line. + // + for (unsigned i = 0; i < len_names.size(); ++i) { + file << sepchr << len_names[i]; + } + file << endl; + + // + // Print row of times for each loop. + // + for (unsigned iloop = 0; iloop < loop_names.size(); ++iloop) { + + LoopStat& stat = suite_run_info. + getLoopStats(variant_name)[iloop]; + + if ( !loop_names[iloop].empty() && stat.loop_is_run ) { + + file << loop_names[iloop]; + for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) { + file << sepchr << setprecision(prec) << stat.meanrel2ref[ilen]; + } + file << endl; + + } + + } + + file.flush(); +} + +// +// Build string containing LCALS compilation information from +// file created when make is invoked. 
+// +std::string buildVersionInfo() +{ + std::ifstream infile("lcalsversioninfo.txt", std::ios::in); + + std::string ver_info; + + infile.seekg(0, std::ios::end); + ver_info.reserve(infile.tellg()); + infile.seekg(0, std::ios::beg); + + ver_info.assign((std::istreambuf_iterator(infile)), + std::istreambuf_iterator()); + infile.close(); + +#if 0 + std::string ver_info = "LCALS compilation info: \n" + << "\tUser = " << VER_PERSON << "\n" + << "\tDate, Time = " << VER_DATE << " , " << VER_TIME << "\n" + << "\tMachine = " << VER_MACHINE << "\n" + << "\tOS = " << VER_OS << "\n" + << "\t-----------------------------------------------" << "\n" + << "\tCompiler + options = " << lcals_ver_info_values[0] << "\n" + << "\tLCALS rules (defines) = " << lcals_ver_info_values[1] << "\n"; +#endif + return ver_info; +} + +}; // unnamed namespace + + + Index: MicroBenchmarks/LCALS/LCALSSuite.hxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/LCALSSuite.hxx @@ -0,0 +1,586 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Header file with enums, macros, routines and structures used to +// compile and run loops in LCALS suite and to generate execution +// statistics. +// + +#ifndef LCALSSuite_HXX +#define LCALSSuite_HXX + +#include "LCALSParams.hxx" +#include "LCALSStats.hxx" + +#include +#include + + +// +// Enumeration defining unique id for each loop KERNEL in suite. +// +// IMPORTANT: Generally, this should not need modification unless +// new loops (i.e., kernels) are added to the suite. +// +// Note: To keep output understandable, keep this consistent with +// routine defineLoopSuiteRunInfo(). +// +enum LoopKernelID { + + // Keep this one first and don't comment out (!!) + // This insures loop ids start at zero so all array indexing + // or data structures is correct. Also, this loop is not + // executed the same way the others are. 
+ REF_LOOP = 0, + + // + // Loop Subset A: Loops extracted from LLNL app codes. + // They are implemented in runALoops.cxx files. + // + PRESSURE_CALC, + PRESSURE_CALC_ALT, + ENERGY_CALC, + ENERGY_CALC_ALT, + VOL3D_CALC, + DEL_DOT_VEC_2D, + COUPLE, + FIR, + + // + // Loop Subset B: "Basic" Loops. + // They are implemented in runBLoops.cxx files. + // + INIT3, + MULADDSUB, + IF_QUAD, + TRAP_INT, + + // + // Loop Subset C: Loops from older Livermore Loops in "C" suite. + // They are implemented in runCLoops.cxx files. + // + HYDRO_1D, + ICCG, + INNER_PROD, + BAND_LIN_EQ, + TRIDIAG_ELIM, + EOS, + ADI, + INT_PREDICT, + DIFF_PREDICT, + FIRST_SUM, + FIRST_DIFF, + PIC_2D, + PIC_1D, + HYDRO_2D, + GEN_LIN_RECUR, + DISC_ORD, + MAT_X_MAT, + PLANCKIAN, + IMP_HYDRO_2D, + FIND_FIRST_MIN, + + NUM_LOOP_KERNELS // Keep this one last and NEVER comment out (!!) + +}; + + +// +// Enumeration defining unique id for each loop VARIANT in suite. +// +// IMPORTANT: Generally, this should not need modification unless +// new loop variants are added to the suite. +// +enum LoopVariantID { + // + // These variants define LCALS benchmark + // + RAW, + RAW_OMP, + FORALL_LAMBDA, + FORALL_LAMBDA_OMP, + +#if defined(LCALS_DO_MISC) + + // + // These variants are used in miscellaneous LCALS studies + // + FORALL_HYBRID_LAMBDA, +#if 0 // THESE ARE NOT AVAILABLE YET!!! + FORALL_HYBRID_LAMBDA_OMP, +#endif + FORALL_FUNCTOR, + FORALL_FUNCTOR_OMP, +#if 0 // THESE ARE NOT AVAILABLE YET!!! + FORALL_HYBRID_FUNCTOR, + FORALL_HYBRID_FUNCTOR_OMP, +#endif + RAW_FUNC, + FORALL_LAMBDA_TYPEFIX, + FORALL_LAMBDA_OMP_TYPEFIX, + FORALL_HYBRID_LAMBDA_TYPEFIX, + +#endif // if LCALS_DO_MISC + +}; + + +// +// Enumeration defining possible loop lengths to run. +// +enum LoopLength { + + LONG = 0, + MEDIUM, + SHORT, + + NUM_LENGTHS // Keep this one last (!!) 
+ +}; + + +//////////////////////////////////////////////////////////////////////////////// +// +// The following macro constants define which loop VARIANTS can be compiled +// (and potentially) run for a given compiler. +// +// NOTE: The Makefile sets the LCALS_COMPILER_* macro constant. +// +// --> IMPORTANT: Actual selection of which loop variants are run is done +// in main.cxx via the vector 'run_variants'. +// +//////////////////////////////////////////////////////////////////////////////// + +#if defined(LCALS_COMPILER_ICC) +// +// Configuration options for Intel compilers +// + +#define COMPILE_RAW_VARIANTS +#define COMPILE_LAMBDA_VARIANTS +#define COMPILE_FUNCTOR_VARIANTS +#define COMPILE_OMP_VARIANTS + + +#elif defined(LCALS_COMPILER_GNU) +// +// Configuration options for GNU compilers +// + +#define COMPILE_RAW_VARIANTS +#define COMPILE_LAMBDA_VARIANTS +#define COMPILE_FUNCTOR_VARIANTS +#define COMPILE_OMP_VARIANTS + + +#elif defined(LCALS_COMPILER_XLC12) +// +// Configuration options for IBM xlC compilers +// + +// +// xlC compilers DO NOT support lambda functions currently!! +// +#define COMPILE_RAW_VARIANTS +#undef COMPILE_LAMBDA_VARIANTS +#define COMPILE_FUNCTOR_VARIANTS +#define COMPILE_OMP_VARIANTS + + +#elif defined(LCALS_COMPILER_CLANG) +// +// Configuration options for clang compilers +// + +// +// Clang compilers DO NOT support OpenMP currently!! +// +#define COMPILE_RAW_VARIANTS +#define COMPILE_LAMBDA_VARIANTS +#define COMPILE_FUNCTOR_VARIANTS +#undef COMPILE_OMP_VARIANTS + + +#else +#error LCALS compiler is undefined! + +#endif + + +// +// The following macro constants are used to turn on/off compilation of +// individual loop KERNELS in suite. Names are consistent with LoopID +// enum above. +// + +#if defined (LCALS_DO_OMP_ONLY) +// +// Only these loops have OpenMP implementations. The imlementations are +// found in runOMPLoops.cxx files. +// + +// Loop Subset A: Loops extracted from LLNL app codes. 
+#define COMPILE_PRESSURE_CALC +#define COMPILE_PRESSURE_CALC_ALT +#define COMPILE_ENERGY_CALC +#define COMPILE_ENERGY_CALC_ALT +#define COMPILE_VOL3D_CALC +#define COMPILE_DEL_DOT_VEC_2D +#define COMPILE_COUPLE +#define COMPILE_FIR + +// Loop Subset B: "Basic" Loops. +#define COMPILE_INIT3 +#define COMPILE_MULADDSUB +#define COMPILE_IF_QUAD +#define COMPILE_TRAP_INT + +// Loop Subset C: Loops from older Livermore Loops in "C" suite. +#define COMPILE_PIC_2D + +#else // compile all loop kernels +// +// Loop Subset A: Loops extracted from LLNL app codes. +// They are implemented in runALoops.cxx files. +// +#define COMPILE_PRESSURE_CALC +#define COMPILE_PRESSURE_CALC_ALT +#define COMPILE_ENERGY_CALC +#define COMPILE_ENERGY_CALC_ALT +#define COMPILE_VOL3D_CALC +#define COMPILE_DEL_DOT_VEC_2D +#define COMPILE_COUPLE +#define COMPILE_FIR + +// +// Loop Subset B: "Basic" Loops. +// They are implemented in runBLoops.cxx files. +// +#define COMPILE_INIT3 +#define COMPILE_MULADDSUB +#define COMPILE_IF_QUAD +#define COMPILE_TRAP_INT + +// +// Loop Subset C: Loops from older Livermore Loops in "C" suite. +// They are implemented in runLCKLoops.cxx files. +// +#define COMPILE_HYDRO_1D +#define COMPILE_ICCG +#define COMPILE_INNER_PROD +#define COMPILE_BAND_LIN_EQ +#define COMPILE_TRIDIAG_ELIM +#define COMPILE_EOS +#define COMPILE_ADI +#define COMPILE_INT_PREDICT +#define COMPILE_DIFF_PREDICT +#define COMPILE_FIRST_SUM +#define COMPILE_FIRST_DIFF +#define COMPILE_PIC_2D +#define COMPILE_PIC_1D +#define COMPILE_HYDRO_2D +#define COMPILE_GEN_LIN_RECUR +#define COMPILE_DISC_ORD +#define COMPILE_MAT_X_MAT +#define COMPILE_PLANCKIAN +#define COMPILE_IMP_HYDRO_2D +#define COMPILE_FIND_FIRST_MIN + +#endif + + + +////////////////////////////////////////////////////////////////// +// +// Structure holding double arrays and scalars used in loops. +// +// Note: These are initialized in allocateLoopData(). 
+// +/////////////////////////////////////////////////////////////////// + +struct LoopData +{ + + // + // Structures to hold data for easy reinitialization + // (useful for verifying result checksums, etc.) + // + struct RealArray + { + int id; + Real_ptr data; + Index_type len; + }; + + struct IndxArray + { + int id; + Index_type* data; + Index_type len; + }; + + struct ComplexArray + { + int id; + Complex_ptr data; + Index_type len; + }; + + + Index_type max_loop_length; + + // + // Static values indicating number of data arrays + // of various forms used in loop suite. + // + // NOTE: These number may need to change to accomodate new loops. + // Also, other arrays may need to be added. + // + static const unsigned s_num_1D_Real_arrays = 16; + static const unsigned s_num_1D_Nx4_Real_arrays = 2; + static const unsigned s_num_1D_Indx_arrays = 5; + static const unsigned s_num_1D_Complex_arrays = 5; + + static const unsigned s_num_2D_Nx25_Real_arrays = 4; + static const unsigned s_num_2D_7xN_Real_arrays = 11; + static const unsigned s_num_2D_64x64_Real_arrays = 1; + + static const unsigned s_num_3D_2xNx4_Real_arrays = 3; + + static const unsigned s_num_Real_scalars = 10; + + // + // NOTE: To see how the following data structures are related, + // please see the routine allocateLoopData() in the + // file LCALSSuite.cxx. + // + // The reason that we hold on to the same data in two + // different ways is two-fold: + // 1) The first set of arrays below makes it easy to + // access pointers to data based on what is used in + // each loop kernel; e.g., arrays of variaous dimensions. + // 2) The second set of arrays makes it easy to process + // arrays for (re)initialization and checksum + // computation to verify results; e.g., we simply + // iterate through 1-dim arrays without having to + // know their lengths, if they are really being used + // as 2- or 3-dimensional arrays, for example. + // + + // + // Data arrays and scalars used in loop execution. 
+ // + Real_ptr array_1D_Real[s_num_1D_Real_arrays]; + Real_ptr array_1D_Nx4_Real[s_num_1D_Nx4_Real_arrays]; + Index_type* array_1D_Indx[s_num_1D_Indx_arrays]; + Complex_ptr array_1D_Complex[s_num_1D_Complex_arrays]; + + Real_ptr* array_2D_Nx25_Real[s_num_2D_Nx25_Real_arrays]; + Real_ptr* array_2D_7xN_Real[s_num_2D_7xN_Real_arrays]; + Real_ptr* array_2D_64x64_Real[s_num_2D_64x64_Real_arrays]; + + Real_ptr** array_3D_2xNx4_Real[s_num_3D_2xNx4_Real_arrays]; + + Real_type scalar_Real[s_num_Real_scalars]; + + // + // Arrays of structs holding data arrays used for data initialization + // and checksum verification. + // + RealArray RealArray_1D[s_num_1D_Real_arrays]; + RealArray RealArray_1D_Nx4[s_num_1D_Nx4_Real_arrays]; + IndxArray IndxArray_1D[s_num_1D_Indx_arrays]; + ComplexArray ComplexArray_1D[s_num_1D_Complex_arrays]; + + RealArray RealArray_2D_Nx25[s_num_2D_Nx25_Real_arrays]; + RealArray RealArray_2D_7xN[s_num_2D_7xN_Real_arrays]; + RealArray RealArray_2D_64x64[s_num_2D_64x64_Real_arrays]; + + RealArray RealArray_3D_2xNx4[s_num_3D_2xNx4_Real_arrays]; + + RealArray RealArray_scalars; + +}; + +// +// Routine to access data structure that holds data needed to execute loops. +// +LoopData& getLoopData(); + + +// +// Routine that generates vector of loop variant names string +// from vector of LoopVariantID enum values. +// +std::vector getVariantNames( + const std::vector& lvids); + +// +// Routine that maps LoopVariantID enum value (used in main to help +// insure correctness) to string (used in loop framework for flexibility). +// +std::string getVariantName(LoopVariantID lvid); + + +////////////////////////////////////////////////////////////////// +// +// Routines to define how loop suite will be run and +// to set up data for loop suite. +// +////////////////////////////////////////////////////////////////// + +// +// Routines to define specific details about how to run loop suite. 
+// +// Note: Individual loop lengths and sampling parameters +// are defined in this routine. +// +void defineLoopSuiteRunInfo(const std::vector& run_variants, + bool run_loop[], + double sample_frac, + double loop_length_factor ); + + +// +// Routines to allocate and initialize arrays (and scalars) for +// loops in suite and to free those arrays when done. +// +void allocateLoopData(); +void freeLoopData(); + + +// +// Routines to initialize and finalize loop data, statistics, timers, etc. +// +// Each of these routines must be called before and after the execution +// of each loop. +// +void loopInit(unsigned iloop, LoopStat& stat); +void loopInit(unsigned iloop); //, LoopStat& stat); +// +void loopFinalize(unsigned iloop, LoopStat& stat, LoopLength ilength); + + +// +// Routines to run reference loops for figure of merit (FOM) calculations. +// +void defineReferenceLoopRunInfo(); +void computeReferenceLoopTimes(); + +// +// Routine called in main to execute loops corresponding to given +// variant ID and length. The run_loop boolean array indicates which +// loop kernels in suite to execute +// +void runLoopVariant( LoopVariantID lvid, + bool run_loop[], + LoopLength ilength ); + + +// +// Routines to run specific loop variants for suite. +// +// THESE SHOULD BE CALLED BY THE ROUTINE ABOVE, NOT DIRECTLY!!! +// +// loop_stats is vector of LoopStat objects corresponding to loop variant. +// run_loop boolean array indicates which loop kernels in suite to execute. +// ilength indicates which loop length to run (see LoopLength enum). 
+// +void runARawLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runBRawLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runCRawLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + +void runARawFuncLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runBRawFuncLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runCRawFuncLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + +void runOMPRawLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + + +void runAForallLambdaLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runBForallLambdaLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runCForallLambdaLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runOMPForallLambdaLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + +void runAForallLambdaLoops_TYPEFIX( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runBForallLambdaLoops_TYPEFIX( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runCForallLambdaLoops_TYPEFIX( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runOMPForallLambdaLoops_TYPEFIX( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + + +void runAForallFunctorLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runBForallFunctorLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runCForallFunctorLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runOMPForallFunctorLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + +void runAForallHybridLambdaLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runBForallHybridLambdaLoops( 
std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runCForallHybridLambdaLoops( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + +void runAForallHybridLambdaLoops_TYPEFIX( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runBForallHybridLambdaLoops_TYPEFIX( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); +void runCForallHybridLambdaLoops_TYPEFIX( std::vector& loop_stats, + bool run_loop[], + LoopLength ilength ); + +// +// Recursively construct directories based on a relative or +// absolute path name. Return true if directory created +// successfully, else false. +// +bool recursiveMkdir(const std::string& path); + + + + +#endif // closing endif for header file include guard Index: MicroBenchmarks/LCALS/LCALSSuite.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/LCALSSuite.cxx @@ -0,0 +1,2519 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file with routines to allocate data for LCALS suite +// and define parameters controlling execution of each loop. 
+// + +#include "LCALSSuite.hxx" +#include "LCALSStats.hxx" + +#include "SubsetDataA.hxx" + +#include +#include +#include + +#include +#include + +//#define LCALS_OMP_MEM_INIT +#undef LCALS_OMP_MEM_INIT + +// +// File scope data holding structures used in loop suite +// +static LoopData* s_loop_data = 0; + + +// +// Default value for static ADomain member. +// +double ADomain::loop_length_factor = 1.0; + +// +// Prototypes for file scope routines used to manage loop data and checksums +// + +namespace { + +Real_ptr allocAndInitData(LoopData::RealArray& ra, Index_type len); +Index_type* allocAndInitData(LoopData::IndxArray& ia, Index_type len); +Complex_ptr allocAndInitData(LoopData::ComplexArray& ca, Index_type len); +void initData(LoopData::RealArray& ra); +void initData(LoopData::IndxArray& ia); +void initData(LoopData::ComplexArray& ca); + +void initChksum(LoopStat& stat, LoopLength ilength); +void updateChksum(LoopStat& stat, LoopLength ilength, + const LoopData::RealArray& ra, Real_type scale_factor = 1.0); +void updateChksum(LoopStat& stat, LoopLength ilength, + Real_type val); +void updateChksum(LoopStat& stat, LoopLength ilength, + const LoopData::ComplexArray& ca, Real_type scale_factor = 1.0); + +} // closing brace for unnamed namespace + + + + +// +// Accessor routine for suite kernel data. +// +LoopData& getLoopData() { return *s_loop_data; } + + +// +// Define how suite will run and initialize stat structures for loops. +// +// NOTE: Loop lengths, loop sample counts (and weights for optimization +// evaluation) are defined here! +// +// These values should be set large enough to accurately generate +// execution timings (i.e., not so small that they are masked by CPU timing +// resolution and overhead). The values set here were manually determined +// so that O(1) seconds of execution time is required to sample each loop +// on some of our fastest Intel machines. 
+// +void defineLoopSuiteRunInfo(const std::vector& run_variants, + bool run_loop[], + double sample_frac, + double loop_length_factor) +{ +#ifdef TESTSUITE + std::cout << "\n defineLoopSuiteRunInfo..." << std::endl; +#endif + std::vector run_variant_names = getVariantNames(run_variants); + + if ( s_loop_data == 0 ) { + s_loop_data = new LoopData(); + } + + // + // + // Enumeration defining loop groups for relative weighting of + // execution timing based on what we think is most important. + // + // In computation of figures of merit (FOM), loops with higher + // weights will reduce FOM value more for higher run-time than + // those with lower weights. + // + enum WeightGroup { + + DATA_PARALLEL = 0, + ORDER_DEPENDENT, + TRANSCENDENTAL, + DATA_DEPENDENT, + POINTER_NEST, + COMPLEX, + + NUM_WEIGHT_GROUPS // Keep this one last and NEVER comment out (!!) + }; + + + // + // Initialize structure holding loop suite execution data. + // + LoopSuiteRunInfo& suite_info = getLoopSuiteRunInfo(); + + suite_info.loop_samp_frac = sample_frac; + + suite_info.loop_weights.resize(NUM_WEIGHT_GROUPS); + suite_info.loop_weights[DATA_PARALLEL] = 2.0; + suite_info.loop_weights[ORDER_DEPENDENT] = 1.8; + suite_info.loop_weights[TRANSCENDENTAL] = 1.7; + suite_info.loop_weights[DATA_DEPENDENT] = 1.7; + suite_info.loop_weights[POINTER_NEST] = 1.4; + suite_info.loop_weights[COMPLEX] = 1.0; + + suite_info.loop_length_names.resize(NUM_LENGTHS); + suite_info.loop_length_names[LONG] = std::string("LONG"); + suite_info.loop_length_names[MEDIUM] = std::string("MEDIUM"); + suite_info.loop_length_names[SHORT] = std::string("SHORT"); + + suite_info.num_loops_run.resize( run_variant_names.size() ); + suite_info.tot_time.resize( run_variant_names.size() ); + suite_info.fom_rel.resize( run_variant_names.size() ); + suite_info.fom_rate.resize( run_variant_names.size() ); + + for (unsigned ilv = 0; ilv < run_variant_names.size(); ++ilv) { + suite_info.addLoopStats(run_variant_names[ilv]); + + 
suite_info.num_loops_run[ilv].resize(NUM_LENGTHS, 0); + suite_info.tot_time[ilv].resize(NUM_LENGTHS, 0.0); + suite_info.fom_rel[ilv].resize(NUM_LENGTHS, 0.0); + suite_info.fom_rate[ilv].resize(NUM_LENGTHS, 0.0); + } + + + // + // Define common loop lengths for LONG, MEDIUM, SHORT loops. + // + // The values assigned here are propagated across all kernels + // (with a few exceptions) to simplify suite configuration en masse. + // These can also be set per-kernel below. + // + std::vector< int > shared_loop_length(NUM_LENGTHS); + shared_loop_length[LONG] = static_cast(44217 * loop_length_factor); + shared_loop_length[MEDIUM] = static_cast(5001 * loop_length_factor); + shared_loop_length[SHORT] = static_cast(171 * loop_length_factor); + + ADomain::loop_length_factor = loop_length_factor; + + + std::vector& weight = suite_info.loop_weights; + + Index_type max_loop_length = 0; + + for (unsigned iloop = 0 ; iloop < suite_info.num_loops; ++iloop) { + + std::string loop_name; + LoopStat loop_stat(suite_info.num_loop_lengths); + + Index_type max_loop_indx = 0; + + if ( run_loop[iloop] ) { + + switch ( iloop ) { + + case REF_LOOP : { + loop_name = std::string("REF_LOOP"); + // + // Note: Reference loop stats are not used in + // in suite. Parameters are defined in + // defineReferenceLoopRunInfo( ) routine. + // + break; + } + + + // + // Parameters defining how loops in Subset A are run... 
+ // + case PRESSURE_CALC : + case PRESSURE_CALC_ALT : { + + if ( static_cast(iloop) == PRESSURE_CALC ) { + loop_name = std::string("PRESSURE_CALC"); + } else { + loop_name = std::string("PRESSURE_CALC_ALT"); + } + + loop_stat.loop_weight = weight[DATA_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 15000; + loop_stat.samples_per_pass[MEDIUM] = 200000; + loop_stat.samples_per_pass[SHORT] = 10000000; + + break; + } + + case ENERGY_CALC : + case ENERGY_CALC_ALT : { + + if ( static_cast(iloop) == ENERGY_CALC ) { + loop_name = std::string("ENERGY_CALC"); + } else { + loop_name = std::string("ENERGY_CALC_ALT"); + } + + loop_stat.loop_weight = weight[DATA_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 3000; + loop_stat.samples_per_pass[MEDIUM] = 30000; + loop_stat.samples_per_pass[SHORT] = 1000000; + + break; + } + + case VOL3D_CALC : { + loop_name = std::string("VOL3D_CALC"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + Index_type ndims = 3; + + ADomain Ldomain(LONG, ndims); + loop_stat.loop_length[LONG] = Ldomain.lpz - Ldomain.fpz + 1; + ADomain Mdomain(MEDIUM, ndims); + loop_stat.loop_length[MEDIUM] = Mdomain.lpz - Mdomain.fpz + 1; + ADomain Sdomain(SHORT, ndims); + loop_stat.loop_length[SHORT] = Sdomain.lpz - Sdomain.fpz + 1; + + max_loop_indx = Ldomain.lpn; + + loop_stat.samples_per_pass[LONG] = 6500; + loop_stat.samples_per_pass[MEDIUM] = 30000; + loop_stat.samples_per_pass[SHORT] = 800000; + + break; + } + + case DEL_DOT_VEC_2D : { + loop_name = std::string("DEL_DOT_VEC_2D"); + + loop_stat.loop_weight = weight[DATA_PARALLEL]; + + Index_type ndims = 2; + + ADomain Ldomain(LONG, ndims); + loop_stat.loop_length[LONG] = Ldomain.n_real_zones; + ADomain 
Mdomain(MEDIUM, ndims); + loop_stat.loop_length[MEDIUM] = Mdomain.n_real_zones; + ADomain Sdomain(SHORT, ndims); + loop_stat.loop_length[SHORT] = Sdomain.n_real_zones; + + max_loop_indx = Ldomain.lrn; + + loop_stat.samples_per_pass[LONG] = 4000; + loop_stat.samples_per_pass[MEDIUM] = 25000; + loop_stat.samples_per_pass[SHORT] = 2000000; + + break; + } + + case COUPLE : { + loop_name = std::string("COUPLE"); + + loop_stat.loop_weight = weight[TRANSCENDENTAL]; + + Index_type ndims = 3; + + ADomain Ldomain(LONG, ndims); + loop_stat.loop_length[LONG] = Ldomain.lpz - Ldomain.fpz + 1; + ADomain Mdomain(MEDIUM, ndims); + loop_stat.loop_length[MEDIUM] = Mdomain.lpz - Mdomain.fpz + 1; + ADomain Sdomain(SHORT, ndims); + loop_stat.loop_length[SHORT] = Sdomain.lpz - Sdomain.fpz + 1; + + max_loop_indx = Ldomain.lrn; + + loop_stat.samples_per_pass[LONG] = 2000; + loop_stat.samples_per_pass[MEDIUM] = 10000; + loop_stat.samples_per_pass[SHORT] = 600000; + + break; + } + + case FIR : { + loop_name = std::string("FIR"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 10000; + loop_stat.samples_per_pass[MEDIUM] = 80000; + loop_stat.samples_per_pass[SHORT] = 3000000; + + break; + } + + + // + // Parameters defining how loops in Subset B are run... 
+ // + case INIT3 : { + loop_name = std::string("INIT3"); + + loop_stat.loop_weight = weight[DATA_PARALLEL]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 10000; + loop_stat.samples_per_pass[MEDIUM] = 110000; + loop_stat.samples_per_pass[SHORT] = 12000000; + + break; + } + + case MULADDSUB : { + loop_name = std::string("MULADDSUB"); + + loop_stat.loop_weight = weight[DATA_PARALLEL]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 12000; + loop_stat.samples_per_pass[MEDIUM] = 140000; + loop_stat.samples_per_pass[SHORT] = 15000000; + + break; + } + + case IF_QUAD : { + loop_name = std::string("IF_QUAD"); + + loop_stat.loop_weight = weight[DATA_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 3000; + loop_stat.samples_per_pass[MEDIUM] = 30000; + loop_stat.samples_per_pass[SHORT] = 1000000; + + break; + } + + case TRAP_INT : { + loop_name = std::string("TRAP_INT"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 4000; + loop_stat.samples_per_pass[MEDIUM] = 32000; + loop_stat.samples_per_pass[SHORT] = 1000000; + + break; + } + + + // + // Parameters defining how loops in Subset C are run... 
+ // + case HYDRO_1D : { + loop_name = std::string("HYDRO_1D"); + + loop_stat.loop_weight = weight[DATA_PARALLEL]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 30000; + loop_stat.samples_per_pass[MEDIUM] = 320000; + loop_stat.samples_per_pass[SHORT] = 15000000; + + break; + } + + case ICCG : { + loop_name = std::string("ICCG"); + + loop_stat.loop_weight = weight[COMPLEX]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 20000; + loop_stat.samples_per_pass[MEDIUM] = 200000; + loop_stat.samples_per_pass[SHORT] = 6000000; + + break; + } + + case INNER_PROD : { + loop_name = std::string("INNER_PROD"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 50000; + loop_stat.samples_per_pass[MEDIUM] = 600000; + loop_stat.samples_per_pass[SHORT] = 30000000; + + break; + } + + case BAND_LIN_EQ : { + loop_name = std::string("BAND_LIN_EQ"); + + loop_stat.loop_weight = weight[COMPLEX]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 40000; + loop_stat.samples_per_pass[MEDIUM] = 600000; + loop_stat.samples_per_pass[SHORT] = 20000000; + + break; + } + + case TRIDIAG_ELIM : { + loop_name = std::string("TRIDIAG_ELIM"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 10000; + 
loop_stat.samples_per_pass[MEDIUM] = 100000; + loop_stat.samples_per_pass[SHORT] = 3000000; + + break; + } + + case EOS : { + loop_name = std::string("EOS"); + + loop_stat.loop_weight = weight[DATA_PARALLEL]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 18000; + loop_stat.samples_per_pass[MEDIUM] = 140000; + loop_stat.samples_per_pass[SHORT] = 5000000; + + break; + } + + case ADI : { + loop_name = std::string("ADI"); + + loop_stat.loop_weight = weight[COMPLEX]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 1000; + loop_stat.samples_per_pass[MEDIUM] = 9000; + loop_stat.samples_per_pass[SHORT] = 300000; + + break; + } + + case INT_PREDICT : { + loop_name = std::string("INT_PREDICT"); + + loop_stat.loop_weight = weight[POINTER_NEST]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 3000; + loop_stat.samples_per_pass[MEDIUM] = 30000; + loop_stat.samples_per_pass[SHORT] = 2000000; + + break; + } + + case DIFF_PREDICT : { + loop_name = std::string("DIFF_PREDICT"); + + loop_stat.loop_weight = weight[POINTER_NEST]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 2000; + loop_stat.samples_per_pass[MEDIUM] = 22000; + loop_stat.samples_per_pass[SHORT] = 1800000; + + break; + } + + case FIRST_SUM : { + loop_name = std::string("FIRST_SUM"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = 
loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 30000; + loop_stat.samples_per_pass[MEDIUM] = 250000; + loop_stat.samples_per_pass[SHORT] = 8000000; + + break; + } + + case FIRST_DIFF : { + loop_name = std::string("FIRST_DIFF"); + + loop_stat.loop_weight = weight[DATA_PARALLEL]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 30000; + loop_stat.samples_per_pass[MEDIUM] = 500000; + loop_stat.samples_per_pass[SHORT] = 30000000; + + break; + } + + case PIC_2D : { + loop_name = std::string("PIC_2D"); + + loop_stat.loop_weight = weight[COMPLEX]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 2000; + loop_stat.samples_per_pass[MEDIUM] = 18000; + loop_stat.samples_per_pass[SHORT] = 700000; + + break; + } + + case PIC_1D : { + loop_name = std::string("PIC_1D"); + + loop_stat.loop_weight = weight[DATA_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 3000; + loop_stat.samples_per_pass[MEDIUM] = 24000; + loop_stat.samples_per_pass[SHORT] = 1000000; + + break; + } + + case HYDRO_2D : { + loop_name = std::string("HYDRO_2D"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 300; + loop_stat.samples_per_pass[MEDIUM] = 2000; + loop_stat.samples_per_pass[SHORT] = 50000; + + break; + } + + case GEN_LIN_RECUR : { + loop_name = std::string("GEN_LIN_RECUR"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + 
loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 4000; + loop_stat.samples_per_pass[MEDIUM] = 36000; + loop_stat.samples_per_pass[SHORT] = 1000000; + + break; + } + + case DISC_ORD : { + loop_name = std::string("DISC_ORD"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 1000; + loop_stat.samples_per_pass[MEDIUM] = 8000; + loop_stat.samples_per_pass[SHORT] = 200000; + + break; + } + + case MAT_X_MAT : { + loop_name = std::string("MAT_X_MAT"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 8; + loop_stat.samples_per_pass[MEDIUM] = 70; + loop_stat.samples_per_pass[SHORT] = 8000; + + break; + } + + case PLANCKIAN : { + loop_name = std::string("PLANCKIAN"); + + loop_stat.loop_weight = weight[TRANSCENDENTAL]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 4000; + loop_stat.samples_per_pass[MEDIUM] = 30000; + loop_stat.samples_per_pass[SHORT] = 1000000; + + break; + } + + case IMP_HYDRO_2D : { + loop_name = std::string("IMP_HYDRO_2D"); + + loop_stat.loop_weight = weight[ORDER_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 800; + loop_stat.samples_per_pass[MEDIUM] = 6000; + loop_stat.samples_per_pass[SHORT] = 150000; + + break; + } + + case FIND_FIRST_MIN : { + loop_name = std::string("FIND_FIRST_MIN"); + + 
loop_stat.loop_weight = weight[DATA_DEPENDENT]; + + for (int i = 0; i < NUM_LENGTHS; ++i) { + loop_stat.loop_length[i] = shared_loop_length[i]; + } + max_loop_indx = loop_stat.loop_length[LONG]; + + loop_stat.samples_per_pass[LONG] = 50000; + loop_stat.samples_per_pass[MEDIUM] = 330000; + loop_stat.samples_per_pass[SHORT] = 8000000; + + break; + } + + default : { + std::cout << "\n Unknown loop id = " << iloop << std::endl; + } + + } // switch statement on loop id + + } // if loop with id is to be run + + + suite_info.loop_names.push_back(loop_name); + + // + // Set max loop length to be largest loop index used over all loops. + // + max_loop_length = + std::max(max_loop_length, max_loop_indx); + + // + // Set number of times each loop length will be run. + // + for (unsigned i = 0; i < suite_info.num_loop_lengths; ++i) { + + loop_stat.samples_per_pass[i] = static_cast( + loop_stat.samples_per_pass[i] * suite_info.loop_samp_frac / + loop_length_factor); + + if ( suite_info.run_loop_length[i] ) { + loop_stat.loop_run_count[i] = + loop_stat.samples_per_pass[i] * suite_info.num_suite_passes; + } else { + loop_stat.loop_run_count[i] = 0; + } + + } + + // + // We add loop stat for each loop to maintain consistent array indexing. + // However, only loops specified to be run will be executed. + // + for (unsigned ilv = 0; ilv < run_variant_names.size(); ++ilv) { + suite_info.getLoopStats(run_variant_names[ilv]).push_back(loop_stat); + } + + } // loop over loop IDs + + + defineReferenceLoopRunInfo(); + + s_loop_data->max_loop_length = + std::max(max_loop_length, suite_info.ref_loop_stat.loop_length[LONG]); +} + +// +// Generate vector of loop variant names string from vector of +// LoopVariantID enum values. 
+// +std::vector getVariantNames( + const std::vector& lvids) +{ + std::vector run_variant_names; + for (unsigned ilv = 0; ilv < lvids.size(); ++ilv) { + std::string variant_name = getVariantName(lvids[ilv]); + run_variant_names.push_back(variant_name); + } + return run_variant_names; +} + +// +// Generate loop variant name string from LoopVariantID enum value. +// +std::string getVariantName(LoopVariantID lvid) +{ + std::string lvname; + + switch ( lvid ) { + +// Benchmark variants +// + case RAW: { + lvname = "Raw"; break; + } + case RAW_OMP: { + lvname = "Raw_OMP"; break; + } + case FORALL_LAMBDA: { + lvname = "Forall_Lambda"; break; + } + case FORALL_LAMBDA_OMP: { + lvname = "Forall_Lambda_OMP"; break; + } + +#if defined(LCALS_DO_MISC) + +// Misc variants +// + case FORALL_HYBRID_LAMBDA: { + lvname = "Hybrid_Lambda"; break; + } +#if 0 // THESE ARE NOT AVAILABLE YET!!! + case FORALL_HYBRID_LAMBDA_OMP: { + lvname = "Hybrid_Lambda_OMP"; break; + } +#endif + case FORALL_FUNCTOR: { + lvname = "Forall_Functor"; break; + } + case FORALL_FUNCTOR_OMP: { + lvname = "Forall_Functor_OMP"; break; + } +#if 0 // THESE ARE NOT AVAILABLE YET!!! + case FORALL_HYBRID_FUNCTOR: { + lvname = "Hybrid_Functor"; break; + } + case FORALL_HYBRID_FUNCTOR_OMP: { + lvname = "Hybrid_Functor_OMP"; break; + } +#endif + case RAW_FUNC: { + lvname = "Raw_Func"; break; + } + case FORALL_LAMBDA_TYPEFIX: { + lvname = "Forall_Lambda_TYPEFIX"; break; + } + case FORALL_LAMBDA_OMP_TYPEFIX: { + lvname = "Forall_Lambda_OMP_TYPEFIX"; break; + } + case FORALL_HYBRID_LAMBDA_TYPEFIX: { + lvname = "Hybrid_Lambda_TYPEFIX"; break; + } + +#endif // if LCALS_DO_MISC + + default: { + std::cout << "\n Unknown loop variant id = " << lvid << std::endl; + } + + } + + return lvname; +} + +#ifdef TEST_SUITE +// +// Execute loop variant identified by function args. 
+// +void runLoopVariant( LoopVariantID lvid, + bool run_loop[], + LoopLength ilength ) +{ + LoopSuiteRunInfo& loop_suite_run_info = getLoopSuiteRunInfo(); + + std::string loop_variant_name = getVariantName(lvid); + std::vector& loop_stats = + loop_suite_run_info.getLoopStats(loop_variant_name); + + switch ( lvid ) { + +// Benchmark variants +// + case RAW: { + runARawLoops(loop_stats, run_loop, ilength); + runBRawLoops(loop_stats, run_loop, ilength); + runCRawLoops(loop_stats, run_loop, ilength); + break; + } + case FORALL_LAMBDA: { + runAForallLambdaLoops(loop_stats, run_loop, ilength); + runBForallLambdaLoops(loop_stats, run_loop, ilength); + runCForallLambdaLoops(loop_stats, run_loop, ilength); + break; + } + case RAW_OMP: { + runOMPRawLoops(loop_stats, run_loop, ilength); + break; + } + case FORALL_LAMBDA_OMP: { + runOMPForallLambdaLoops(loop_stats, run_loop, ilength); + break; + } + +#if defined(LCALS_DO_MISC) + +// Misc variants +// + case FORALL_HYBRID_LAMBDA: { + runAForallHybridLambdaLoops(loop_stats, run_loop, ilength); + runBForallHybridLambdaLoops(loop_stats, run_loop, ilength); + runCForallHybridLambdaLoops(loop_stats, run_loop, ilength); + break; + } +#if 0 // THESE ARE NOT DEFINED YET!!! + case FORALL_HYBRID_LAMBDA_OMP: { + break; + } +#endif + case FORALL_FUNCTOR: { + runAForallFunctorLoops(loop_stats, run_loop, ilength); + runBForallFunctorLoops(loop_stats, run_loop, ilength); + runCForallFunctorLoops(loop_stats, run_loop, ilength); + break; + } + case FORALL_FUNCTOR_OMP: { + runOMPForallFunctorLoops(loop_stats, run_loop, ilength); + break; + } +#if 0 // THESE ARE NOT DEFINED YET!!! 
+ case FORALL_HYBRID_FUNCTOR: { + break; + } + case FORALL_HYBRID_FUNCTOR_OMP: { + break; + } +#endif + case RAW_FUNC: { + runARawFuncLoops(loop_stats, run_loop, ilength); + runBRawFuncLoops(loop_stats, run_loop, ilength); + runCRawFuncLoops(loop_stats, run_loop, ilength); + break; + } + case FORALL_LAMBDA_TYPEFIX: { + runAForallLambdaLoops_TYPEFIX(loop_stats, run_loop, ilength); + runBForallLambdaLoops_TYPEFIX(loop_stats, run_loop, ilength); + runCForallLambdaLoops_TYPEFIX(loop_stats, run_loop, ilength); + break; + } + case FORALL_LAMBDA_OMP_TYPEFIX: { + runOMPForallLambdaLoops_TYPEFIX(loop_stats, run_loop, ilength); + break; + } + case FORALL_HYBRID_LAMBDA_TYPEFIX: { + runAForallHybridLambdaLoops_TYPEFIX(loop_stats, run_loop, ilength); + runBForallHybridLambdaLoops_TYPEFIX(loop_stats, run_loop, ilength); + runCForallHybridLambdaLoops_TYPEFIX(loop_stats, run_loop, ilength); + break; + } + +#endif // if LCALS_DO_MISC + + default: { + std::cout << "\n Unknown loop variant id = " << lvid << std::endl; + } + + } + +} +#endif + +// +// Initialize data to run loop with given ID. Note that this routine +// assumes that it is called before the loop with given ID is run and +// that data initialization calls in here are consistent with what is +// needed to execute loop. +// +// Loop data is initialized in this routine so all variants of loop +// run the same way. Note that data arrays are initialized for +// each loop only under the circumstances that it is actually required. 
+// +// + +void loopInit(unsigned iloop, LoopStat& stat) +{ + LoopData& loop_data = getLoopData(); + + flushCache(); + + stat.loop_is_run = true; + + + switch ( iloop ) { + + case REF_LOOP : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + break; + } + + case PRESSURE_CALC : + case PRESSURE_CALC_ALT : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case ENERGY_CALC : + case ENERGY_CALC_ALT : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + initData(loop_data.RealArray_1D[5]); + initData(loop_data.RealArray_1D[6]); + initData(loop_data.RealArray_1D[7]); + initData(loop_data.RealArray_1D[8]); + initData(loop_data.RealArray_1D[9]); + initData(loop_data.RealArray_1D[10]); + initData(loop_data.RealArray_1D[11]); + initData(loop_data.RealArray_1D[12]); + initData(loop_data.RealArray_1D[13]); + initData(loop_data.RealArray_1D[14]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case VOL3D_CALC : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + + break; + } + + case DEL_DOT_VEC_2D : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case COUPLE : { + + initData(loop_data.ComplexArray_1D[0]); + initData(loop_data.ComplexArray_1D[1]); + initData(loop_data.ComplexArray_1D[2]); + initData(loop_data.ComplexArray_1D[3]); + 
initData(loop_data.ComplexArray_1D[4]); + + break; + } + + case FIR : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case INIT3 : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case MULADDSUB : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case IF_QUAD : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case TRAP_INT : { + + initData(loop_data.IndxArray_1D[0]); + + initData(loop_data.RealArray_scalars); + + break; + } + + + case HYDRO_1D : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case ICCG : { + + initData(loop_data.RealArray_1D_Nx4[0]); + initData(loop_data.RealArray_1D_Nx4[1]); + + break; + } + + case INNER_PROD : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case BAND_LIN_EQ : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case TRIDIAG_ELIM : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + break; + } + + case EOS : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case ADI : { + + initData(loop_data.RealArray_1D[0]); 
+ initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + initData(loop_data.RealArray_3D_2xNx4[0]); + initData(loop_data.RealArray_3D_2xNx4[1]); + initData(loop_data.RealArray_3D_2xNx4[2]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case INT_PREDICT : { + + initData(loop_data.RealArray_2D_Nx25[0]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case DIFF_PREDICT : { + + initData(loop_data.RealArray_2D_Nx25[0]); + initData(loop_data.RealArray_2D_Nx25[1]); + + break; + } + + case FIRST_SUM : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case FIRST_DIFF : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case PIC_2D : { + + initData(loop_data.RealArray_2D_Nx25[0]); + initData(loop_data.RealArray_2D_Nx25[1]); + initData(loop_data.RealArray_2D_Nx25[2]); + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + initData(loop_data.IndxArray_1D[0]); + initData(loop_data.IndxArray_1D[1]); + + initData(loop_data.RealArray_2D_64x64[0]); + + break; + } + + case PIC_1D : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + initData(loop_data.RealArray_1D[5]); + initData(loop_data.RealArray_1D[6]); + initData(loop_data.RealArray_1D[7]); + initData(loop_data.RealArray_1D[8]); + + initData(loop_data.RealArray_scalars); + + initData(loop_data.IndxArray_1D[2]); + initData(loop_data.IndxArray_1D[3]); + initData(loop_data.IndxArray_1D[4]); + + break; + } + + case HYDRO_2D : { + + initData(loop_data.RealArray_2D_7xN[0]); + initData(loop_data.RealArray_2D_7xN[1]); + initData(loop_data.RealArray_2D_7xN[2]); + initData(loop_data.RealArray_2D_7xN[3]); + initData(loop_data.RealArray_2D_7xN[4]); + initData(loop_data.RealArray_2D_7xN[5]); + 
initData(loop_data.RealArray_2D_7xN[6]); + initData(loop_data.RealArray_2D_7xN[7]); + initData(loop_data.RealArray_2D_7xN[8]); + initData(loop_data.RealArray_2D_7xN[9]); + initData(loop_data.RealArray_2D_7xN[10]); + + break; + } + + case GEN_LIN_RECUR : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case DISC_ORD : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + initData(loop_data.RealArray_1D[5]); + initData(loop_data.RealArray_1D[6]); + initData(loop_data.RealArray_1D[7]); + initData(loop_data.RealArray_1D[8]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case MAT_X_MAT : { + + initData(loop_data.RealArray_2D_Nx25[0]); + initData(loop_data.RealArray_2D_Nx25[1]); + initData(loop_data.RealArray_2D_64x64[0]); + + break; + } + + case PLANCKIAN : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case IMP_HYDRO_2D : { + + initData(loop_data.RealArray_2D_7xN[0]); + initData(loop_data.RealArray_2D_7xN[1]); + initData(loop_data.RealArray_2D_7xN[2]); + initData(loop_data.RealArray_2D_7xN[3]); + initData(loop_data.RealArray_2D_7xN[4]); + initData(loop_data.RealArray_2D_7xN[5]); + + break; + } + + case FIND_FIRST_MIN : { + + initData(loop_data.RealArray_1D[0]); + + break; + } + + + default : { + std::cout << "\n Unknown loop id = " << iloop << std::endl; + } + } +} + +/* *********** LLVM Test Suite ************* * + * * + * Overloaded for use in the test suite. * + * Removes LoopStat argument and setting * + * the loop as run. Benchmark library * + * replaces the stat object for timing * + * statistics. 
* + * * + * ***************************************** */ + +void loopInit(unsigned iloop) //, LoopStat& stat) +{ + LoopData& loop_data = getLoopData(); + + flushCache(); + +// stat.loop_is_run = true; + + + switch ( iloop ) { + + case REF_LOOP : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + break; + } + + + // + // Initialize data for Loop Subset A... + // + case PRESSURE_CALC : + case PRESSURE_CALC_ALT : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case ENERGY_CALC : + case ENERGY_CALC_ALT : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + initData(loop_data.RealArray_1D[5]); + initData(loop_data.RealArray_1D[6]); + initData(loop_data.RealArray_1D[7]); + initData(loop_data.RealArray_1D[8]); + initData(loop_data.RealArray_1D[9]); + initData(loop_data.RealArray_1D[10]); + initData(loop_data.RealArray_1D[11]); + initData(loop_data.RealArray_1D[12]); + initData(loop_data.RealArray_1D[13]); + initData(loop_data.RealArray_1D[14]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case VOL3D_CALC : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + + break; + } + + case DEL_DOT_VEC_2D : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case COUPLE : { + + initData(loop_data.ComplexArray_1D[0]); + initData(loop_data.ComplexArray_1D[1]); + 
initData(loop_data.ComplexArray_1D[2]); + initData(loop_data.ComplexArray_1D[3]); + initData(loop_data.ComplexArray_1D[4]); + + break; + } + + case FIR : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + + // + // Initialize data for Loop Subset B... + // + case INIT3 : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case MULADDSUB : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case IF_QUAD : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case TRAP_INT : { + + initData(loop_data.IndxArray_1D[0]); + + initData(loop_data.RealArray_scalars); + + break; + } + + + // + // Initialize data for Loop Subset C... 
+ // + case HYDRO_1D : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case ICCG : { + + initData(loop_data.RealArray_1D_Nx4[0]); + initData(loop_data.RealArray_1D_Nx4[1]); + + break; + } + + case INNER_PROD : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case BAND_LIN_EQ : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case TRIDIAG_ELIM : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + break; + } + + case EOS : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case ADI : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + initData(loop_data.RealArray_3D_2xNx4[0]); + initData(loop_data.RealArray_3D_2xNx4[1]); + initData(loop_data.RealArray_3D_2xNx4[2]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case INT_PREDICT : { + + initData(loop_data.RealArray_2D_Nx25[0]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case DIFF_PREDICT : { + + initData(loop_data.RealArray_2D_Nx25[0]); + initData(loop_data.RealArray_2D_Nx25[1]); + + break; + } + + case FIRST_SUM : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case FIRST_DIFF : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + + break; + } + + case PIC_2D : { + + initData(loop_data.RealArray_2D_Nx25[0]); + initData(loop_data.RealArray_2D_Nx25[1]); + initData(loop_data.RealArray_2D_Nx25[2]); + + initData(loop_data.RealArray_1D[0]); + 
initData(loop_data.RealArray_1D[1]); + + initData(loop_data.IndxArray_1D[0]); + initData(loop_data.IndxArray_1D[1]); + + initData(loop_data.RealArray_2D_64x64[0]); + + break; + } + + case PIC_1D : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + initData(loop_data.RealArray_1D[5]); + initData(loop_data.RealArray_1D[6]); + initData(loop_data.RealArray_1D[7]); + initData(loop_data.RealArray_1D[8]); + + initData(loop_data.RealArray_scalars); + + initData(loop_data.IndxArray_1D[2]); + initData(loop_data.IndxArray_1D[3]); + initData(loop_data.IndxArray_1D[4]); + + break; + } + + case HYDRO_2D : { + + initData(loop_data.RealArray_2D_7xN[0]); + initData(loop_data.RealArray_2D_7xN[1]); + initData(loop_data.RealArray_2D_7xN[2]); + initData(loop_data.RealArray_2D_7xN[3]); + initData(loop_data.RealArray_2D_7xN[4]); + initData(loop_data.RealArray_2D_7xN[5]); + initData(loop_data.RealArray_2D_7xN[6]); + initData(loop_data.RealArray_2D_7xN[7]); + initData(loop_data.RealArray_2D_7xN[8]); + initData(loop_data.RealArray_2D_7xN[9]); + initData(loop_data.RealArray_2D_7xN[10]); + + break; + } + + case GEN_LIN_RECUR : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case DISC_ORD : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + initData(loop_data.RealArray_1D[5]); + initData(loop_data.RealArray_1D[6]); + initData(loop_data.RealArray_1D[7]); + initData(loop_data.RealArray_1D[8]); + + initData(loop_data.RealArray_scalars); + + break; + } + + case MAT_X_MAT : { + + initData(loop_data.RealArray_2D_Nx25[0]); + initData(loop_data.RealArray_2D_Nx25[1]); + 
initData(loop_data.RealArray_2D_64x64[0]); + + break; + } + + case PLANCKIAN : { + + initData(loop_data.RealArray_1D[0]); + initData(loop_data.RealArray_1D[1]); + initData(loop_data.RealArray_1D[2]); + initData(loop_data.RealArray_1D[3]); + initData(loop_data.RealArray_1D[4]); + + break; + } + + case IMP_HYDRO_2D : { + + initData(loop_data.RealArray_2D_7xN[0]); + initData(loop_data.RealArray_2D_7xN[1]); + initData(loop_data.RealArray_2D_7xN[2]); + initData(loop_data.RealArray_2D_7xN[3]); + initData(loop_data.RealArray_2D_7xN[4]); + initData(loop_data.RealArray_2D_7xN[5]); + + break; + } + + case FIND_FIRST_MIN : { + + initData(loop_data.RealArray_1D[0]); + + break; + } + + + default : { + std::cout << "\n Unknown loop id = " << iloop << std::endl; + } + + } // switch statement on loop id + +} + + +// +// Finalize data for loop with given ID. Note that this routine assumes +// that it is called after the loop with given ID is run and that checksum +// calls in here are concistent with what is needed for loop. +// +void loopFinalize(unsigned iloop, LoopStat& stat, LoopLength ilength) +{ +#if defined(LCALS_VERIFY_CHECKSUM) + initChksum(stat, ilength); + + LoopData& loop_data = getLoopData(); + + switch ( iloop ) { + + case REF_LOOP : { + + // Nothing to do for REF_LOOP case... + + break; + } + + + // + // Update checksums for Loop Subset A... 
+ // + case PRESSURE_CALC : + case PRESSURE_CALC_ALT : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[2]); + + break; + } + + case ENERGY_CALC : + case ENERGY_CALC_ALT : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + updateChksum(stat, ilength, loop_data.RealArray_1D[5]); + + break; + } + + case VOL3D_CALC : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[3]); + + break; + } + + case DEL_DOT_VEC_2D : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[4]); + + break; + } + + case COUPLE : { + + updateChksum(stat, ilength, loop_data.ComplexArray_1D[0]); + updateChksum(stat, ilength, loop_data.ComplexArray_1D[1]); + updateChksum(stat, ilength, loop_data.ComplexArray_1D[2]); + + break; + } + + case FIR : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + + // + // Update checksums for Loop Subset B... + // + case INIT3 : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + updateChksum(stat, ilength, loop_data.RealArray_1D[1]); + updateChksum(stat, ilength, loop_data.RealArray_1D[2]); + + break; + } + + case MULADDSUB : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + updateChksum(stat, ilength, loop_data.RealArray_1D[1]); + updateChksum(stat, ilength, loop_data.RealArray_1D[2]); + + break; + } + + case IF_QUAD : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[3]); + updateChksum(stat, ilength, loop_data.RealArray_1D[4]); + + break; + } + + case TRAP_INT : { + + updateChksum(stat, ilength, loop_data.scalar_Real[0]); + + break; + } + + + // + // Update checksums for Loop Subset C... 
+ // + case HYDRO_1D : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + case ICCG : { + + updateChksum(stat, ilength, loop_data.RealArray_1D_Nx4[0]); + + break; + } + + case INNER_PROD : { + + updateChksum(stat, ilength, loop_data.scalar_Real[0]); + + break; + } + + case BAND_LIN_EQ : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + case TRIDIAG_ELIM : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + case EOS : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + case ADI : { + + updateChksum(stat, ilength, loop_data.RealArray_3D_2xNx4[0]); + updateChksum(stat, ilength, loop_data.RealArray_3D_2xNx4[1]); + updateChksum(stat, ilength, loop_data.RealArray_3D_2xNx4[2]); + + break; + } + + case INT_PREDICT : { + + updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]); + + break; + } + + case DIFF_PREDICT : { + + updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]); + + break; + } + + case FIRST_SUM : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + case FIRST_DIFF : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + case PIC_2D : { + + updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]); + updateChksum(stat, ilength, loop_data.RealArray_2D_64x64[0]); + + break; + } + + case PIC_1D : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[6]); + updateChksum(stat, ilength, loop_data.RealArray_1D[1]); + updateChksum(stat, ilength, loop_data.RealArray_1D[7]); + + break; + } + + case HYDRO_2D : { + + updateChksum(stat, ilength, loop_data.RealArray_2D_7xN[9]); + updateChksum(stat, ilength, loop_data.RealArray_2D_7xN[10]); + + break; + } + + case GEN_LIN_RECUR : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[0]); + + break; + } + + case DISC_ORD : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[7]); + + break; + } + + case MAT_X_MAT : { + + 
updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]); + + break; + } + + case PLANCKIAN : { + + updateChksum(stat, ilength, loop_data.RealArray_1D[4]); + + break; + } + + case IMP_HYDRO_2D : { + + updateChksum(stat, ilength, loop_data.RealArray_2D_7xN[0]); + + break; + } + + case FIND_FIRST_MIN : { + + updateChksum(stat, ilength, loop_data.scalar_Real[0]); + + break; + } + + + default : { + std::cout << "\n Unknown loop id = " << iloop << std::endl; + } + + } // switch statement on loop id + +#endif // if LCALS_VERIFY_CHECKSUM +} + + +// +// Allocate and initialize arrays (and scalars) used to execute loops in suite. +// +void allocateLoopData() +{ +#ifdef TESTSUITE + std::cout << "\n allocateLoopData..." << std::endl; +#endif + unsigned num_aligned_segments = + (s_loop_data->max_loop_length + 20)/LCALS_DATA_ALIGN + 1; + unsigned aligned_chunksize = num_aligned_segments * LCALS_DATA_ALIGN; + + // + // Allocate and initialize 1D loop length Real arrays. + // + for (unsigned i = 0; i < s_loop_data->s_num_1D_Real_arrays; ++i) { + Index_type data_len = aligned_chunksize; + + LoopData::RealArray* rarray = s_loop_data->RealArray_1D; + rarray[i].id = i+1; + Real_ptr data = allocAndInitData(rarray[i], data_len); + + s_loop_data->array_1D_Real[i] = data; + } + + // + // Allocate and initialize 1D loop length X 4 Real arrays. + // + for (unsigned i = 0; i < s_loop_data->s_num_1D_Nx4_Real_arrays; ++i) { + Index_type data_len = aligned_chunksize*4; + + LoopData::RealArray* rarray = s_loop_data->RealArray_1D_Nx4; + rarray[i].id = i+1; + Real_ptr data = allocAndInitData(rarray[i], data_len); + + s_loop_data->array_1D_Nx4_Real[i] = data; + } + + // + // Allocate and initialize 1D loop length Indx arrays. 
+ // + for (unsigned i = 0; i < s_loop_data->s_num_1D_Indx_arrays; ++i) { + Index_type data_len = aligned_chunksize; + + LoopData::IndxArray* iarray = s_loop_data->IndxArray_1D; + iarray[i].id = i; + Index_type* data = allocAndInitData(iarray[i], data_len); + + s_loop_data->array_1D_Indx[i] = data; + } + + // + // Allocate and initialize 1D loop length Complex arrays. + // + for (unsigned i = 0; i < s_loop_data->s_num_1D_Complex_arrays; ++i) { + Index_type data_len = aligned_chunksize; + + LoopData::ComplexArray* carray = s_loop_data->ComplexArray_1D; + carray[i].id = i+1; + Complex_ptr data = allocAndInitData(carray[i], data_len); + + s_loop_data->array_1D_Complex[i] = data; + } + + // + // Allocate and initialize 2D loop length X 25 Real arrays. + // + for (unsigned i = 0; i < s_loop_data->s_num_2D_Nx25_Real_arrays; ++i) { + Index_type data_len = aligned_chunksize*25; + + LoopData::RealArray* rarray = s_loop_data->RealArray_2D_Nx25; + rarray[i].id = i+1; + Real_ptr data = allocAndInitData(rarray[i], data_len); + + s_loop_data->array_2D_Nx25_Real[i] = new Real_ptr[aligned_chunksize]; + for (Index_type k = 0; k < aligned_chunksize; ++k) { + s_loop_data->array_2D_Nx25_Real[i][k] = &data[k*25]; + } + } + + // + // Allocate and initialize 2D 7 X loop length Real arrays. + // + for (unsigned i = 0; i < s_loop_data->s_num_2D_7xN_Real_arrays; ++i) { + Index_type data_len = 7*aligned_chunksize; + + LoopData::RealArray* rarray = s_loop_data->RealArray_2D_7xN; + rarray[i].id = i+1; + Real_ptr data = allocAndInitData(rarray[i], data_len); + + s_loop_data->array_2D_7xN_Real[i] = new Real_ptr[7]; + for (Index_type k = 0; k < 7; ++k) { + s_loop_data->array_2D_7xN_Real[i][k] = &data[k*aligned_chunksize]; + } + } + + // + // Allocate and initialize 2D 64 X 64 Real arrays. 
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_64x64_Real_arrays; ++i) {
+      Index_type data_len = 64*64;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_2D_64x64;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len);
+
+      // Row-pointer table over the flat buffer: row k starts at data[k*64].
+      s_loop_data->array_2D_64x64_Real[i] = new Real_ptr[64];
+      for (Index_type k = 0; k < 64; ++k) {
+         s_loop_data->array_2D_64x64_Real[i][k] = &data[k*64];
+      }
+   }
+
+   //
+   // Allocate and initialize 3D 2 X loop length X 4 Real arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_3D_2xNx4_Real_arrays; ++i) {
+      Index_type data_len = 2*aligned_chunksize*4;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_3D_2xNx4;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len);
+
+      s_loop_data->array_3D_2xNx4_Real[i] = new Real_ptr*[2];
+      for (Index_type k = 0; k < 2; ++k) {
+         s_loop_data->array_3D_2xNx4_Real[i][k] = new Real_ptr[aligned_chunksize];
+      }
+
+      // NOTE(review): k*l*4 is 0 for every l when k == 0, so all [0][l]
+      // entries point at data[0] and [1][l] points at data[l*4]; the upper
+      // half of the 2*N*4 buffer is never addressed. The intended offset is
+      // presumably (k*aligned_chunksize + l)*4. Left unchanged here because
+      // a fix alters the results of loops using these arrays -- confirm
+      // before changing.
+      for (Index_type k = 0; k < 2; ++k) {
+         for (Index_type l = 0; l < aligned_chunksize; ++l) {
+            s_loop_data->array_3D_2xNx4_Real[i][k][l] = &data[k*l*4];
+         }
+      }
+   }
+
+
+   //
+   // Initialize Real scalars.
+   //
+   s_loop_data->RealArray_scalars.id = 21;
+   s_loop_data->RealArray_scalars.data = s_loop_data->scalar_Real;
+   s_loop_data->RealArray_scalars.len = s_loop_data->s_num_Real_scalars;
+   initData(s_loop_data->RealArray_scalars);
+
+}
+
+//
+// Free arrays used in loop suite loop execution (allocated in routine above).
+//
+// Does nothing when the loop data object does not exist (never created or
+// already freed). Buffers obtained from posix_memalign are released with
+// free(); pointer tables and the Complex arrays, which come from new[], are
+// released with delete []. On return s_loop_data is null.
+//
+void freeLoopData()
+{
+   // Nothing to release. (The test used to be "!= 0", which returned early
+   // whenever data existed and dereferenced a null pointer otherwise.)
+   if ( s_loop_data == 0 ) return;
+#ifdef TESTSUITE
+   std::cout << "\n freeLoopData..." << std::endl;
+#endif
+   //
+   // De-allocate 1D loop length Real arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_1D_Real[i].get() );
+#else
+      free( s_loop_data->array_1D_Real[i] );
+#endif
+   }
+
+   //
+   // De-allocate 1D loop length X 4 Real arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Nx4_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_1D_Nx4_Real[i].get() );
+#else
+      free( s_loop_data->array_1D_Nx4_Real[i] );
+#endif
+   }
+
+   //
+   // De-allocate 1D loop length Indx arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Indx_arrays; ++i) {
+      free( s_loop_data->array_1D_Indx[i] );
+   }
+
+   //
+   // De-allocate 1D loop length Complex arrays.
+   // These are created with new[] in allocAndInitData, so they must be
+   // released with delete [] rather than free().
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Complex_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      delete [] s_loop_data->array_1D_Complex[i].get();
+#else
+      delete [] s_loop_data->array_1D_Complex[i];
+#endif
+   }
+
+   //
+   // De-allocate 2D loop length X 25 Real arrays (previously never freed).
+   // Entry [i][0] is the start of the flat data buffer; [i] is the row
+   // pointer table created with new[].
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_Nx25_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_2D_Nx25_Real[i][0].get() );
+#else
+      free( s_loop_data->array_2D_Nx25_Real[i][0] );
+#endif
+      delete [] s_loop_data->array_2D_Nx25_Real[i];
+   }
+
+   //
+   // De-allocate 2D 7 X loop length Real arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_7xN_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_2D_7xN_Real[i][0].get() );
+#else
+      free( s_loop_data->array_2D_7xN_Real[i][0] );
+#endif
+      delete [] s_loop_data->array_2D_7xN_Real[i];
+   }
+
+   //
+   // De-allocate 2D 64 X 64 Real arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_64x64_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_2D_64x64_Real[i][0].get() );
+#else
+      free( s_loop_data->array_2D_64x64_Real[i][0] );
+#endif
+      delete [] s_loop_data->array_2D_64x64_Real[i];
+   }
+
+   //
+   // De-allocate 3D 2 X loop length X 4 Real arrays.
+   // Entry [i][0][0] is the start of the flat data buffer.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_3D_2xNx4_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_3D_2xNx4_Real[i][0][0].get() );
+#else
+      free( s_loop_data->array_3D_2xNx4_Real[i][0][0] );
+#endif
+      for (Index_type k = 0; k < 2; ++k) {
+         delete [] s_loop_data->array_3D_2xNx4_Real[i][k];
+      }
+      delete [] s_loop_data->array_3D_2xNx4_Real[i];
+   }
+
+   delete s_loop_data;
+   s_loop_data = 0;
+}
+
+
+//
+// Implementations of file scope routines used to manage loop data
+// and checksums
+//
+
+namespace {
+
+
+//
+// Routines to allocate and initialize individual arrays consistently for
+// checking results.
+//
+// Each routine allocates len elements, stores the buffer and its length in
+// the given array descriptor, fills the buffer through initData(), and
+// returns the buffer. Real and Indx buffers are requested from
+// posix_memalign with alignment LCALS_DATA_ALIGN (release with free());
+// Complex buffers come from new[] (release with delete []).
+//
+Real_ptr allocAndInitData(LoopData::RealArray& ra, Index_type len)
+{
+   Real_ptr data = 0;
+   // A failed allocation would otherwise surface as a null write in initData.
+   if ( posix_memalign( (void **)&data, LCALS_DATA_ALIGN,
+                        len*sizeof(Real_type) ) != 0 ) {
+      std::cout << "\n posix_memalign failed for Real array of length "
+                << len << std::endl;
+      abort();
+   }
+   ra.data = data;
+   ra.len = len;
+
+   initData(ra);
+
+   return data;
+}
+
+Index_type* allocAndInitData(LoopData::IndxArray& ia, Index_type len)
+{
+   Index_type* data = 0;
+   if ( posix_memalign( (void **)&data, LCALS_DATA_ALIGN,
+                        len*sizeof(Index_type) ) != 0 ) {
+      std::cout << "\n posix_memalign failed for Indx array of length "
+                << len << std::endl;
+      abort();
+   }
+   ia.data = data;
+   ia.len = len;
+
+   initData(ia);
+
+   return data;
+}
+
+Complex_ptr allocAndInitData(LoopData::ComplexArray& ca, Index_type len)
+{
+   Complex_ptr data = new Complex_type[len];
+   ca.data = data;
+   ca.len = len;
+
+   initData(ca);
+
+   return data;
+}
+
+//
+// Fill a Real array with data[j] = factor*(j + 1.1)/(j + 1.12345), where
+// factor is 0.1 when the array id is odd and 0.2 when it is even.
+// The loop runs under "omp parallel for" when LCALS_OMP_MEM_INIT is defined.
+//
+void initData(LoopData::RealArray& ra)
+{
+   int id = ra.id;
+   Real_type factor = ( id % 2 ? 0.1 : 0.2 );
+   Real_ptr data = ra.data;
+   Index_type totlen = ra.len;
+#if defined(LCALS_OMP_MEM_INIT)
+#pragma omp parallel for
+#endif
+   for (Index_type j = 0; j < totlen; ++j) {
+      data[j] = factor*(j + 1.1)/(j + 1.12345);
+   }
+}
+
+//
+// Fill an Indx array with zeros (the array id is not used).
+//
+void initData(LoopData::IndxArray& ia)
+{
+   Index_type* data = ia.data;
+   Index_type totlen = ia.len;
+#if defined(LCALS_OMP_MEM_INIT)
+#pragma omp parallel for
+#endif
+   for (Index_type j = 0; j < totlen; ++j) {
+      data[j] = 0;
+   }
+}
+
+//
+// Fill a Complex array with data[j] = factor*(j + 1.1)/(j + 1.12345), where
+// factor is (0.1,0.2) when the array id is odd and (0.2,0.3) when even.
+//
+void initData(LoopData::ComplexArray& ca)
+{
+   int id = ca.id;
+   Complex_type factor = ( id % 2 ? Complex_type(0.1,0.2) :
+                                    Complex_type(0.2,0.3) );
+   Complex_ptr data = ca.data;
+   Index_type totlen = ca.len;
+#if defined(LCALS_OMP_MEM_INIT)
+#pragma omp parallel for
+#endif
+   for (Index_type j = 0; j < totlen; ++j) {
+      data[j] = factor*(j + 1.1)/(j + 1.12345);
+   }
+}
+
+
+//
+// Routines to initialize loop check sum.
+// Resets the checksum slot for the given loop length to zero.
+//
+void initChksum(LoopStat& stat, LoopLength ilength)
+{
+   stat.loop_chksum[ilength] = 0.0;
+}
+
+//
+// Routines to update loop check sum.
+//
+// Real array version: adds the sum over j of (j+1)*data[j]*scale_factor to
+// the checksum slot for ilength, accumulating in long double.
+//
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  const LoopData::RealArray& ra,
+                  Real_type scale_factor)
+{
+   Real_ptr data = ra.data;
+   Index_type len = ra.len;
+   long double tchk = stat.loop_chksum[ilength];
+   for (Index_type j = 0; j < len; ++j) {
+      tchk += (j+1)*data[j]*scale_factor;
+   }
+   stat.loop_chksum[ilength] = tchk;
+}
+
+//
+// Scalar version: adds val to the checksum slot for ilength.
+//
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  Real_type val)
+{
+   stat.loop_chksum[ilength] += val;
+}
+
+//
+// Complex array version: adds the sum over j of
+// (j+1)*(real(data[j]) + imag(data[j]))*scale_factor.
+//
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  const LoopData::ComplexArray& ca,
+                  Real_type scale_factor)
+{
+   Complex_ptr data = ca.data;
+   Index_type len = ca.len;
+   long double tchk = stat.loop_chksum[ilength];
+   for (Index_type j = 0; j < len; ++j) {
+      tchk += (j+1)*(real(data[j])+imag(data[j]))*scale_factor;
+   }
+   stat.loop_chksum[ilength] = tchk;
+}
+
+} // closing brace for unnamed namespace
+
+
+
+//
+// Recursively construct directories for given path name.
+//
+// Works on a private copy of the path: separators are replaced by '\0'
+// from the right until stat() succeeds on the remaining prefix, then the
+// separators are restored from the left, calling mkdir (owner rwx only)
+// on each longer prefix. Returns false, after printing a message, when an
+// existing prefix is not a directory or when a mkdir call fails.
+//
+// NOTE(review): for an absolute path whose first component does not exist
+// (e.g. "/a"), the backward scan blanks the leading '/' and the routine
+// then calls mkdir on an empty string, which fails -- confirm whether such
+// paths need to be supported.
+//
+bool recursiveMkdir(const std::string& path)
+{
+   bool retval = true;
+
+   mode_t mode = (S_IRUSR | S_IWUSR | S_IXUSR);
+   const char separator = '/';
+
+   int length = static_cast<int>(path.length());
+   char* path_buf = new char[length + 1];
+   sprintf(path_buf, "%s", path.c_str());
+   struct stat status;
+   int pos = length - 1;
+
+   /* find part of path that has not yet been created */
+   while ((stat(path_buf, &status) != 0) && (pos >= 0)) {
+
+      /* slide backwards in string until next slash found */
+      bool slash_found = false;
+      while ((!slash_found) && (pos >= 0)) {
+         if (path_buf[pos] == separator) {
+            slash_found = true;
+            if (pos >= 0) path_buf[pos] = '\0';
+         } else pos--;
+      }
+   }
+
+   /*
+    * if there is a part of the path that already exists make sure
+    * it is really a directory
+    */
+   if (pos >= 0) {
+      if (!S_ISDIR(status.st_mode)) {
+         std::cout << "Cannot create directories in path = " << path
+                   << "\n because some intermediate item in path exists and"
+                   << " is NOT a directory" << std::endl;
+         retval = false;
+      }
+   }
+
+   /*
+    * make all directories that do not already exist
+    *
+    * if (pos < 0), then there is no part of the path that
+    * already exists. Need to make the first part of the
+    * path before sliding along path_buf.
+    */
+   if ( retval && pos < 0) {
+      if (mkdir(path_buf, mode) != 0) {
+         std::cout << " Cannot create directory = "
+                   << path_buf << std::endl;
+         retval = false;
+      }
+      pos = 0;
+   }
+
+   if ( retval ) {
+
+      /* make remaining directories */
+      do {
+
+         /* slide forward in string until next '\0' found */
+         bool null_found = false;
+         while ((!null_found) && (pos < length)) {
+            if (path_buf[pos] == '\0') {
+               null_found = true;
+               path_buf[pos] = separator;
+            }
+            pos++;
+         }
+
+         /* make directory if not at end of path */
+         if (pos < length) {
+            if (mkdir(path_buf, mode) != 0) {
+               std::cout << " Cannot create directory = "
+                         << path_buf << std::endl;
+               retval = false;
+            }
+         }
+      } while (pos < length && retval);
+
+   }
+
+   delete[] path_buf;
+
+   return retval;
+}
Index: MicroBenchmarks/LCALS/LCALSTraversalMethods.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSTraversalMethods.hxx
@@ -0,0 +1,455 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file containing LCALS traversal method templates used with
+// "forall" loop variants.
+//
+// Tag structs for traversal types are located in LCALSParams.hxx.
+//
+
+#ifndef LCALSTraversalMethods_HXX
+#define LCALSTraversalMethods_HXX
+
+#include "LCALSParams.hxx"
+
+#include <iostream>
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Traverse contiguous range of indices using sequential execution.
+ *
+ * Calls loop_body(ii) for ii = begin, begin+1, ... while ii < end (or
+ * stepping by stride in the strided overload). Both loops carry
+ * "#pragma novector".
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(seq_exec,
+            Index_type begin, Index_type end, LOOP_BODY loop_body)
+{
+#pragma novector
+   for ( Index_type ii = begin ; ii < end ; ++ii ) {
+      loop_body( ii );
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(seq_exec,
+            Index_type begin, Index_type end, Index_type stride,
+            LOOP_BODY loop_body)
+{
+#pragma novector
+   for ( Index_type ii = begin ; ii < end ; ii += stride ) {
+      loop_body( ii );
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Traverse contiguous range of indices using SIMD vectorization.
+ *        No assumption made on data alignment.
+ *
+ * Same traversal as the sequential overloads, without the novector pragma.
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(simd_exec,
+            Index_type begin, Index_type end, LOOP_BODY loop_body)
+{
+   for ( Index_type ii = begin ; ii < end ; ++ii ) {
+      loop_body( ii );
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(simd_exec,
+            Index_type begin, Index_type end, Index_type stride,
+            LOOP_BODY loop_body)
+{
+   for ( Index_type ii = begin ; ii < end ; ii += stride ) {
+      loop_body( ii );
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Traverse contiguous range of indices using OpenMP parallel for.
+ *
+ * Calls loop_body(ii) for each ii in [begin, end) (stepping by stride in
+ * the strided overload) inside an "omp parallel for" loop.
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_parallel_for_exec,
+            Index_type begin, Index_type end, LOOP_BODY loop_body)
+{
+//#pragma omp parallel for schedule(static)
+#pragma omp parallel for
+   for ( Index_type ii = begin ; ii < end ; ++ii ) {
+      loop_body( ii );
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_parallel_for_exec,
+            Index_type begin, Index_type end, Index_type stride,
+            LOOP_BODY loop_body)
+{
+//#pragma omp parallel for schedule(static)
+#pragma omp parallel for
+   for ( Index_type ii = begin ; ii < end ; ii += stride ) {
+      loop_body( ii );
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Traverse contiguous range of indices using OpenMP for with
+ *        nowait clause.
+ *
+ * NOTE(review): these use "omp for" without "parallel"; presumably they
+ * are meant to be called from inside an enclosing parallel region --
+ * confirm against the callers.
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_for_nowait_exec,
+            Index_type begin, Index_type end, LOOP_BODY loop_body)
+{
+//#pragma omp for nowait schedule(static)
+#pragma omp for nowait
+   for ( Index_type ii = begin ; ii < end ; ++ii ) {
+      loop_body( ii );
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_for_nowait_exec,
+            Index_type begin, Index_type end, Index_type stride,
+            LOOP_BODY loop_body)
+{
+//#pragma omp for nowait schedule(static)
+#pragma omp for nowait
+   for ( Index_type ii = begin ; ii < end ; ii += stride ) {
+      loop_body( ii );
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a contiguous range of indices.
+ *
+ *        Range is specified by begin and end values.
+ *        Traversal executes as:
+ *           for (i = m_begin; i < m_end; ++i) {
+ *              expression using i as array index.
+ *    }
+ *
+ ******************************************************************************
+ */
+class RangeIndexSet
+{
+public:
+
+   RangeIndexSet(Index_type begin, Index_type end)
+      : m_begin(begin), m_end(end) { }
+
+   Index_type getBegin() const { return m_begin; }
+   Index_type getEnd() const { return m_end; }
+
+   Index_type getLength() const { return m_end - m_begin; }
+
+   void print(std::ostream& os) const;
+
+private:
+   //
+   // Declared only; the default ctor is not implemented.
+   //
+   RangeIndexSet();
+
+   Index_type m_begin;   // first index of the range
+   Index_type m_end;     // exclusive upper bound (traversal tests i < m_end)
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a contiguous range of indices with stride.
+ *
+ * Range is specified by begin, end and stride values.
+ * Traversal executes as:
+ *    for (i = m_begin; i < m_end; i = i + m_stride) {
+ *       expression using i as array index.
+ *    }
+ *
+ ******************************************************************************
+ */
+class RangeStrideIndexSet
+{
+public:
+
+   RangeStrideIndexSet(Index_type begin, Index_type end, Index_type stride)
+      : m_begin(begin), m_end(end), m_stride(stride) { }
+
+   Index_type getBegin() const { return m_begin; }
+   Index_type getEnd() const { return m_end; }
+   Index_type getStride() const { return m_stride; }
+
+   Index_type getLength() const { return m_end - m_begin; }   // span; not divided by stride
+
+   void print(std::ostream& os) const;
+
+private:
+   //
+   // Declared only; the default ctor is not implemented.
+   //
+   RangeStrideIndexSet();
+
+   Index_type m_begin;    // first index of the range
+   Index_type m_end;      // exclusive upper bound (traversal tests i < m_end)
+   Index_type m_stride;   // increment applied between visited indices
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Traversal methods for index set objects passed as arguments.
+ *
+ ******************************************************************************
+ */
+/// RangeIndexSet object: forwards its begin/end bounds to the range overload.
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(EXEC_T exec,
+            const RangeIndexSet& is, LOOP_BODY loop_body)
+{
+   forall( exec,
+           is.getBegin(), is.getEnd(), loop_body );
+}
+
+/// RangeStrideIndexSet object: forwards begin, end and stride to the strided overload.
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(EXEC_T exec,
+            const RangeStrideIndexSet& is, LOOP_BODY loop_body)
+{
+   forall( exec,
+           is.getBegin(), is.getEnd(), is.getStride(), loop_body );
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a hybrid index set which is a collection
+ *        of index set objects defined above. Within a hybrid, the
+ *        individual index sets are referred to as segments.
+ *
+ * NOTE: This class is an abbreviated version of the actual RAJA class.
+ *
+ ******************************************************************************
+ */
+class HybridIndexSet
+{
+public:
+
+   ///
+   /// Enum describing types of segments in hybrid index set.
+   ///
+   enum SegmentType { _Range_, _RangeStride_, _Unknown_ };
+
+   ///
+   /// Class holding segment and segment type.
+   ///
+   class Segment
+   {
+   public:
+
+      Segment()
+         : m_type(_Unknown_), m_segment(0) { ; }
+
+      Segment(SegmentType type, const void* segment)
+         : m_type(type), m_segment(segment) { ; }
+
+      SegmentType m_type;      // tag telling which index set class m_segment points to
+      const void* m_segment;   // untyped pointer to the index set object
+
+   };
+
+   ///
+   /// Construct empty hybrid index set
+   ///
+   HybridIndexSet()
+      : m_len(0) { ; }
+
+   //
+   // Copy-constructor for hybrid index set (delegates to copySegments)
+   //
+   HybridIndexSet(const HybridIndexSet& other)
+      : m_len(0)
+   {
+      copySegments(other);
+   }
+
+   //
+   // Copy-assignment for hybrid index set (self-assignment is a no-op)
+   //
+   HybridIndexSet& operator=(const HybridIndexSet& rhs)
+   {
+      if (this != &rhs) {
+         copySegments(rhs);
+      }
+      return *this;
+   }
+
+   ///
+   /// Hybrid index set destructor destroys all index set segments.
+   ///
+   ~HybridIndexSet();
+
+   ///
+   /// Create copy of given RangeIndexSet and add to hybrid index set.
+   ///
+   void addIndexSet(const RangeIndexSet& index_set);
+
+   ///
+   /// Add contiguous range of indices to hybrid index set as a RangeIndexSet.
+   ///
+   void addRangeIndices(Index_type begin, Index_type end);
+
+   ///
+   /// Create copy of given RangeStrideIndexSet and add to hybrid index set.
+   ///
+   void addIndexSet(const RangeStrideIndexSet& index_set);
+
+   ///
+   /// Add contiguous range of indices with stride to hybrid index set
+   /// as a RangeStrideIndexSet.
+   ///
+   void addRangeStrideIndices(Index_type begin, Index_type end, Index_type stride);
+
+   ///
+   /// Return total length of hybrid index set; i.e., sum of lengths
+   /// over all segments.
+   ///
+   Index_type getLength() const { return m_len; }
+
+   ///
+   /// Return total number of segments in hybrid index set.
+   ///
+   int getNumSegments() const { return static_cast<int>(m_segments.size()); }
+
+   ///
+   /// Return pointer to the first of getNumSegments() segments (null if there are none).
+   ///
+   const Segment* getSegments() const { return m_segments.empty() ? 0 : &m_segments[0]; }
+
+private:
+   //
+   // Copy segments (deep copy) from given HybridIndexSet object.
+   //
+   void copySegments(const HybridIndexSet& other);
+
+   Index_type m_len;                  // total returned by getLength()
+   std::vector<Segment> m_segments;   // one entry per added index set
+
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Iterate over segments sequentially, and use exec policy
+ *        specified by template parameter for individual segments.
+ *
+ ******************************************************************************
+ */
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(EXEC_T exec,
+            const HybridIndexSet& is, LOOP_BODY loop_body)
+{
+   const int num_seg = is.getNumSegments();
+   const HybridIndexSet::Segment* seg = is.getSegments();
+   for ( int isi = 0; isi < num_seg; ++isi ) {
+
+      switch ( seg[isi].m_type ) {   // the tag selects the type to cast m_segment back to
+
+         case HybridIndexSet::_Range_ : {
+            forall(exec,
+                   *(static_cast<const RangeIndexSet*>(seg[isi].m_segment)),
+                   loop_body
+            );
+            break;
+         }
+
+         case HybridIndexSet::_RangeStride_ : {
+            forall(exec,
+                   *(static_cast<const RangeStrideIndexSet*>(seg[isi].m_segment)),
+                   loop_body
+            );
+            break;
+         }
+
+         default : {   // any other segment type is skipped without a diagnostic
+         }
+
+      } // switch on segment type
+
+   } // iterate over segments of hybrid index set
+}
+
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Generic methods with exec policy specified by template
+ *        parameter.
+ *
+ ******************************************************************************
+ */
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(Index_type begin, Index_type end, LOOP_BODY loop_body)
+{
+   forall( EXEC_T(), begin, end, loop_body );   // default-constructed policy object picks the overload
+}
+
+/// with stride: same dispatch, forwarding the stride as well
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(Index_type begin, Index_type end, Index_type stride,
+            LOOP_BODY loop_body)
+{
+   forall( EXEC_T(), begin, end, stride, loop_body );
+}
+
+/// passing index set object: same dispatch for any index set type with a forall(exec, is, body) overload
+template <typename EXEC_T, typename INDEXSET_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(const INDEXSET_T& is, LOOP_BODY loop_body)
+{
+   forall(EXEC_T(), is, loop_body);
+}
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/LCALSTraversalMethods.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSTraversalMethods.cxx
@@ -0,0 +1,137 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS traversal method implementations
+// used in forall-hybrid loop variants.
+//
+
+#include "LCALSTraversalMethods.hxx"
+
+#include <iostream>
+
+
+/*
+*************************************************************************
+*
+* HybridIndexSet class dtor.  Frees the index set object owned by each
+* segment: the 'const void*' handle is cast back to the concrete type
+* named by the segment's tag and deleted (deleting through a pointer to
+* const is legal, and deleting a null pointer does nothing).
+*
+*************************************************************************
+*/
+HybridIndexSet::~HybridIndexSet()
+{
+   const int num_segs = m_segments.size();
+   for ( int isi = 0; isi < num_segs; ++isi ) {
+      const Segment& seg = m_segments[isi];
+
+      switch ( seg.m_type ) {
+
+         case _Range_ : {
+            delete static_cast<const RangeIndexSet*>(seg.m_segment);
+            break;
+         }
+
+         case _RangeStride_ : {
+            delete static_cast<const RangeStrideIndexSet*>(seg.m_segment);
+            break;
+         }
+
+         default : {
+            std::cout << "\t HybridIndexSet dtor: case not implemented!!\n";
+         }
+
+      } // switch on segment type
+   } // iterate over segments of hybrid index set
+}
+
+
+/*
+*************************************************************************
+*
+* Private helper function to copy hybrid index set segments.
+* Replaces the contents of this object with a deep copy of 'other':
+* segments held so far are released, then each segment of 'other' is
+* re-added through addIndexSet(), which allocates a fresh index set and
+* updates m_len.  'other' must not be this object (operator= checks).
+*
+*************************************************************************
+*/
+void HybridIndexSet::copySegments(const HybridIndexSet& other)
+{
+   // Hand the current segments to a temporary; its destructor frees them.
+   HybridIndexSet stale;
+   stale.m_segments.swap(m_segments);
+   m_len = 0;
+
+   const int num_segs = other.m_segments.size();
+   for ( int isi = 0; isi < num_segs; ++isi ) {
+      const Segment& seg = other.m_segments[isi];
+
+      switch ( seg.m_type ) {
+
+         case _Range_ : {
+            addIndexSet(*static_cast<const RangeIndexSet*>(seg.m_segment));
+            break;
+         }
+
+         case _RangeStride_ : {
+            addIndexSet(*static_cast<const RangeStrideIndexSet*>(seg.m_segment));
+            break;
+         }
+
+         default : {
+            std::cout << "\t HybridIndexSet copySegments: case not implemented!!\n";
+         }
+
+      } // switch on segment type
+   } // iterate over segments of hybrid index set
+}
+
+
+/*
+*************************************************************************
+*
+* Methods to add indices to hybrid index set.
+*
+*************************************************************************
+*/
+
+void HybridIndexSet::addIndexSet(const RangeIndexSet& index_set)
+{
+   // Keep a heap copy of the caller's range as a new _Range_ segment.
+   const RangeIndexSet* const copy = new RangeIndexSet(index_set);
+   m_segments.push_back( Segment( _Range_, copy ) );
+
+   m_len += copy->getLength();
+}
+
+void HybridIndexSet::addRangeIndices(Index_type begin, Index_type end)
+{
+   // Same bookkeeping as addIndexSet(const RangeIndexSet&): one new
+   // _Range_ segment and m_len grows by end - begin, so delegate to it.
+
+   addIndexSet( RangeIndexSet(begin, end) );
+}
+
+void HybridIndexSet::addIndexSet(const RangeStrideIndexSet& index_set)
+{
+   const RangeStrideIndexSet* const copy = new RangeStrideIndexSet(index_set);
+   m_segments.push_back( Segment( _RangeStride_, copy ) );
+
+   // Integer division: adds floor((end - begin) / stride) to m_len.  NOTE(review): the
+   // traversal visits one more index when stride does not divide the span; confirm intent.
+   m_len += copy->getLength() / copy->getStride();
+}
+
+void HybridIndexSet::addRangeStrideIndices(Index_type begin, Index_type end,
+                                           Index_type stride)
+{
+   // Same bookkeeping as addIndexSet(const RangeStrideIndexSet&): one new
+   // _RangeStride_ segment and the same m_len update, so delegate to it.
+
+   addIndexSet( RangeStrideIndexSet(begin, end, stride) );
+}
Index: MicroBenchmarks/LCALS/README-LCALS_instructions.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/README-LCALS_instructions.txt
@@ -0,0 +1,312 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+================================================================================
+================================================================================
+LCALS: Livermore Compiler Analysis Loop Suite
+ by Rich Hornung (hornung1@llnl.gov),
+ Center for Applied Scientific Computing,
+ Lawrence Livermore National Laboratory
+================================================================================
+================================================================================
+
+ o This code is under
continuing development. Go to http://codesign.llnl.gov + to acquire the latest released version. + + o This loop suite is designed to measure performance for a variety of loops + using different compilers and platforms. In particular, the suite + helps to understand compiler optimization, run-time performance issues, + and platform capabilities. The suite is also useful as a source of + example code snippets for interactions with compiler developers. + + o The loops in the suite are partitioned into three subsets based on their + origins (and also to avoid having them all in a single source file). Each + loop is implemented using multiple software constructs (i.e., referred + to herein as "variants"). The three loop subsets are: + + - Subset A: Loops representative of those found in application codes. + They are implemented in source files named runALoops.cxx. + + - Subset B: Basic loops that help to illustrate compiler optimization + issues. They are implemented in source files named runBLoops.cxx + + - Subset C: Loops extracted from "Livermore Loops coded in C" developed by + Steve Langer, which were derived from the Fortran version by Frank + McMahon. They are implemented in source files runCLoops.cxx + + Please see the contents of the loop source files to understand the + differences among the variants. + + o New loops may be added to the suite by inserting them into appropriate + loop source files and modifying a few other files that control suite + execution and parametrization. Details are provided below. + + o Various parameters can be adjusted to control how loops are defined and run. + + -- Each loop may be run with different loop lengths (currently up to three + lengths for each loop) and will be sampled some number of times to + generate execution timing data. Loop length and sampling parameters + may be modified to evaluate different platform performance + characteristics. Details are provided below. 
+ + o Various run time statistics can be generated for analysis. Currently, + these include: min run time, max run time, average run time, + standard deviation across run times, and average execution time relative + to a reference loop variant. Here, run time is the time required to + execute the loop for one "sampling" pass through the suite. See below. + + +-------------------------------------------------------------------------------- +Loop kernels and variants: + + o Each loop in the suite is defined by its traditional C/C++ for-loop + "kernel". Then, each loop appears in multiple variants that use different + programming and execution constructs. + + o Loops that employ traditional C/C++ for-loop syntax are referred to as + "Raw" variants. The "Raw" variant of each loop represents the version + obtained from its original source, plus minor modifications necessary + to plug into the loop suite framework. For example, the loops in the + runCRawLoops.cxx file are essentially verbatim from the "Livermore Loops + Coded in C" suite mentioned above. Typically, the "Raw" loops serve as + reference implementation for runtime comparisons. + + o Other variants use loop traversal C++ template methods and represent the + loop body as a lambda function or functor class. One of the main goals + of the suite is to assess how SIMD vectorization, OpenMP multithreading, + etc. work with these different loop implementation choices. + + Note that only a subset of the loops in the suite appear in the OpenMP + variants since many of the loops do not benefit from thread parallelism + due to OpenMP overheads. OpenMP loops are implemented in source files + named runOMPLoops.cxx; in particular, they are not broken out + into separate source files based on the subsets described above. + + o Although all loop bodies contain only C-syntax, the loop framework + uses C++ classes and templates. So a C++ compiler is required to compile + the code.
All C++ compilers should be able to compile the framework + code and "Raw" loop variants. + + o Not all compilers implement the OpenMP standard. Thus, those loop variants + may not be compiled and run depending on the compiler being used. + + o The intent of the C++ lambda and functor loop variants is to evaluate + compilers in the context of C++ abstraction layers using template methods. + Not all compilers support standard C++ lambda expressions at this time. + Thus, the lambda variants of the loops may not be compiled and run + depending on the compiler being used. + + +******************** Test Suite Note *********************** +* * +* Below are the original build instructions; the * +* test suite replaces this build system with the * +* llvm test-suite CMake system. The control of * +* loop suite and timing has been altered to use * +* the google benchmark library included in the * +* MicroBenchmarks directory of the llvm test-suite. * +* * +************************************************************ +-------------------------------------------------------------------------------- +Compiling and running the loop suite: + + The loop suite is typically compiled by typing 'make' and then executed as + + ./lcals.exe + + o The executable generated by the Makefile accepts an optional argument + which is the name of a directory for placing output files that contain + detailed timing, checksum, and FOM (when specified) results. Some of + these files provide a summary of loop suite performance. Others + contain subsets of this information in comma-delimited text files that may + be imported into Microsoft Excel to generate spreadsheets and plots. + When no output directory is given, a summary of the results is printed + to standard output. + + o LCALS is highly parametrized to explore many compilation and execution + options.
Exercising the full range of options can be achieved by making + straightforward modifications in a few files, as described below: + + -- Makefile: This file contains a simple build system for the code. + It has a variety of configurations for current LLNL + computing systems. Building for other platforms or changing + any compiler options can be done by modifying this file. + + -- LCALS_rules.mk: This file contains "-D" compilation options that + control some aspects of LCALS parametrization. The effect of + these options is described in the comments in this file. + It is also helpful to see how they are used in the + LCALSParams.hxx file. + + -- main.cxx: The main program determines many of the LCALS execution + options, such as which loops are run (kernels and variants). + + -- LCALSSuite.cxx: The routine defineLoopSuiteRunInfo() in this file + defines loop lengths and sampling parameters for each loop + in the suite. It also defines loop weights used in Figure + of Merit (FOM) calculations. + + -- LCALSSuite.hxx: This file contains '#define' preprocessor directives + that can be used to turn on/off compilation of individual + loop kernels and loop variants in the suite. This can be + helpful for generating assembly code in small doses. + + o Details on many of these items are given in the next section. + + +-------------------------------------------------------------------------------- +Controlling loop suite execution and timing output: + + o The execution of the loop suite follows the pattern described here: + + Iterate over specified number of passes through the loop suite { + + Iterate over specified loop variants to run { + + Iterate over loop lengths to run (e.g., long, medium, short) { + + Iterate over each loop specified to run { + + TIMER_START() + Iterate over specified number of samples (for loop and length) { + + Execute loop variant and length.
+ + } + TIMER_STOP() + + } // end iteration over loops to run + + } // end iteration over loop lengths + + } // end iteration over loop variants + + } // end iteration over suite passes + + o The loop suite is parametrized so that its execution may be controlled + by editing various items in a small number of source and header files + as described below: + + -- Set number of passes through the suite by setting the variable + 'num_suite_passes' in main.cxx. + + -- Set loop variants to run by adding the corresponding enumeration + constants to the vector 'run_variants' in main.cxx. To prevent a + variant from running, simply comment out the line which adds the + corresponding enum value to the vector. + + NOTE: The first entry added to this vector indicates the reference variant + for relative execution time statistics. + + NOTE: An additional argument may be given to the executable to run + loops outside of the standard LCALS benchmark. This requires + that "BUILD_MISC" is defined in the Makefile. + + -- Set which loop lengths to run by setting the appropriate entry in + the array 'run_loop_length' in main.cxx (true/false for each length). + + -- Set which loop kernels will run by setting entries in the array + 'run_loop' in main.cxx (true/false for each loop). + + -- The lengths and number of samples per pass for each loop are set + in the routine defineLoopSuiteRunInfo() in LCALSSuite.cxx. + + NOTE: The "samples per pass" values for each loop were determined + manually to give approximately 1 second of execution time for its + serial raw variant on an Intel E5-2670 node. To reduce or increase the + total suite execution time, or change the loop lengths used, change + the 'sample_frac' and/or 'loop_length_factor' variables in + main.cxx. All default loop lengths will be multiplied by the + loop_length_factor value. The sample count for each loop will be + multiplied by sample_frac/loop_length_factor.
+ + -- The "LoopKernelID" and "LoopLength" enumeration types in the file + LCALSSuite.hxx are used to identify loops and loop lengths + in the suite. Macros are also provided in that file to conditionally + compile each loop in the suite. + + The way in which the loops are compiled can influence execution times. + For example, some compilers perform optimizations for loops compiled + individually that they do not perform when the same loop is compiled as + part of a larger suite. + + o All loop forms use the same data arrays, which are pre-allocated based + on the loop lengths. To help with SIMD vectorization and ensure correctness, + data arrays are allocated to be aligned with SIMD vector unit boundaries. + This can be changed by setting the 'LCALS_DATA_ALIGN' constant in the + file LCALSParams.hxx. + + o To minimize the effects of execution of each loop on the others, + data caches are flushed before each loop is run. + + -- Data cache size is set for some LLNL platforms based on hostname. + If unknown, a warning message will appear when loop suite is run. + Please edit main.cxx to set the largest data cache size for other + platforms. + + o A simple checksum mechanism is provided to verify that different variants + of each loop, and implementation changes made to individual loops, generate + the same numerical results. "-D" compiler options are provided in the + LCALS_rules.mk file to control this behavior. Note that certain levels + and types of compiler optimization will cause slight differences in + checksums due to changes in operation order, for example. Thus, the + checksums may only be a qualitative indicator of correct execution. + + -- Note that the routines loopInit() and loopFinalize() in LCALSSuite.cxx + initialize data and compute result checksums for each loop. These + must remain consistent with the data used in each loop for correctness. + + + o There are two mechanisms available to generate execution timing data for + loops in the suite.
The choice is made by defining/undefining the + associated "-D" option in the LCALS_rules.mk file. See that file for + more information. + + +-------------------------------------------------------------------------------- +Figures of Merit: + + o The program output includes a Figure of Merit (FOM) value for each loop + variant and loop length that is executed. The intent of the FOM is to + complement execution timing data with another measure of performance and + compiler optimization. Using the FOM values and total loop suite execution + time information in the Figure of Merit report, one can compare different + compilers' abilities to optimize on a given platform, performance of + different optimization levels for a given compiler, or potential performance + of different architectures, etc. + + o In the FOM calculation, execution time for each loop is weighted by a + factor defined in the loop setup routines. The loops are partitioned into + six classes depending on their structure; e.g., data-parallel, order- + dependent, etc. The weight for each loop class indicates its relative + importance based on code constructs we want the suite to emphasize + and how easy we think it should be for a compiler to optimize. Each loop + in the suite is given a weight, w_i (i is the loop id), based on which + class it exists in. Loop classes and weights are defined in the file + LCALSSuite.cxx. + + o The FOM is calculated as follows. + + - Relative FOM (FOM_rel). The aim of the FOM_rel value is to measure + a compiler's ability to optimize different loop constructs. + + -- When the code is executed, a reference loop execution time, t_ref, is + computed using a loop that any compiler should be able to optimize + well and which should run faster than any loop in the suite. + To help insure this, two simple loops are run, an element-wise vector + product and a vector dot product. Then, t_ref is the minimum execution + time between the two. 
+ + -- After the suite is run, FOM_rel is calulated as: + + FOM_rel = W * t_ref / Sum_i [ w_i * t_i ] + + The denominator is a weighted sum of execution times for the loops + that were run; t_i is the run time for loop i. W = Sum_i ( w_i ) is + the sum of loop weights. + + -- Note that FOM_rel is a dimensionless quantity that satisfies + 0 <= FOM_rel <= 1, and FOM_rel increases as loop execution times + decrease. In the ideal case, where each loop executes as fast as the + reference loop (which should be impossible), t_i = t_ref for each i. + So FOM_rel = 1. Index: MicroBenchmarks/LCALS/README-LCALS_license.txt =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/README-LCALS_license.txt @@ -0,0 +1,170 @@ +******************************************************************************* +LCALS: Livermore Compiler Analysis Loop Suite, version 1.0 + by Rich Hornung, Center for Applied Scientific Computing, + Lawrence Livermore National Laboratory + +Unclassified/Unlimited Distribution +LLNL-CODE-638939 +OCEC-13-189 + +** NOTE: This code was originally released under the name LLoops21. + The content is essentially unchanged under the new name. + +******************************************************************************* + +This code was developed and is maintained by Lawrence Livermore +National Laboratory (LLNL). It is intended to be shared widely with the +HPC community (including other laboratories, universities, and industrial +partners) as part of ASC and DOE exascale co-design efforts. + +o The software is unrestricted in its distribution. + +o LLNL retains copyright (see Copyright statement below) + +o If the code and/or results generated from it are used in a publication, + please cite LCALS as follows: + + @misc{LCALScode, + author = {Richard D. 
Hornung}, + title = {{LCALS}, version 1.0}, + howpublished = {\texttt{https://codesign.llnl.gov/LCALS.php}}, + note = {{LLNL}-{CODE}-638939}, + year = {2013} + } + +o Please direct improvements, additions, comments, suggestions, etc. to + proxyapp-info@llnl.gov or hornung1@llnl.gov + +o This README-LCALS_license.txt file must be included in any redistribution of + the software (either partial or in its entiretly) as well as any of its + derivatives. + +******************************************************************************* +******************************************************************************* + +This work was produced at Lawrence Livermore National Laboratory (LLNL) under +contract no. DE-AC52-07NA27344 (Contract 44) between the U.S. Department of +Energy (DOE) and Lawrence Livermore National Security, LLC (LLNS) for the +operation of LLNL. Copyright is reserved to Lawrence Livermore National +Security, LLC for purposes of controlled dissemination, commercialization +through formal licensing, or other disposition under terms of Contract 44; DOE +policies, regulations and orders; and U.S. statutes. The rights of the Federal +Government are reserved under Contract 44. + +******************************************************************************* +******************************************************************************* + + DISCLAIMER + +This work was prepared as an account of work sponsored by an agency of the +United States Government. Neither the United States Government nor Lawrence +Livermore National Security, LLC nor any of their employees, makes any warranty, +express or implied, or assumes any liability or responsibility for the accuracy, +completeness, or usefulness of any information, apparatus, product, or process +disclosed, or represents that its use would not infringe privately-owned rights. 
+Reference herein to any specific commercial products, process, or service by +trade name, trademark, manufacturer or otherwise does not necessarily constitute +or imply its endorsement, recommendation, or favoring by the United States +Government or Lawrence Livermore National Security, LLC. The views and opinions +of authors expressed herein do not necessarily state or reflect those of the +United States Government or Lawrence Livermore National Security, LLC, and shall +not be used for advertising or product endorsement purposes. + +******************************************************************************* +******************************************************************************* + + NOTIFICATION OF COMMERCIAL USE + +Commercialization of this product is prohibited without notifying the +Department of Energy (DOE) or Lawrence Livermore National Laboratory (LLNL). + + + +******************************************************************************* +******************************************************************************* + +// +// The following is the original copyright statement from Steve Langer's +// Livermore Loops coded in C. +// +// NOTE: Fonzi's Law (mentioned below) is actually called +// Flon's Law (just Google it). +// + +/* + *********************************************************************** + * + * Livermore Loops coded in C Latest File Modification 27 Jul 90 + * + * NOTE NOTE NOTE: Modified for use in the pure ANSI C version + * of the LFK test program by Steven H. Langer. + * Changes include calling sequence from Fortran to C and + * minor changes in COMMON block arguments. + * Split into separate header and source code files for convenience + * in converting the main program to C. + * Feb. 14, 1995. + * + * Copyright (c) 1995. The Regents of the University of California. + * All rights reserved. + * + * + * SUBROUTINE KERNEL( TK) replaces the Fortran routine in LFK Test program. 
+ ************************************************************************ + * * + * KERNEL executes 24 samples of "C" numerical computation * + * * + * TK(1) - total cpu time to execute only the 24 kernels.* + * TK(2) - total Flops executed by the 24 Kernels * + * * + * Link this C module with the rest of LFK Test compiled with Fortran * + * using a version of the LFK Test dated April 1990 or later. * + ************************************************************************ + * * + * L. L. N. L. " C " K E R N E L S T E S T: M F L O P S * + * * + * These kernels measure " C " numerical computation * + * rates for a spectrum of cpu-limited computational * + * structures or benchmarks. Mathematical through-put * + * is measured in units of millions of floating-point * + * operations executed per second, called Megaflops/sec. * + * * + * Fonzi's Law: There is not now and there never will be a language * + * in which it is the least bit difficult to write * + * bad programs. * + * * + *Originally from Greg Astfalk, AT&T, P.O.Box 900, Princeton, NJ. 08540* + *by way of Frank McMahon, LLNL, PO Box 808, Livermore, CA, 94550. 1986 * + * * + * Changes made to correct many array subscripting problems, * + * make more readable (added #define's), include the original * + * FORTRAN versions of the runs as comments, and make more * + * portable by Kelly O'Hair (LLNL) and Chuck Rasbold (LLNL) * + * and by Mark Seager (LLNL). * + * * + * please send copy of sdtout to: MCMAHON3@LLNL.GOV * + * or: mcmahon@lll-crg.llnl.gov * + * * + ************************************************************************ + * * + * REFERENCE * + * * + * F.H.McMahon, The Livermore Fortran Kernels: * + * A Computer Test Of The Numerical Performance Range, * + * Lawrence Livermore National Laboratory, * + * Livermore, California, UCRL-53745, December 1986. * + * * + * from: National Technical Information Service * + * U.S. Department of Commerce * + * 5285 Port Royal Road * + * Springfield, VA. 
22161 * + * * + * * + * (C) Copyright 1986 the Regents of the * + * University of California. All Rights Reserved. * + * * + * This work was produced under the sponsorship of * + * the U.S. Department of Energy. The Government * + * retains certain rights therein. * + * * + ************************************************************************ + */ Index: MicroBenchmarks/LCALS/README-LCALS_llvm-test-suite.txt =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/README-LCALS_llvm-test-suite.txt @@ -0,0 +1,21 @@ +##################### llvm test suite notes ######################## + +The following changes were made to the source to add LCALS to the +llvm test suite using the google benchmark library. + +Reporting and the built-in control of the suite were macro'd out to +allow the benchmark library to control run and timing information. +The loop data initialization and cache flushing are maintained, but +the checksum information cannot be used in the test suite as it +can show slight differences due to compiler optimizations. +The "Raw" and "ForeachLambda" versions of the loops have been +rewritten to be used by the google benchmark library, while +the files containing the other versions have not been included at this +time. + +See the original README-LCALS_license.txt for copyright information. + +See the original README-LCALS_instructions.txt for information about +the suite.
+ +#################################################################### Index: MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeLists.txt =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeLists.txt @@ -0,0 +1,4 @@ +list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG) +llvm_test_run() +llvm_test_executable(lcalsALambda ../main.cxx LambdaSubsetAbenchmarks.cxx ../LCALSStats.cxx ../LCALSSuite.cxx ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx) +target_link_libraries(lcalsALambda benchmark) Index: MicroBenchmarks/LCALS/SubsetALambdaLoops/LambdaSubsetAbenchmarks.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetALambdaLoops/LambdaSubsetAbenchmarks.cxx @@ -0,0 +1,467 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file containing LCALS "A" subset forall lambda loops using +// the google benchmark library. 
+// + +#include <benchmark/benchmark.h> +#include "../LCALSSuite.hxx" +#include "../SubsetDataA.hxx" +#include "../LCALSTraversalMethods.hxx" + +static void BM_PRESSURE_CALC_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(PRESSURE_CALC); + + Real_ptr compression = loop_data.array_1D_Real[0]; + Real_ptr bvc = loop_data.array_1D_Real[1]; + Real_ptr p_new = loop_data.array_1D_Real[2]; + Real_ptr e_old = loop_data.array_1D_Real[3]; + Real_ptr vnewc = loop_data.array_1D_Real[4]; + + const Real_type cls = loop_data.scalar_Real[0]; + const Real_type p_cut = loop_data.scalar_Real[1]; + const Real_type pmin = loop_data.scalar_Real[2]; + const Real_type eosvmax = loop_data.scalar_Real[3]; + + for( auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type i) { + bvc[i] = cls * (compression[i] + 1.0); + } ); + + forall(0, state.range(0), + [&] (Index_type i) { + p_new[i] = bvc[i] * e_old[i] ; + + if ( fabs(p_new[i]) < p_cut ) p_new[i] = 0.0 ; + + if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; + + if ( p_new[i] < pmin ) p_new[i] = pmin ; + } ); + + } +} + +BENCHMARK(BM_PRESSURE_CALC_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_ENERGY_CALC_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(ENERGY_CALC); + + Real_ptr e_new = loop_data.array_1D_Real[0]; + Real_ptr e_old = loop_data.array_1D_Real[1]; + Real_ptr delvc = loop_data.array_1D_Real[2]; + Real_ptr p_new = loop_data.array_1D_Real[3]; + Real_ptr p_old = loop_data.array_1D_Real[4]; + Real_ptr q_new = loop_data.array_1D_Real[5]; + Real_ptr q_old = loop_data.array_1D_Real[6]; + Real_ptr work = loop_data.array_1D_Real[7]; + Real_ptr compHalfStep = loop_data.array_1D_Real[8]; + Real_ptr pHalfStep = loop_data.array_1D_Real[9]; + Real_ptr bvc = loop_data.array_1D_Real[10]; + Real_ptr pbvc = loop_data.array_1D_Real[11]; + Real_ptr ql_old = loop_data.array_1D_Real[12]; + Real_ptr qq_old = loop_data.array_1D_Real[13]; +
Real_ptr vnewc = loop_data.array_1D_Real[14]; + + const Real_type rho0 = loop_data.scalar_Real[0]; + const Real_type e_cut = loop_data.scalar_Real[1]; + const Real_type emin = loop_data.scalar_Real[2]; + const Real_type q_cut = loop_data.scalar_Real[3]; + + for( auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type i) { + e_new[i] = e_old[i] - 0.5 * delvc[i] * + (p_old[i] + q_old[i]) + 0.5 * work[i]; + } ); + + forall(0, state.range(0), + [&] (Index_type i) { + if ( delvc[i] > 0.0 ) { + q_new[i] = 0.0 ; + } + else { + Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ; + Real_type ssc = ( pbvc[i] * e_new[i] + + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ; + + if ( ssc <= 0.1111111e-36 ) { + ssc = 0.3333333e-18 ; + } else { + ssc = sqrt(ssc) ; + } + + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; + } + } ); + + forall(0, state.range(0), + [&] (Index_type i) { + e_new[i] = e_new[i] + 0.5 * delvc[i] + * ( 3.0*(p_old[i] + q_old[i]) + - 4.0*(pHalfStep[i] + q_new[i])) ; + } ); + + forall(0, state.range(0), + [&] (Index_type i) { + e_new[i] += 0.5 * work[i]; + + if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0 ; } + + if ( e_new[i] < emin ) { e_new[i] = emin ; } + } ); + + forall(0, state.range(0), + [&] (Index_type i) { + Real_type q_tilde ; + + if (delvc[i] > 0.0) { + q_tilde = 0. 
; + } + else { + Real_type ssc = ( pbvc[i] * e_new[i] + + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; + + if ( ssc <= 0.1111111e-36 ) { + ssc = 0.3333333e-18 ; + } else { + ssc = sqrt(ssc) ; + } + + q_tilde = (ssc*ql_old[i] + qq_old[i]) ; + } + + e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i]) + - 8.0*(pHalfStep[i] + q_new[i]) + + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ; + + if ( fabs(e_new[i]) < e_cut ) { + e_new[i] = 0.0 ; + } + if ( e_new[i] < emin ) { + e_new[i] = emin ; + } + } ); + + forall(0, state.range(0), + [&] (Index_type i) { + if ( delvc[i] <= 0.0 ) { + Real_type ssc = ( pbvc[i] * e_new[i] + + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; + + if ( ssc <= 0.1111111e-36 ) { + ssc = 0.3333333e-18 ; + } else { + ssc = sqrt(ssc) ; + } + + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; + + if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ; + } + } ); + + } +} + +BENCHMARK(BM_ENERGY_CALC_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_VOL3D_CALC_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(VOL3D_CALC); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + Real_ptr vol = loop_data.array_1D_Real[3]; + + ADomain domain(state.range(0), /* ndims = */ 3); + + UnalignedReal_ptr x0,x1,x2,x3,x4,x5,x6,x7 ; + UnalignedReal_ptr y0,y1,y2,y3,y4,y5,y6,y7 ; + UnalignedReal_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; + + NDPTRSET(x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + const Real_type vnormq = 0.083333333333333333; /* vnormq = 1/12 */ + + for (auto _ : state) { + + forall(domain.fpz, domain.lpz + 1, + [&] (Index_type i) { + + Real_type x71 = x7[i] - x1[i] ; + Real_type x72 = x7[i] - x2[i] ; + Real_type x74 = x7[i] - x4[i] ; + Real_type x30 = x3[i] - x0[i] ; + Real_type x50 = x5[i] - x0[i] ; + Real_type x60 = x6[i] - x0[i] ; + + Real_type y71 = y7[i] - 
y1[i] ; + Real_type y72 = y7[i] - y2[i] ; + Real_type y74 = y7[i] - y4[i] ; + Real_type y30 = y3[i] - y0[i] ; + Real_type y50 = y5[i] - y0[i] ; + Real_type y60 = y6[i] - y0[i] ; + + Real_type z71 = z7[i] - z1[i] ; + Real_type z72 = z7[i] - z2[i] ; + Real_type z74 = z7[i] - z4[i] ; + Real_type z30 = z3[i] - z0[i] ; + Real_type z50 = z5[i] - z0[i] ; + Real_type z60 = z6[i] - z0[i] ; + + Real_type xps = x71 + x60 ; + Real_type yps = y71 + y60 ; + Real_type zps = z71 + z60 ; + + Real_type cyz = y72 * z30 - z72 * y30 ; + Real_type czx = z72 * x30 - x72 * z30 ; + Real_type cxy = x72 * y30 - y72 * x30 ; + vol[i] = xps * cyz + yps * czx + zps * cxy ; + + xps = x72 + x50 ; + yps = y72 + y50 ; + zps = z72 + z50 ; + + cyz = y74 * z60 - z74 * y60 ; + czx = z74 * x60 - x74 * z60 ; + cxy = x74 * y60 - y74 * x60 ; + vol[i] += xps * cyz + yps * czx + zps * cxy ; + + xps = x74 + x30 ; + yps = y74 + y30 ; + zps = z74 + z30 ; + + cyz = y71 * z50 - z71 * y50 ; + czx = z71 * x50 - x71 * z50 ; + cxy = x71 * y50 - y71 * x50 ; + vol[i] += xps * cyz + yps * czx + zps * cxy ; + + vol[i] *= vnormq ; + } ); + + } +} + +BENCHMARK(BM_VOL3D_CALC_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)-> + Arg(LONG)->Unit(benchmark::kMicrosecond); + +static void BM_DEL_DOT_VEC_2D_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(DEL_DOT_VEC_2D); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr xdot = loop_data.array_1D_Real[2]; + Real_ptr ydot = loop_data.array_1D_Real[3]; + Real_ptr div = loop_data.array_1D_Real[4]; + + ADomain domain(state.range(0), /* ndims = */ 2); + + UnalignedReal_ptr x1,x2,x3,x4 ; + UnalignedReal_ptr y1,y2,y3,y4 ; + UnalignedReal_ptr fx1,fx2,fx3,fx4 ; + UnalignedReal_ptr fy1,fy2,fy3,fy4 ; + + NDSET2D(x,x1,x2,x3,x4) ; + NDSET2D(y,y1,y2,y3,y4) ; + NDSET2D(xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(ydot,fy1,fy2,fy3,fy4) ; + + const Real_type ptiny = 1.0e-20; + const Real_type half = 0.5; + + for ( auto _ : state ) { + + 
forall(0, domain.n_real_zones, + [&] (Index_type ii) { + + Index_type i = domain.real_zones[ii] ; + + Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; + Real_type xj = half * ( x2[i] + x3[i] - x4[i] - x1[i] ) ; + + Real_type yi = half * ( y1[i] + y2[i] - y3[i] - y4[i] ) ; + Real_type yj = half * ( y2[i] + y3[i] - y4[i] - y1[i] ) ; + + Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ; + Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ; + + Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ; + Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ; + + Real_type rarea = 1.0 / ( xi * yj - xj * yi + ptiny ) ; + + Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; + + Real_type dfydy = rarea * ( fyj * xi - fyi * xj ) ; + + Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) / + ( y1[i] + y2[i] + y3[i] + y4[i] ) ; + + div[i] = dfxdx + dfydy + affine ; + } ); + + } +} + +BENCHMARK(BM_DEL_DOT_VEC_2D_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)-> + Arg(LONG)->Unit(benchmark::kMicrosecond); + +static void BM_COUPLE_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(COUPLE); + + Complex_ptr t0 = loop_data.array_1D_Complex[0]; + Complex_ptr t1 = loop_data.array_1D_Complex[1]; + Complex_ptr t2 = loop_data.array_1D_Complex[2]; + Complex_ptr denac = loop_data.array_1D_Complex[3]; + Complex_ptr denlw = loop_data.array_1D_Complex[4]; + + + ADomain domain(state.range(0), /* ndims = */ 3); + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + Index_type kmin = domain.kmin; + Index_type kmax = domain.kmax; + + const Real_type clight=3.e+10; + const Real_type csound=3.09e+7; + const Real_type omega0= 0.9; + const Real_type omegar= 0.9; + const Real_type dt= 0.208; + const Real_type c10 = 0.25 * (clight / csound); + const Real_type fratio = sqrt(omegar / omega0); + const Real_type r_fratio = 1.0/fratio; + const Real_type 
c20 = 0.25 * (clight / csound) * r_fratio; + const Complex_type ireal(0.0, 1.0); + + for ( auto _ : state ) { + + forall(kmin, kmax, + [&] (Index_type k) { + + for (Index_type j = jmin; j < jmax; j++) { + + Index_type it0= ((k)*(jmax+1) + (j))*(imax+1) ; + Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ; + + for (Index_type i = imin; i < imax; i++) { + + Complex_type c1 = c10 * denac[idenac+i]; + Complex_type c2 = c20 * denlw[it0+i]; + + /* promote to doubles to avoid possible divide by zero + errors later on. */ + Real_type c1re = real(c1); Real_type c1im = imag(c1); + Real_type c2re = real(c2); Real_type c2im = imag(c2); + + /* compute lamda = sqrt(|c1|^2 + |c2|^2) using doubles + to avoid underflow. */ + Real_type zlam = c1re*c1re + c1im*c1im + + c2re*c2re + c2im*c2im + 1.0e-34; + zlam = sqrt(zlam); + Real_type snlamt = sin(zlam * dt * 0.5); + Real_type cslamt = cos(zlam * dt * 0.5); + + Complex_type a0t = t0[it0+i]; + Complex_type a1t = t1[it0+i]; + Complex_type a2t = t2[it0+i] * fratio; + + Real_type r_zlam= 1.0/zlam; + c1 *= r_zlam; + c2 *= r_zlam; + Real_type zac1 = zabs2(c1); + Real_type zac2 = zabs2(c2); + + /* compute new A0 */ + Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ; + t0[it0+i] = a0t * cslamt - ireal * z3; + + /* compute new A1 */ + Real_type r = zac1 * cslamt + zac2; + Complex_type z5 = c2 * a2t; + Complex_type z4 = conj(c1) * z5 * (cslamt-1); + z3 = conj(c1) * a0t * snlamt; + t1[it0+i] = a1t * r + z4 - ireal * z3; + + /* compute new A2 */ + r = zac1 + zac2 * cslamt; + z5 = c1 * a1t; + z4 = conj(c2) * z5 * (cslamt-1); + z3 = conj(c2) * a0t * snlamt; + t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio; + + } // i loop + + } // j loop + + } ); // k loop + + } // google benchmark loop +} + +BENCHMARK(BM_COUPLE_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)-> + Arg(LONG)->Unit(benchmark::kMicrosecond); + +static void BM_FIR_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(FIR); + + Real_ptr out = 
loop_data.array_1D_Real[0]; + Real_ptr in = loop_data.array_1D_Real[1]; + + const Index_type coefflen = 16; + Real_type coeff[coefflen] = { 3.0, -1.0, -1.0, -1.0, + -1.0, 3.0, -1.0, -1.0, + -1.0, -1.0, 3.0, -1.0, + -1.0, -1.0, -1.0, 3.0 }; + const Index_type len_minus_coeff = state.range(0) - coefflen; + + Index_type val = 0; + + for ( auto _ : state ) { + + forall(0, len_minus_coeff, + [&] (Index_type i) { + Real_type sum = 0.0; + for (Index_type j = 0; j < coefflen; ++j ) { + sum += coeff[j]*in[i+j]; + } + out[i] = sum; + } ); + + } +} + +BENCHMARK(BM_FIR_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); Index: MicroBenchmarks/LCALS/SubsetARawLoops/CMakeLists.txt =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetARawLoops/CMakeLists.txt @@ -0,0 +1,4 @@ +list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG) +llvm_test_run() +llvm_test_executable(lcalsARaw ../main.cxx RawSubsetAbenchmarks.cxx ../LCALSStats.cxx ../LCALSSuite.cxx ../runReferenceLoops.cxx) +target_link_libraries(lcalsARaw benchmark) Index: MicroBenchmarks/LCALS/SubsetARawLoops/RawSubsetAbenchmarks.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetARawLoops/RawSubsetAbenchmarks.cxx @@ -0,0 +1,455 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file containing LCALS "A" subset raw loops using the google +// benchmark library +// + +#include <benchmark/benchmark.h> +#include "../LCALSSuite.hxx" +#include "../SubsetDataA.hxx" + +static void BM_PRESSURE_CALC_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(PRESSURE_CALC); + + Real_ptr compression = loop_data.array_1D_Real[0]; + Real_ptr bvc = loop_data.array_1D_Real[1]; + Real_ptr p_new = loop_data.array_1D_Real[2]; + Real_ptr e_old =
loop_data.array_1D_Real[3]; + Real_ptr vnewc = loop_data.array_1D_Real[4]; + + const Real_type cls = loop_data.scalar_Real[0]; + const Real_type p_cut = loop_data.scalar_Real[1]; + const Real_type pmin = loop_data.scalar_Real[2]; + const Real_type eosvmax = loop_data.scalar_Real[3]; + + for( auto _ : state) { + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + bvc[i] = cls * (compression[i] + 1.0); + } + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + p_new[i] = bvc[i] * e_old[i] ; + + if ( fabs(p_new[i]) < p_cut ) p_new[i] = 0.0 ; + + if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; + + if ( p_new[i] < pmin ) p_new[i] = pmin ; + } + + } +} + +BENCHMARK(BM_PRESSURE_CALC_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_ENERGY_CALC_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(ENERGY_CALC); + + Real_ptr e_new = loop_data.array_1D_Real[0]; + Real_ptr e_old = loop_data.array_1D_Real[1]; + Real_ptr delvc = loop_data.array_1D_Real[2]; + Real_ptr p_new = loop_data.array_1D_Real[3]; + Real_ptr p_old = loop_data.array_1D_Real[4]; + Real_ptr q_new = loop_data.array_1D_Real[5]; + Real_ptr q_old = loop_data.array_1D_Real[6]; + Real_ptr work = loop_data.array_1D_Real[7]; + Real_ptr compHalfStep = loop_data.array_1D_Real[8]; + Real_ptr pHalfStep = loop_data.array_1D_Real[9]; + Real_ptr bvc = loop_data.array_1D_Real[10]; + Real_ptr pbvc = loop_data.array_1D_Real[11]; + Real_ptr ql_old = loop_data.array_1D_Real[12]; + Real_ptr qq_old = loop_data.array_1D_Real[13]; + Real_ptr vnewc = loop_data.array_1D_Real[14]; + + const Real_type rho0 = loop_data.scalar_Real[0]; + const Real_type e_cut = loop_data.scalar_Real[1]; + const Real_type emin = loop_data.scalar_Real[2]; + const Real_type q_cut = loop_data.scalar_Real[3]; + + for( auto _ : state) { + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + e_new[i] = e_old[i] - 0.5 * delvc[i] * + (p_old[i] + q_old[i]) + 0.5 * work[i]; + } + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + if ( delvc[i] > 0.0 ) { + q_new[i] = 0.0 ; + } + else { + Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ; + Real_type ssc = ( pbvc[i] * e_new[i] + + vhalf * vhalf * bvc[i] * pHalfStep[i]
) / rho0 ; + + if ( ssc <= 0.1111111e-36 ) { + ssc = 0.3333333e-18 ; + } else { + ssc = sqrt(ssc) ; + } + + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; + } + } + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + e_new[i] = e_new[i] + 0.5 * delvc[i] + * ( 3.0*(p_old[i] + q_old[i]) + - 4.0*(pHalfStep[i] + q_new[i])) ; + } + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + e_new[i] += 0.5 * work[i]; + + if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0 ; } + + if ( e_new[i] < emin ) { e_new[i] = emin ; } + } + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + Real_type q_tilde ; + + if (delvc[i] > 0.0) { + q_tilde = 0. ; + } + else { + Real_type ssc = ( pbvc[i] * e_new[i] + + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; + + if ( ssc <= 0.1111111e-36 ) { + ssc = 0.3333333e-18 ; + } else { + ssc = sqrt(ssc) ; + } + + q_tilde = (ssc*ql_old[i] + qq_old[i]) ; + } + + e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i]) + - 8.0*(pHalfStep[i] + q_new[i]) + + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ; + + if ( fabs(e_new[i]) < e_cut ) { + e_new[i] = 0.0 ; + } + if ( e_new[i] < emin ) { + e_new[i] = emin ; + } + } + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + if ( delvc[i] <= 0.0 ) { + Real_type ssc = ( pbvc[i] * e_new[i] + + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; + + if ( ssc <= 0.1111111e-36 ) { + ssc = 0.3333333e-18 ; + } else { + ssc = sqrt(ssc) ; + } + + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; + + if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ; + } + } + + } +} + +BENCHMARK(BM_ENERGY_CALC_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_VOL3D_CALC_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(VOL3D_CALC); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + Real_ptr vol = loop_data.array_1D_Real[3]; + + ADomain domain(state.range(0), /* ndims = */ 3); + + UnalignedReal_ptr 
x0,x1,x2,x3,x4,x5,x6,x7 ; + UnalignedReal_ptr y0,y1,y2,y3,y4,y5,y6,y7 ; + UnalignedReal_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; + + NDPTRSET(x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + const Real_type vnormq = 0.083333333333333333; /* vnormq = 1/12 */ + + for (auto _ : state) { + + for (Index_type i = domain.fpz ; i <= domain.lpz ; i++ ) { + + Real_type x71 = x7[i] - x1[i] ; + Real_type x72 = x7[i] - x2[i] ; + Real_type x74 = x7[i] - x4[i] ; + Real_type x30 = x3[i] - x0[i] ; + Real_type x50 = x5[i] - x0[i] ; + Real_type x60 = x6[i] - x0[i] ; + + Real_type y71 = y7[i] - y1[i] ; + Real_type y72 = y7[i] - y2[i] ; + Real_type y74 = y7[i] - y4[i] ; + Real_type y30 = y3[i] - y0[i] ; + Real_type y50 = y5[i] - y0[i] ; + Real_type y60 = y6[i] - y0[i] ; + + Real_type z71 = z7[i] - z1[i] ; + Real_type z72 = z7[i] - z2[i] ; + Real_type z74 = z7[i] - z4[i] ; + Real_type z30 = z3[i] - z0[i] ; + Real_type z50 = z5[i] - z0[i] ; + Real_type z60 = z6[i] - z0[i] ; + + Real_type xps = x71 + x60 ; + Real_type yps = y71 + y60 ; + Real_type zps = z71 + z60 ; + + Real_type cyz = y72 * z30 - z72 * y30 ; + Real_type czx = z72 * x30 - x72 * z30 ; + Real_type cxy = x72 * y30 - y72 * x30 ; + vol[i] = xps * cyz + yps * czx + zps * cxy ; + + xps = x72 + x50 ; + yps = y72 + y50 ; + zps = z72 + z50 ; + + cyz = y74 * z60 - z74 * y60 ; + czx = z74 * x60 - x74 * z60 ; + cxy = x74 * y60 - y74 * x60 ; + vol[i] += xps * cyz + yps * czx + zps * cxy ; + + xps = x74 + x30 ; + yps = y74 + y30 ; + zps = z74 + z30 ; + + cyz = y71 * z50 - z71 * y50 ; + czx = z71 * x50 - x71 * z50 ; + cxy = x71 * y50 - y71 * x50 ; + vol[i] += xps * cyz + yps * czx + zps * cxy ; + + vol[i] *= vnormq ; + + } + + } +} + +BENCHMARK(BM_VOL3D_CALC_RAW)->Arg(SHORT)->Arg(MEDIUM)-> + Arg(LONG)->Unit(benchmark::kMicrosecond); + +static void BM_DEL_DOT_VEC_2D_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(DEL_DOT_VEC_2D); + + Real_ptr x = 
loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr xdot = loop_data.array_1D_Real[2]; + Real_ptr ydot = loop_data.array_1D_Real[3]; + Real_ptr div = loop_data.array_1D_Real[4]; + + ADomain domain(state.range(0), /* ndims = */ 2); + + UnalignedReal_ptr x1,x2,x3,x4 ; + UnalignedReal_ptr y1,y2,y3,y4 ; + UnalignedReal_ptr fx1,fx2,fx3,fx4 ; + UnalignedReal_ptr fy1,fy2,fy3,fy4 ; + + NDSET2D(x,x1,x2,x3,x4) ; + NDSET2D(y,y1,y2,y3,y4) ; + NDSET2D(xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(ydot,fy1,fy2,fy3,fy4) ; + + const Real_type ptiny = 1.0e-20; + const Real_type half = 0.5; + + for ( auto _ : state ) { + + for (Index_type ii = 0 ; ii < domain.n_real_zones ; ii++ ) { + + Index_type i = domain.real_zones[ii] ; + + Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; + Real_type xj = half * ( x2[i] + x3[i] - x4[i] - x1[i] ) ; + + Real_type yi = half * ( y1[i] + y2[i] - y3[i] - y4[i] ) ; + Real_type yj = half * ( y2[i] + y3[i] - y4[i] - y1[i] ) ; + + Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ; + Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ; + + Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ; + Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ; + + Real_type rarea = 1.0 / ( xi * yj - xj * yi + ptiny ) ; + + Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; + + Real_type dfydy = rarea * ( fyj * xi - fyi * xj ) ; + + Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) / + ( y1[i] + y2[i] + y3[i] + y4[i] ) ; + + div[i] = dfxdx + dfydy + affine ; + } + + } +} + +BENCHMARK(BM_DEL_DOT_VEC_2D_RAW)->Arg(SHORT)->Arg(MEDIUM)-> + Arg(LONG)->Unit(benchmark::kMicrosecond); + +static void BM_COUPLE_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(COUPLE); + + Complex_ptr t0 = loop_data.array_1D_Complex[0]; + Complex_ptr t1 = loop_data.array_1D_Complex[1]; + Complex_ptr t2 = loop_data.array_1D_Complex[2]; + Complex_ptr denac = loop_data.array_1D_Complex[3]; + 
Complex_ptr denlw = loop_data.array_1D_Complex[4]; + + + ADomain domain(state.range(0), /* ndims = */ 3); + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + Index_type kmin = domain.kmin; + Index_type kmax = domain.kmax; + + const Real_type clight=3.e+10; + const Real_type csound=3.09e+7; + const Real_type omega0= 0.9; + const Real_type omegar= 0.9; + const Real_type dt= 0.208; + const Real_type c10 = 0.25 * (clight / csound); + const Real_type fratio = sqrt(omegar / omega0); + const Real_type r_fratio = 1.0/fratio; + const Real_type c20 = 0.25 * (clight / csound) * r_fratio; + const Complex_type ireal(0.0, 1.0); + + for ( auto _ : state ) { + + for (Index_type k = kmin; k < kmax; k++) { + + for (Index_type j = jmin; j < jmax; j++) { + + Index_type it0= ((k)*(jmax+1) + (j))*(imax+1) ; + Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ; + + for (Index_type i = imin; i < imax; i++) { + + Complex_type c1 = c10 * denac[idenac+i]; + Complex_type c2 = c20 * denlw[it0+i]; + + /* promote to doubles to avoid possible divide by zero + errors later on. */ + Real_type c1re = real(c1); Real_type c1im = imag(c1); + Real_type c2re = real(c2); Real_type c2im = imag(c2); + + /* compute lamda = sqrt(|c1|^2 + |c2|^2) using doubles + to avoid underflow. 
*/ + Real_type zlam = c1re*c1re + c1im*c1im + + c2re*c2re + c2im*c2im + 1.0e-34; + zlam = sqrt(zlam); + Real_type snlamt = sin(zlam * dt * 0.5); + Real_type cslamt = cos(zlam * dt * 0.5); + + Complex_type a0t = t0[it0+i]; + Complex_type a1t = t1[it0+i]; + Complex_type a2t = t2[it0+i] * fratio; + + Real_type r_zlam= 1.0/zlam; + c1 *= r_zlam; + c2 *= r_zlam; + Real_type zac1 = zabs2(c1); + Real_type zac2 = zabs2(c2); + + /* compute new A0 */ + Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ; + t0[it0+i] = a0t * cslamt - ireal * z3; + + /* compute new A1 */ + Real_type r = zac1 * cslamt + zac2; + Complex_type z5 = c2 * a2t; + Complex_type z4 = conj(c1) * z5 * (cslamt-1); + z3 = conj(c1) * a0t * snlamt; + t1[it0+i] = a1t * r + z4 - ireal * z3; + + /* compute new A2 */ + r = zac1 + zac2 * cslamt; + z5 = c1 * a1t; + z4 = conj(c2) * z5 * (cslamt-1); + z3 = conj(c2) * a0t * snlamt; + t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio; + + } // i loop + + } // j loop + + } // k loop + + } // benchmark loop +} + +BENCHMARK(BM_COUPLE_RAW)->Arg(SHORT)->Arg(MEDIUM)-> + Arg(LONG)->Unit(benchmark::kMicrosecond); + +static void BM_FIR_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(FIR); + + Real_ptr out = loop_data.array_1D_Real[0]; + Real_ptr in = loop_data.array_1D_Real[1]; + + const Index_type coefflen = 16; + Real_type coeff[coefflen] = { 3.0, -1.0, -1.0, -1.0, + -1.0, 3.0, -1.0, -1.0, + -1.0, -1.0, 3.0, -1.0, + -1.0, -1.0, -1.0, 3.0 }; + const Index_type len_minus_coeff = state.range(0) - coefflen; + + Index_type val = 0; + + for ( auto _ : state ) { + + for (Index_type i = 0 ; i < len_minus_coeff ; i++ ) { + Real_type sum = 0.0; + + for (Index_type j = 0; j < coefflen; ++j ) { + sum += coeff[j]*in[i+j]; + } + out[i] = sum; + } + + } +} + +BENCHMARK(BM_FIR_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); Index: MicroBenchmarks/LCALS/SubsetBLambdaLoops/CMakeLists.txt 
=================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetBLambdaLoops/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG) +llvm_test_run() +llvm_test_executable(lcalsBLambda ../main.cxx LambdaSubsetBbenchmarks.cxx ../LCALSStats.cxx ../LCALSSuite.cxx ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx) +target_link_libraries(lcalsBLambda benchmark) + Index: MicroBenchmarks/LCALS/SubsetBLambdaLoops/LambdaSubsetBbenchmarks.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetBLambdaLoops/LambdaSubsetBbenchmarks.cxx @@ -0,0 +1,137 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file containing LCALS "B" subset forall lambda loops using +// the google benchmark library. +// + +#include <benchmark/benchmark.h> +#include "../LCALSSuite.hxx" +#include "../SubsetDataB.hxx" +#include "../LCALSTraversalMethods.hxx" + +static void BM_INIT3_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(INIT3); + + Real_ptr out1 = loop_data.array_1D_Real[0]; + Real_ptr out2 = loop_data.array_1D_Real[1]; + Real_ptr out3 = loop_data.array_1D_Real[2]; + Real_ptr in1 = loop_data.array_1D_Real[3]; + Real_ptr in2 = loop_data.array_1D_Real[4]; + + for( auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type i) { + out1[i] = out2[i] = out3[i] = - in1[i] - in2[i]; + } ); + + } +} + +BENCHMARK(BM_INIT3_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_MULADDSUB_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(MULADDSUB); + + Real_ptr out1 = loop_data.array_1D_Real[0]; + Real_ptr out2 = loop_data.array_1D_Real[1]; + Real_ptr out3 = loop_data.array_1D_Real[2]; + Real_ptr in1 =
loop_data.array_1D_Real[3]; + Real_ptr in2 = loop_data.array_1D_Real[4]; + + for ( auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type i) { + out1[i] = in1[i] * in2[i] ; + out2[i] = in1[i] + in2[i] ; + out3[i] = in1[i] - in2[i] ; + } ); + + } +} + +BENCHMARK(BM_MULADDSUB_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_IF_QUAD_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(IF_QUAD); + + Real_ptr a = loop_data.array_1D_Real[0]; + Real_ptr b = loop_data.array_1D_Real[1]; + Real_ptr c = loop_data.array_1D_Real[2]; + Real_ptr x1 = loop_data.array_1D_Real[3]; + Real_ptr x2 = loop_data.array_1D_Real[4]; + + for ( auto _ : state ) { + + forall(0, state.range(0), + [&] (Index_type i) { + Real_type s = b[i]*b[i] - 4.0*a[i]*c[i]; + if ( s >= 0 ) { + s = sqrt(s); + x2[i] = (-b[i]+s)/(2.0*a[i]); + x1[i] = (-b[i]-s)/(2.0*a[i]); + } else { + x2[i] = 0.0; + x1[i] = 0.0; + } + } ); + + } +} + +BENCHMARK(BM_IF_QUAD_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + + +static void BM_TRAP_INT_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(TRAP_INT); + + Real_type xn = loop_data.scalar_Real[0]; + Real_type x0 = loop_data.scalar_Real[1]; + Real_type xp = loop_data.scalar_Real[2]; + Real_type y = loop_data.scalar_Real[3]; + Real_type yp = loop_data.scalar_Real[4]; + + Index_type nx = loop_data.array_1D_Indx[0][0] + 1; + + const Real_type h = (xn - x0) / nx; + Real_type sumx = 0.5*( trap_int_func(x0, y, xp, yp) + + trap_int_func(xn, y, xp, yp) ); + + Real_type val = 0; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type i) { + Real_type x = x0 + i*h; + sumx += trap_int_func(x, y, xp, yp); + } ); + benchmark::DoNotOptimize(val = sumx * h); + + } +} + +BENCHMARK(BM_TRAP_INT_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); Index: 
MicroBenchmarks/LCALS/SubsetBRawLoops/CMakeLists.txt =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetBRawLoops/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG) +llvm_test_run() +llvm_test_executable(lcalsBRaw ../main.cxx RawSubsetBbenchmarks.cxx ../LCALSStats.cxx ../LCALSSuite.cxx ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx) +target_link_libraries(lcalsBRaw benchmark) + Index: MicroBenchmarks/LCALS/SubsetBRawLoops/RawSubsetBbenchmarks.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetBRawLoops/RawSubsetBbenchmarks.cxx @@ -0,0 +1,132 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file containing LCALS "B" subset raw loops using the google +// benchmark library +// + +#include <benchmark/benchmark.h> +#include "../LCALSSuite.hxx" +#include "../SubsetDataB.hxx" + +static void BM_INIT3_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(INIT3); + + Real_ptr out1 = loop_data.array_1D_Real[0]; + Real_ptr out2 = loop_data.array_1D_Real[1]; + Real_ptr out3 = loop_data.array_1D_Real[2]; + Real_ptr in1 = loop_data.array_1D_Real[3]; + Real_ptr in2 = loop_data.array_1D_Real[4]; + + for( auto _ : state) { + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + out1[i] = out2[i] = out3[i] = - in1[i] - in2[i]; + } + + } +} + +BENCHMARK(BM_INIT3_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_MULADDSUB_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(MULADDSUB); + + Real_ptr out1 = loop_data.array_1D_Real[0]; + Real_ptr out2 = loop_data.array_1D_Real[1]; + Real_ptr out3 = loop_data.array_1D_Real[2]; + Real_ptr in1 = loop_data.array_1D_Real[3]; + Real_ptr in2 =
loop_data.array_1D_Real[4]; + + for ( auto _ : state) { + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + out1[i] = in1[i] * in2[i] ; + out2[i] = in1[i] + in2[i] ; + out3[i] = in1[i] - in2[i] ; + } + + } +} + +BENCHMARK(BM_MULADDSUB_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_IF_QUAD_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(IF_QUAD); + + Real_ptr a = loop_data.array_1D_Real[0]; + Real_ptr b = loop_data.array_1D_Real[1]; + Real_ptr c = loop_data.array_1D_Real[2]; + Real_ptr x1 = loop_data.array_1D_Real[3]; + Real_ptr x2 = loop_data.array_1D_Real[4]; + + for ( auto _ : state ) { + + for (Index_type i=0 ; i< state.range(0); i++ ) { + Real_type s = b[i]*b[i] - 4.0*a[i]*c[i]; + if ( s >= 0 ) { + s = sqrt(s); + x2[i] = (-b[i]+s)/(2.0*a[i]); + x1[i] = (-b[i]-s)/(2.0*a[i]); + } else { + x2[i] = 0.0; + x1[i] = 0.0; + } + } + + } +} + +BENCHMARK(BM_IF_QUAD_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + + +static void BM_TRAP_INT_RAW(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(TRAP_INT); + + Real_type xn = loop_data.scalar_Real[0]; + Real_type x0 = loop_data.scalar_Real[1]; + Real_type xp = loop_data.scalar_Real[2]; + Real_type y = loop_data.scalar_Real[3]; + Real_type yp = loop_data.scalar_Real[4]; + + Index_type nx = loop_data.array_1D_Indx[0][0] + 1; + + const Real_type h = (xn - x0) / nx; + Real_type sumx = 0.5*( trap_int_func(x0, y, xp, yp) + + trap_int_func(xn, y, xp, yp) ); + + Real_type val = 0; + + for (auto _ : state) { + + for (Index_type i=0 ; i< state.range(0); i++ ) { + Real_type x = x0 + i*h; + sumx += trap_int_func(x, y, xp, yp); + } + benchmark::DoNotOptimize(val = sumx * h); + + } +} + +BENCHMARK(BM_TRAP_INT_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); Index: MicroBenchmarks/LCALS/SubsetCLambdaLoops/CMakeLists.txt 
=================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetCLambdaLoops/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG) +#llvm_test_run(--benchmark_repetitions=5) +llvm_test_run() +llvm_test_executable(lcalsCLambda ../main.cxx LambdaSubsetCbenchmarks.cxx ../LCALSStats.cxx ../LCALSSuite.cxx ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx) +target_link_libraries(lcalsCLambda benchmark) Index: MicroBenchmarks/LCALS/SubsetCLambdaLoops/LambdaSubsetCbenchmarks.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetCLambdaLoops/LambdaSubsetCbenchmarks.cxx @@ -0,0 +1,718 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file containing LCALS "C" subset forall lambda loops using +// the google benchmark library. 
+// + +#include +#include "../LCALSSuite.hxx" +#include "../LCALSTraversalMethods.hxx" + +static void BM_HYDRO_1D_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(HYDRO_1D); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + + const Real_type q = loop_data.scalar_Real[0]; + const Real_type r = loop_data.scalar_Real[1]; + const Real_type t = loop_data.scalar_Real[2]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type k) { + x[k] = q + y[k]*( r*z[k+10] + t*z[k+11] ); + } ); + + } +} + +BENCHMARK(BM_HYDRO_1D_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_ICCG_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(ICCG); + + Real_ptr x = loop_data.array_1D_Nx4_Real[0]; + Real_ptr v = loop_data.array_1D_Nx4_Real[1]; + + Index_type ii, ipnt, ipntp, i; + + for (auto _ : state) { + + ii = state.range(0); + ipntp = 0; + do { + ipnt = ipntp; + ipntp += ii; + ii /= 2; + i = ipntp ; + forall(ipnt+1, ipntp, 2, + [&] (Index_type k) { + i++; + x[i] = x[k] - v[k ]*x[k-1] - v[k+1]*x[k+1]; + } ); + } while ( ii>0 ); + + } +} + +BENCHMARK(BM_ICCG_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_INNER_PROD_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(INNER_PROD); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr z = loop_data.array_1D_Real[1]; + + Real_type q = 0.0; + Real_type val = 0.0; + + for (auto _ : state) { + + q = 0.0; + forall(0, state.range(0), + [&] (Index_type k) { + benchmark::DoNotOptimize(q += z[k]*x[k]); + } ); + + } +} + +BENCHMARK(BM_INNER_PROD_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_BAND_LIN_EQ_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(BAND_LIN_EQ); + 
+ Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + + Index_type lw; + Real_type temp; + + for (auto _ : state) { + + Index_type m = ( 1001-7 )/2; + for ( Index_type k=6 ; k<1001 ; k=k+m ) { + lw = k - 6; + temp = x[k-1]; + forall(4, state.range(0), 5, + [&] (Index_type j) { + temp -= x[lw]*y[j]; + lw++; + } ); + x[k-1] = y[4]*temp; + } + + } +} + +BENCHMARK(BM_BAND_LIN_EQ_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_TRIDIAG_ELIM_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(TRIDIAG_ELIM); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + + for (auto _ : state) { + + forall(1, state.range(0), + [&] (Index_type i) { + x[i] = z[i]*( y[i] - x[i-1] ); + } ); + + } +} + +BENCHMARK(BM_TRIDIAG_ELIM_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_EOS_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(EOS); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + Real_ptr u = loop_data.array_1D_Real[3]; + + const Real_type q = loop_data.scalar_Real[0]; + const Real_type r = loop_data.scalar_Real[1]; + const Real_type t = loop_data.scalar_Real[2]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type k) { + x[k] = u[k] + r*( z[k] + r*y[k] ) + + t*( u[k+3] + r*( u[k+2] + r*u[k+1] ) + + t*( u[k+6] + q*( u[k+5] + q*u[k+4] ) ) ); + } ); + + } +} + +BENCHMARK(BM_EOS_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_ADI_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(ADI); + + Real_ptr du1 = loop_data.array_1D_Real[0]; + Real_ptr du2 = loop_data.array_1D_Real[1]; + Real_ptr du3 = loop_data.array_1D_Real[2]; + + 
Real_ptr** u1 = loop_data.array_3D_2xNx4_Real[0]; + Real_ptr** u2 = loop_data.array_3D_2xNx4_Real[1]; + Real_ptr** u3 = loop_data.array_3D_2xNx4_Real[2]; + + const Real_type sig = loop_data.scalar_Real[0]; + const Real_type a11 = loop_data.scalar_Real[1]; + const Real_type a12 = loop_data.scalar_Real[2]; + const Real_type a13 = loop_data.scalar_Real[3]; + const Real_type a21 = loop_data.scalar_Real[4]; + const Real_type a22 = loop_data.scalar_Real[5]; + const Real_type a23 = loop_data.scalar_Real[6]; + const Real_type a31 = loop_data.scalar_Real[7]; + const Real_type a32 = loop_data.scalar_Real[8]; + const Real_type a33 = loop_data.scalar_Real[9]; + + Index_type nl1 = 0; + Index_type nl2 = 1; + Index_type kx; + + for (auto _ : state) { + + for ( kx=1 ; kx<3 ; kx++ ) { + forall(1, state.range(0), + [&] (Index_type ky) { + du1[ky] = u1[nl1][ky+1][kx] - u1[nl1][ky-1][kx]; + du2[ky] = u2[nl1][ky+1][kx] - u2[nl1][ky-1][kx]; + du3[ky] = u3[nl1][ky+1][kx] - u3[nl1][ky-1][kx]; + u1[nl2][ky][kx]= + u1[nl1][ky][kx]+a11*du1[ky]+a12*du2[ky]+a13*du3[ky] + sig* + (u1[nl1][ky][kx+1]-2.0*u1[nl1][ky][kx]+u1[nl1][ky][kx-1]); + u2[nl2][ky][kx]= + u2[nl1][ky][kx]+a21*du1[ky]+a22*du2[ky]+a23*du3[ky] + sig* + (u2[nl1][ky][kx+1]-2.0*u2[nl1][ky][kx]+u2[nl1][ky][kx-1]); + u3[nl2][ky][kx]= + u3[nl1][ky][kx]+a31*du1[ky]+a32*du2[ky]+a33*du3[ky] + sig* + (u3[nl1][ky][kx+1]-2.0*u3[nl1][ky][kx]+u3[nl1][ky][kx-1]); + } ); + } + + } +} + +BENCHMARK(BM_ADI_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_INT_PREDICT_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(INT_PREDICT); + + Real_ptr* px = loop_data.array_2D_Nx25_Real[0]; + + const Real_type dm22 = loop_data.scalar_Real[0]; + const Real_type dm23 = loop_data.scalar_Real[1]; + const Real_type dm24 = loop_data.scalar_Real[2]; + const Real_type dm25 = loop_data.scalar_Real[3]; + const Real_type dm26 = loop_data.scalar_Real[4]; + const Real_type dm27 = 
loop_data.scalar_Real[5]; + const Real_type dm28 = loop_data.scalar_Real[6]; + const Real_type c0 = loop_data.scalar_Real[7]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type i) { + px[i][0] = dm28*px[i][12] + dm27*px[i][11] + dm26*px[i][10] + + dm25*px[i][ 9] + dm24*px[i][ 8] + dm23*px[i][ 7] + + dm22*px[i][ 6] + c0*( px[i][ 4] + px[i][ 5]) + px[i][ 2]; + } ); + + } +} + +BENCHMARK(BM_INT_PREDICT_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_DIFF_PREDICT_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(DIFF_PREDICT); + + Real_ptr* px = loop_data.array_2D_Nx25_Real[0]; + Real_ptr* cx = loop_data.array_2D_Nx25_Real[1]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type i) { + Real_type ar, br, cr; + ar = cx[i][ 4]; + br = ar - px[i][ 4]; + px[i][ 4] = ar; + cr = br - px[i][ 5]; + px[i][ 5] = br; + ar = cr - px[i][ 6]; + px[i][ 6] = cr; + br = ar - px[i][ 7]; + px[i][ 7] = ar; + cr = br - px[i][ 8]; + px[i][ 8] = br; + ar = cr - px[i][ 9]; + px[i][ 9] = cr; + br = ar - px[i][10]; + px[i][10] = ar; + cr = br - px[i][11]; + px[i][11] = br; + px[i][13] = cr - px[i][12]; + px[i][12] = cr; + } ); + + } +} + +BENCHMARK(BM_DIFF_PREDICT_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_FIRST_SUM_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(FIRST_SUM); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + + for (auto _ :state) { + + x[0] = y[0]; + forall(1, state.range(0), + [&] (Index_type k) { + x[k] = x[k-1] + y[k]; + } ); + + } +} + +BENCHMARK(BM_FIRST_SUM_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_FIRST_DIFF_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(FIRST_DIFF); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = 
loop_data.array_1D_Real[1]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type k) { + x[k] = y[k+1] - y[k]; + } ); + + } +} + +BENCHMARK(BM_FIRST_DIFF_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_PIC_2D_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(PIC_2D); + + Real_ptr* p = loop_data.array_2D_Nx25_Real[0]; + Real_ptr* b = loop_data.array_2D_Nx25_Real[1]; + Real_ptr* c = loop_data.array_2D_Nx25_Real[2]; + + Real_ptr y = loop_data.array_1D_Real[0]; + Real_ptr z = loop_data.array_1D_Real[1]; + + Index_type* e = loop_data.array_1D_Indx[0]; + Index_type* f = loop_data.array_1D_Indx[1]; + + Real_ptr* h = loop_data.array_2D_64x64_Real[0]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type ip) { + Index_type i1, j1, i2, j2; + i1 = (Index_type) p[ip][0]; + j1 = (Index_type) p[ip][1]; + i1 &= 64-1; + j1 &= 64-1; + p[ip][2] += b[j1][i1]; + p[ip][3] += c[j1][i1]; + p[ip][0] += p[ip][2]; + p[ip][1] += p[ip][3]; + i2 = (Index_type) p[ip][0]; + j2 = (Index_type) p[ip][1]; + i2 = ( i2 & 64-1 ) ; + j2 = ( j2 & 64-1 ) ; + p[ip][0] += y[i2+32]; + p[ip][1] += z[j2+32]; + i2 += e[i2+32]; + j2 += f[j2+32]; + h[j2][i2] += 1.0; + } ); + + } +} + +BENCHMARK(BM_PIC_2D_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_PIC_1D_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(PIC_1D); + + Real_ptr vx = loop_data.array_1D_Real[0]; + Real_ptr xx = loop_data.array_1D_Real[1]; + Real_ptr xi = loop_data.array_1D_Real[2]; + Real_ptr ex = loop_data.array_1D_Real[3]; + Real_ptr ex1 = loop_data.array_1D_Real[4]; + Real_ptr dex = loop_data.array_1D_Real[5]; + Real_ptr dex1 = loop_data.array_1D_Real[6]; + Real_ptr rh = loop_data.array_1D_Real[7]; + Real_ptr rx = loop_data.array_1D_Real[8]; + + const Real_type flx = loop_data.scalar_Real[0]; + + Index_type* ix = 
loop_data.array_1D_Indx[2]; + Index_type* ir = loop_data.array_1D_Indx[3]; + Index_type* grd = loop_data.array_1D_Indx[4]; + + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type k) { + vx[k] = 0.0; + xx[k] = 0.0; + ix[k] = (Index_type) grd[k]; + xi[k] = (Real_type) ix[k]; + ex1[k] = ex[ ix[k] - 1 ]; + dex1[k] = dex[ ix[k] - 1 ]; + } ); + + forall(0, state.range(0), + [&] (Index_type k) { + vx[k] = vx[k] + ex1[k] + ( xx[k] - xi[k] )*dex1[k]; + xx[k] = xx[k] + vx[k] + flx; + ir[k] = (Index_type) xx[k]; + rx[k] = xx[k] - ir[k]; + ir[k] = ( ir[k] & (2048-1) ) + 1; + xx[k] = rx[k] + ir[k]; + } ); + + forall(0, state.range(0), + [&] (Index_type k) { + rh[ ir[k]-1 ] += 1.0 - rx[k]; + rh[ ir[k] ] += rx[k]; + } ); + + } +} + +BENCHMARK(BM_PIC_1D_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_HYDRO_2D_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(HYDRO_2D); + + Real_ptr* za = loop_data.array_2D_7xN_Real[0]; + Real_ptr* zb = loop_data.array_2D_7xN_Real[1]; + Real_ptr* zm = loop_data.array_2D_7xN_Real[2]; + Real_ptr* zp = loop_data.array_2D_7xN_Real[3]; + Real_ptr* zq = loop_data.array_2D_7xN_Real[4]; + Real_ptr* zr = loop_data.array_2D_7xN_Real[5]; + Real_ptr* zu = loop_data.array_2D_7xN_Real[6]; + Real_ptr* zv = loop_data.array_2D_7xN_Real[7]; + Real_ptr* zz = loop_data.array_2D_7xN_Real[8]; + + Real_ptr* zrout = loop_data.array_2D_7xN_Real[9]; + Real_ptr* zzout = loop_data.array_2D_7xN_Real[10]; + + const Real_type t = 0.0037; + const Real_type s = 0.0041; + + Index_type kn = 6; + Index_type jn = state.range(0); + Index_type k; + + for (auto _ : state) { + + for ( k=1 ; k(1, jn, + [&] (Index_type j) { + za[k][j] = ( zp[k+1][j-1] +zq[k+1][j-1] -zp[k][j-1] -zq[k][j-1] )* + ( zr[k][j] +zr[k][j-1] ) / ( zm[k][j-1] +zm[k+1][j-1]); + zb[k][j] = ( zp[k][j-1] +zq[k][j-1] -zp[k][j] -zq[k][j] ) * + ( zr[k][j] +zr[k-1][j] ) / ( zm[k][j] +zm[k][j-1]); + } ); + } + + for ( k=1 
; k(1, jn, + [&] (Index_type j) { + zu[k][j] += s*( za[k][j] *( zz[k][j] - zz[k][j+1] ) - + za[k][j-1] *( zz[k][j] - zz[k][j-1] ) - + zb[k][j] *( zz[k][j] - zz[k-1][j] ) + + zb[k+1][j] *( zz[k][j] - zz[k+1][j] ) ); + zv[k][j] += s*( za[k][j] *( zr[k][j] - zr[k][j+1] ) - + za[k][j-1] *( zr[k][j] - zr[k][j-1] ) - + zb[k][j] *( zr[k][j] - zr[k-1][j] ) + + zb[k+1][j] *( zr[k][j] - zr[k+1][j] ) ); + } ); + } + + for ( k=1 ; k(1, jn, + [&] (Index_type j) { + zrout[k][j] = zr[k][j] + t*zu[k][j]; + zzout[k][j] = zz[k][j] + t*zv[k][j]; + } ); + } + + } +} + +BENCHMARK(BM_HYDRO_2D_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_GEN_LIN_RECUR_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(GEN_LIN_RECUR); + + Real_ptr b5 = loop_data.array_1D_Real[0]; + Real_ptr sa = loop_data.array_1D_Real[1]; + Real_ptr sb = loop_data.array_1D_Real[2]; + + Real_type stb5 = loop_data.scalar_Real[0]; + + Index_type kb5i = 0; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type k) { + b5[k+kb5i] = sa[k] + stb5*sb[k]; + stb5 = b5[k+kb5i] - stb5; + } ); + + forall(1, state.range(0) + 1, + [&] (Index_type i) { + Index_type k = state.range(0) - i ; + b5[k+kb5i] = sa[k] + stb5*sb[k]; + stb5 = b5[k+kb5i] - stb5; + } ); + + } +} + +BENCHMARK(BM_GEN_LIN_RECUR_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_DISC_ORD_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(DISC_ORD); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + Real_ptr u = loop_data.array_1D_Real[3]; + Real_ptr v = loop_data.array_1D_Real[4]; + Real_ptr w = loop_data.array_1D_Real[5]; + Real_ptr g = loop_data.array_1D_Real[6]; + Real_ptr xx = loop_data.array_1D_Real[7]; + Real_ptr vx = loop_data.array_1D_Real[9]; + const Real_type s = loop_data.scalar_Real[0]; + const 
Real_type t = loop_data.scalar_Real[1]; + const Real_type dk = loop_data.scalar_Real[2]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type k) { + Real_type di = y[k] - g[k] / ( xx[k] + dk ); + Real_type dn = 0.2; + if ( di ) { + dn = z[k]/di ; + if ( t < dn ) dn = t; + if ( s > dn ) dn = s; + } + x[k] = ( ( w[k] + v[k]*dn )* xx[k] + u[k] ) / ( vx[k] + v[k]*dn ); + xx[k+1] = ( x[k] - xx[k] )* dn + xx[k]; + } ); + + } +} + +BENCHMARK(BM_DISC_ORD_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_MAT_X_MAT_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(MAT_X_MAT); + + Real_ptr* px = loop_data.array_2D_Nx25_Real[0]; + Real_ptr* cx = loop_data.array_2D_Nx25_Real[1]; + Real_ptr* vy = loop_data.array_2D_64x64_Real[0]; + + Index_type k, i; + + for (auto _ : state) { + + for ( k=0 ; k<25 ; k++ ) { + for ( i=0 ; i<25 ; i++ ) { + forall(0, state.range(0), + [&] (Index_type j) { + px[j][i] += vy[k][i] * cx[j][k]; + } ); + } + } + + } +} + +BENCHMARK(BM_MAT_X_MAT_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_PLANCKIAN_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(PLANCKIAN); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr u = loop_data.array_1D_Real[2]; + Real_ptr v = loop_data.array_1D_Real[3]; + Real_ptr w = loop_data.array_1D_Real[4]; + + Real_type expmax = 20.0; + u[state.range(0)-1] = 0.99*expmax*v[state.range(0)-1]; + + for (auto _ : state) { + + forall(0, state.range(0), + [&] (Index_type k) { + y[k] = u[k] / v[k]; + w[k] = x[k] / ( exp( y[k] ) -1.0 ); + } ); + + } +} + +BENCHMARK(BM_PLANCKIAN_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_IMP_HYDRO_2D_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(IMP_HYDRO_2D); + + Real_ptr* za = 
loop_data.array_2D_7xN_Real[0]; + Real_ptr* zb = loop_data.array_2D_7xN_Real[1]; + Real_ptr* zr = loop_data.array_2D_7xN_Real[2]; + Real_ptr* zu = loop_data.array_2D_7xN_Real[3]; + Real_ptr* zv = loop_data.array_2D_7xN_Real[4]; + Real_ptr* zz = loop_data.array_2D_7xN_Real[5]; + + Index_type j; + + for (auto _ : state) { + + for ( j=1 ; j<6 ; j++ ) { + forall(1, state.range(0), + [&] (Index_type k) { + Real_type qa = za[j+1][k]*zr[j][k] + za[j-1][k]*zb[j][k] + + za[j][k+1]*zu[j][k] + za[j][k-1]*zv[j][k] + zz[j][k]; + za[j][k] += 0.175*( qa - za[j][k] ); + } ); + } + + } +} + +BENCHMARK(BM_IMP_HYDRO_2D_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_FIND_FIRST_MIN_LAMBDA(benchmark::State& state) { + + LoopData& loop_data = getLoopData(); + + loopInit(FIND_FIRST_MIN); + + Real_ptr x = loop_data.array_1D_Real[0]; + + Index_type m = 0; + Index_type val = 0; + + for (auto _ : state) { + + m = 0; + forall(1, state.range(0), + [&] (Index_type k) { + if ( x[k] < x[m] ) benchmark::DoNotOptimize(m = k); + } ); + + } +} + +BENCHMARK(BM_FIND_FIRST_MIN_LAMBDA)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); Index: MicroBenchmarks/LCALS/SubsetCRawLoops/CMakeLists.txt =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetCRawLoops/CMakeLists.txt @@ -0,0 +1,4 @@ +list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG) +llvm_test_run() +llvm_test_executable(lcalsCRaw ../main.cxx RawSubsetCbenchmarks.cxx ../LCALSStats.cxx ../LCALSSuite.cxx ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx) +target_link_libraries(lcalsCRaw benchmark) Index: MicroBenchmarks/LCALS/SubsetCRawLoops/RawSubsetCbenchmarks.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetCRawLoops/RawSubsetCbenchmarks.cxx @@ -0,0 +1,1017 @@ +// 
+// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file containing LCALS "C" subset raw loops using the google +// benchmark library. +// + +#include +#include "../LCALSSuite.hxx" + +static void BM_HYDRO_1D_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 1 -- hydro fragment + ******************************************************************* + * DO 1 L = 1,Loop + * DO 1 k = 1,n + * 1 X(k)= Q + Y(k)*(R*ZX(k+10) + T*ZX(k+11)) + */ + + LoopData& loop_data = getLoopData(); + + loopInit(HYDRO_1D); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + + const Real_type q = loop_data.scalar_Real[0]; + const Real_type r = loop_data.scalar_Real[1]; + const Real_type t = loop_data.scalar_Real[2]; + + for (auto _ : state) { + + for (Index_type k=0 ; k< state.range(0) ; k++ ) { + x[k] = q + y[k]*( r*z[k+10] + t*z[k+11] ); + } + + } +} + +BENCHMARK(BM_HYDRO_1D_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + + +static void BM_ICCG_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 2 -- ICCG excerpt (Incomplete Cholesky Conj. 
Gradient) + ******************************************************************* + * DO 200 L= 1,Loop + * II= n + * IPNTP= 0 + *222 IPNT= IPNTP + * IPNTP= IPNTP+II + * II= II/2 + * i= IPNTP+1 + CDIR$ IVDEP + * DO 2 k= IPNT+2,IPNTP,2 + * i= i+1 + * 2 X(i)= X(k) - V(k)*X(k-1) - V(k+1)*X(k+1) + * IF( II.GT.1) GO TO 222 + *200 CONTINUE + */ + + LoopData& loop_data = getLoopData(); + + loopInit(ICCG); + + Real_ptr x = loop_data.array_1D_Nx4_Real[0]; + Real_ptr v = loop_data.array_1D_Nx4_Real[1]; + + Index_type ii, ipnt, ipntp, i; + + for (auto _ : state) { + + ii = state.range(0); + ipntp = 0; + do { + ipnt = ipntp; + ipntp += ii; + ii /= 2; + i = ipntp ; + for (Index_type k=ipnt+1 ; k0 ); + + } +} + +BENCHMARK(BM_ICCG_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_INNER_PROD_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 3 -- inner product + ******************************************************************* + * DO 3 L= 1,Loop + * Q= 0.0 + * DO 3 k= 1,n + * 3 Q= Q + Z(k)*X(k) + */ + + LoopData& loop_data = getLoopData(); + + loopInit(INNER_PROD); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr z = loop_data.array_1D_Real[1]; + + Real_type q = 0.0; + Real_type val = 0.0; + + for (auto _ : state) { + + q = 0.0; + for (Index_type k=0 ; k< state.range(0); k++ ) { + benchmark::DoNotOptimize(q += z[k]*x[k]); + } + + } +} + +BENCHMARK(BM_INNER_PROD_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_BAND_LIN_EQ_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 4 -- banded linear equations + ******************************************************************* + * m= (1001-7)/2 + * DO 444 L= 1,Loop + * DO 444 k= 7,1001,m + * lw= k-6 + * temp= X(k-1) + CDIR$ IVDEP + * DO 4 j= 5,n,5 + * temp = temp - XZ(lw)*Y(j) + * 4 lw= lw+1 + * X(k-1)= Y(5)*temp + 
*444 CONTINUE + */ + + LoopData& loop_data = getLoopData(); + + loopInit(BAND_LIN_EQ); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + + Index_type lw; + Real_type temp; + + for (auto _ : state) { + + Index_type m = ( 1001-7 )/2; + for ( Index_type k=6 ; k<1001 ; k=k+m ) { + lw = k - 6; + temp = x[k-1]; + for (Index_type j=4 ; j< state.range(0) ; j=j+5 ) { + temp -= x[lw]*y[j]; + lw++; + } + x[k-1] = y[4]*temp; + } + + } +} + +BENCHMARK(BM_BAND_LIN_EQ_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_TRIDIAG_ELIM_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 5 -- tri-diagonal elimination, below diagonal + ******************************************************************* + * DO 5 L = 1,Loop + * DO 5 i = 2,n + * 5 X(i)= Z(i)*(Y(i) - X(i-1)) + */ + + LoopData& loop_data = getLoopData(); + + loopInit(TRIDIAG_ELIM); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + + for (auto _ : state) { + + for ( Index_type i=1 ; i< state.range(0) ; i++ ) { + x[i] = z[i]*( y[i] - x[i-1] ); + } + + } +} + +BENCHMARK(BM_TRIDIAG_ELIM_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_EOS_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 7 -- equation of state fragment + ******************************************************************* + * DO 7 L= 1,Loop + * DO 7 k= 1,n + * X(k)= U(k ) + R*( Z(k ) + R*Y(k )) + + * . T*( U(k+3) + R*( U(k+2) + R*U(k+1)) + + * . 
T*( U(k+6) + Q*( U(k+5) + Q*U(k+4)))) + * 7 CONTINUE + */ + + LoopData& loop_data = getLoopData(); + + loopInit(EOS); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + Real_ptr z = loop_data.array_1D_Real[2]; + Real_ptr u = loop_data.array_1D_Real[3]; + + const Real_type q = loop_data.scalar_Real[0]; + const Real_type r = loop_data.scalar_Real[1]; + const Real_type t = loop_data.scalar_Real[2]; + + for (auto _ : state) { + + for ( Index_type k=0 ; k< state.range(0) ; k++ ) { + x[k] = u[k] + r*( z[k] + r*y[k] ) + + t*( u[k+3] + r*( u[k+2] + r*u[k+1] ) + + t*( u[k+6] + q*( u[k+5] + q*u[k+4] ) ) ); + } + + } +} + +BENCHMARK(BM_EOS_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_ADI_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 8 -- ADI integration + ******************************************************************* + * DO 8 L = 1,Loop + * nl1 = 1 + * nl2 = 2 + * DO 8 kx = 2,3 + CDIR$ IVDEP + * DO 8 ky = 2,n + * DU1(ky)=U1(kx,ky+1,nl1) - U1(kx,ky-1,nl1) + * DU2(ky)=U2(kx,ky+1,nl1) - U2(kx,ky-1,nl1) + * DU3(ky)=U3(kx,ky+1,nl1) - U3(kx,ky-1,nl1) + * U1(kx,ky,nl2)=U1(kx,ky,nl1) +A11*DU1(ky) +A12*DU2(ky) +A13*DU3(ky) + * . + SIG*(U1(kx+1,ky,nl1) -2.*U1(kx,ky,nl1) +U1(kx-1,ky,nl1)) + * U2(kx,ky,nl2)=U2(kx,ky,nl1) +A21*DU1(ky) +A22*DU2(ky) +A23*DU3(ky) + * . + SIG*(U2(kx+1,ky,nl1) -2.*U2(kx,ky,nl1) +U2(kx-1,ky,nl1)) + * U3(kx,ky,nl2)=U3(kx,ky,nl1) +A31*DU1(ky) +A32*DU2(ky) +A33*DU3(ky) + * . 
+ SIG*(U3(kx+1,ky,nl1) -2.*U3(kx,ky,nl1) +U3(kx-1,ky,nl1)) + * 8 CONTINUE + */ + + LoopData& loop_data = getLoopData(); + + loopInit(ADI); + + Real_ptr du1 = loop_data.array_1D_Real[0]; + Real_ptr du2 = loop_data.array_1D_Real[1]; + Real_ptr du3 = loop_data.array_1D_Real[2]; + + Real_ptr** u1 = loop_data.array_3D_2xNx4_Real[0]; + Real_ptr** u2 = loop_data.array_3D_2xNx4_Real[1]; + Real_ptr** u3 = loop_data.array_3D_2xNx4_Real[2]; + + const Real_type sig = loop_data.scalar_Real[0]; + const Real_type a11 = loop_data.scalar_Real[1]; + const Real_type a12 = loop_data.scalar_Real[2]; + const Real_type a13 = loop_data.scalar_Real[3]; + const Real_type a21 = loop_data.scalar_Real[4]; + const Real_type a22 = loop_data.scalar_Real[5]; + const Real_type a23 = loop_data.scalar_Real[6]; + const Real_type a31 = loop_data.scalar_Real[7]; + const Real_type a32 = loop_data.scalar_Real[8]; + const Real_type a33 = loop_data.scalar_Real[9]; + + Index_type nl1 = 0; + Index_type nl2 = 1; + Index_type kx; + + for (auto _ : state) { + + for ( kx=1 ; kx<3 ; kx++ ) { + for (Index_type ky=1 ; ky< state.range(0) ; ky++ ) { + du1[ky] = u1[nl1][ky+1][kx] - u1[nl1][ky-1][kx]; + du2[ky] = u2[nl1][ky+1][kx] - u2[nl1][ky-1][kx]; + du3[ky] = u3[nl1][ky+1][kx] - u3[nl1][ky-1][kx]; + u1[nl2][ky][kx]= + u1[nl1][ky][kx]+a11*du1[ky]+a12*du2[ky]+a13*du3[ky] + sig* + (u1[nl1][ky][kx+1]-2.0*u1[nl1][ky][kx]+u1[nl1][ky][kx-1]); + u2[nl2][ky][kx]= + u2[nl1][ky][kx]+a21*du1[ky]+a22*du2[ky]+a23*du3[ky] + sig* + (u2[nl1][ky][kx+1]-2.0*u2[nl1][ky][kx]+u2[nl1][ky][kx-1]); + u3[nl2][ky][kx]= + u3[nl1][ky][kx]+a31*du1[ky]+a32*du2[ky]+a33*du3[ky] + sig* + (u3[nl1][ky][kx+1]-2.0*u3[nl1][ky][kx]+u3[nl1][ky][kx-1]); + } + } + + } +} + +BENCHMARK(BM_ADI_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_INT_PREDICT_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 9 -- integrate predictors + 
******************************************************************* + * DO 9 L = 1,Loop + * DO 9 i = 1,n + * PX( 1,i)= DM28*PX(13,i) + DM27*PX(12,i) + DM26*PX(11,i) + + * . DM25*PX(10,i) + DM24*PX( 9,i) + DM23*PX( 8,i) + + * . DM22*PX( 7,i) + C0*(PX( 5,i) + PX( 6,i))+ PX( 3,i) + * 9 CONTINUE + */ + + LoopData& loop_data = getLoopData(); + + loopInit(INT_PREDICT); + + Real_ptr* px = loop_data.array_2D_Nx25_Real[0]; + + const Real_type dm22 = loop_data.scalar_Real[0]; + const Real_type dm23 = loop_data.scalar_Real[1]; + const Real_type dm24 = loop_data.scalar_Real[2]; + const Real_type dm25 = loop_data.scalar_Real[3]; + const Real_type dm26 = loop_data.scalar_Real[4]; + const Real_type dm27 = loop_data.scalar_Real[5]; + const Real_type dm28 = loop_data.scalar_Real[6]; + const Real_type c0 = loop_data.scalar_Real[7]; + + for (auto _ : state) { + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + px[i][0] = dm28*px[i][12] + dm27*px[i][11] + dm26*px[i][10] + + dm25*px[i][ 9] + dm24*px[i][ 8] + dm23*px[i][ 7] + + dm22*px[i][ 6] + c0*( px[i][ 4] + px[i][ 5]) + px[i][ 2]; + } + + } +} + +BENCHMARK(BM_INT_PREDICT_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_DIFF_PREDICT_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 10 -- difference predictors + ******************************************************************* + * DO 10 L= 1,Loop + * DO 10 i= 1,n + * AR = CX(5,i) + * BR = AR - PX(5,i) + * PX(5,i) = AR + * CR = BR - PX(6,i) + * PX(6,i) = BR + * AR = CR - PX(7,i) + * PX(7,i) = CR + * BR = AR - PX(8,i) + * PX(8,i) = AR + * CR = BR - PX(9,i) + * PX(9,i) = BR + * AR = CR - PX(10,i) + * PX(10,i)= CR + * BR = AR - PX(11,i) + * PX(11,i)= AR + * CR = BR - PX(12,i) + * PX(12,i)= BR + * PX(14,i)= CR - PX(13,i) + * PX(13,i)= CR + * 10 CONTINUE + */ + + LoopData& loop_data = getLoopData(); + + loopInit(DIFF_PREDICT); + + Real_ptr* px = 
loop_data.array_2D_Nx25_Real[0]; + Real_ptr* cx = loop_data.array_2D_Nx25_Real[1]; + + for (auto _ : state) { + + for (Index_type i=0 ; i< state.range(0) ; i++ ) { + Real_type ar, br, cr; + ar = cx[i][ 4]; + br = ar - px[i][ 4]; + px[i][ 4] = ar; + cr = br - px[i][ 5]; + px[i][ 5] = br; + ar = cr - px[i][ 6]; + px[i][ 6] = cr; + br = ar - px[i][ 7]; + px[i][ 7] = ar; + cr = br - px[i][ 8]; + px[i][ 8] = br; + ar = cr - px[i][ 9]; + px[i][ 9] = cr; + br = ar - px[i][10]; + px[i][10] = ar; + cr = br - px[i][11]; + px[i][11] = br; + px[i][13] = cr - px[i][12]; + px[i][12] = cr; + } + + } +} + +BENCHMARK(BM_DIFF_PREDICT_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_FIRST_SUM_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 11 -- first sum + ******************************************************************* + * DO 11 L = 1,Loop + * X(1)= Y(1) + * DO 11 k = 2,n + * 11 X(k)= X(k-1) + Y(k) + */ + + LoopData& loop_data = getLoopData(); + + loopInit(FIRST_SUM); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + + for (auto _ :state) { + + x[0] = y[0]; + for (Index_type k=1 ; k< state.range(0) ; k++ ) { + x[k] = x[k-1] + y[k]; + } + + } +} + +BENCHMARK(BM_FIRST_SUM_RAW)->Arg(171)->Arg(5001)-> + Arg(44217)->Unit(benchmark::kMicrosecond); + +static void BM_FIRST_DIFF_RAW(benchmark::State& state) { + + /* + ******************************************************************* + * Kernel 12 -- first difference + ******************************************************************* + * DO 12 L = 1,Loop + * DO 12 k = 1,n + * 12 X(k)= Y(k+1) - Y(k) + */ + + LoopData& loop_data = getLoopData(); + + loopInit(FIRST_DIFF); + + Real_ptr x = loop_data.array_1D_Real[0]; + Real_ptr y = loop_data.array_1D_Real[1]; + + for (auto _ : state) { + + for (Index_type k=0 ; k< state.range(0) ; k++ ) { + x[k] = y[k+1] - y[k]; + } + + } +} + 
+BENCHMARK(BM_FIRST_DIFF_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 13 (2-D particle-in-cell), raw-loop variant; state.range(0) is the trip count of the kernel loop.
+static void BM_PIC_2D_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing):
+  /*
+   *******************************************************************
+   *   Kernel 13 -- 2-D PIC (Particle In Cell)
+   *******************************************************************
+   *       DO  13     L= 1,Loop
+   *       DO  13    ip= 1,n
+   *             i1= P(1,ip)
+   *             j1= P(2,ip)
+   *             i1= 1 + MOD2N(i1,64)
+   *             j1= 1 + MOD2N(j1,64)
+   *        P(3,ip)= P(3,ip) + B(i1,j1)
+   *        P(4,ip)= P(4,ip) + C(i1,j1)
+   *        P(1,ip)= P(1,ip) + P(3,ip)
+   *        P(2,ip)= P(2,ip) + P(4,ip)
+   *             i2= P(1,ip)
+   *             j2= P(2,ip)
+   *             i2= MOD2N(i2,64)
+   *             j2= MOD2N(j2,64)
+   *        P(1,ip)= P(1,ip) + Y(i2+32)
+   *        P(2,ip)= P(2,ip) + Z(j2+32)
+   *             i2= i2 + E(i2+32)
+   *             j2= j2 + F(j2+32)
+   *       H(i2,j2)= H(i2,j2) + 1.0
+   *  13  CONTINUE
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(PIC_2D);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr* p = loop_data.array_2D_Nx25_Real[0];
+  Real_ptr* b = loop_data.array_2D_Nx25_Real[1];
+  Real_ptr* c = loop_data.array_2D_Nx25_Real[2];
+  // NOTE(review): b and c are indexed below with two masked values (each at most 63); confirm the "Nx25" arrays are large enough in the second dimension.
+  Real_ptr y = loop_data.array_1D_Real[0];
+  Real_ptr z = loop_data.array_1D_Real[1];
+
+  Index_type* e = loop_data.array_1D_Indx[0];
+  Index_type* f = loop_data.array_1D_Indx[1];
+
+  Real_ptr* h = loop_data.array_2D_64x64_Real[0];
+
+  for (auto _ : state) {
+    // One full pass over the kernel loop per benchmark iteration.
+    for (Index_type ip=0 ; ip< state.range(0) ; ip++ ) {
+      Index_type i1, j1, i2, j2;
+      i1 = (Index_type) p[ip][0];
+      j1 = (Index_type) p[ip][1];
+      i1 &= 64-1;  // keep the low six bits; stands in for MOD2N(i1,64) (the reference adds 1 for 1-based indexing)
+      j1 &= 64-1;
+      p[ip][2] += b[j1][i1];
+      p[ip][3] += c[j1][i1];
+      p[ip][0] += p[ip][2];
+      p[ip][1] += p[ip][3];
+      i2 = (Index_type) p[ip][0];
+      j2 = (Index_type) p[ip][1];
+      i2 = ( i2 & 64-1 ) ;  // '-' binds tighter than '&', so this is i2 & 63
+      j2 = ( j2 & 64-1 ) ;
+      p[ip][0] += y[i2+32];
+      p[ip][1] += z[j2+32];
+      i2 += e[i2+32];
+      j2 += f[j2+32];
+      h[j2][i2] += 1.0;  // data-dependent target: the e/f offsets decide which h entry is incremented
+    }
+
+  }
+}
+// Registered for loop lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_PIC_2D_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 14 (1-D particle-in-cell), raw-loop variant; state.range(0) is the trip count of each of the three loops.
+static void BM_PIC_1D_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing):
+  /*
+   *******************************************************************
+   *   Kernel 14 -- 1-D PIC (Particle In Cell)
+   *******************************************************************
+   *       DO   14   L= 1,Loop
+   *       DO   141  k= 1,n
+   *             VX(k)= 0.0
+   *             XX(k)= 0.0
+   *             IX(k)= INT(  GRD(k))
+   *             XI(k)= REAL( IX(k))
+   *            EX1(k)= EX   ( IX(k))
+   *           DEX1(k)= DEX  ( IX(k))
+   *41    CONTINUE
+   *       DO   142  k= 1,n
+   *             VX(k)= VX(k) + EX1(k) + (XX(k) - XI(k))*DEX1(k)
+   *             XX(k)= XX(k) + VX(k)  + FLX
+   *             IR(k)= XX(k)
+   *             RX(k)= XX(k) - IR(k)
+   *             IR(k)= MOD2N(  IR(k),2048) + 1
+   *             XX(k)= RX(k) + IR(k)
+   *42    CONTINUE
+   *       DO  14    k= 1,n
+   *       RH(IR(k)  )= RH(IR(k)  ) + 1.0 - RX(k)
+   *       RH(IR(k)+1)= RH(IR(k)+1) + RX(k)
+   *14    CONTINUE
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(PIC_1D);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr vx = loop_data.array_1D_Real[0];
+  Real_ptr xx = loop_data.array_1D_Real[1];
+  Real_ptr xi = loop_data.array_1D_Real[2];
+  Real_ptr ex = loop_data.array_1D_Real[3];
+  Real_ptr ex1 = loop_data.array_1D_Real[4];
+  Real_ptr dex = loop_data.array_1D_Real[5];
+  Real_ptr dex1 = loop_data.array_1D_Real[6];
+  Real_ptr rh = loop_data.array_1D_Real[7];
+  Real_ptr rx = loop_data.array_1D_Real[8];
+
+  const Real_type flx = loop_data.scalar_Real[0];
+
+  Index_type* ix = loop_data.array_1D_Indx[2];
+  Index_type* ir = loop_data.array_1D_Indx[3];
+  Index_type* grd = loop_data.array_1D_Indx[4];
+
+  // Each benchmark iteration runs the three loops below back to back.
+  for (auto _ : state) {
+    // Loop 1: reset vx/xx and gather ex/dex through the index array ix.
+    for (Index_type k=0 ; k< state.range(0) ; k++ ) {
+      vx[k] = 0.0;
+      xx[k] = 0.0;
+      ix[k] = (Index_type) grd[k];  // grd is already Index_type*, so this cast does not change the value
+      xi[k] = (Real_type) ix[k];
+      ex1[k] = ex[ ix[k] - 1 ];  // the -1 matches the reference's 1-based EX(IX(k))
+      dex1[k] = dex[ ix[k] - 1 ];
+    }
+    // Loop 2: advance vx/xx, then split xx into integer part ir and remainder rx.
+    for (Index_type k=0 ; k< state.range(0) ; k++ ) {
+      vx[k] = vx[k] + ex1[k] + ( xx[k] - xi[k] )*dex1[k];
+      xx[k] = xx[k] + vx[k] + flx;
+      ir[k] = (Index_type) xx[k];
+      rx[k] = xx[k] - ir[k];
+      ir[k] = ( ir[k] & (2048-1) ) + 1;  // MOD2N(ir,2048) + 1, so ir[k] ends up in 1..2048
+      xx[k] = rx[k] + ir[k];
+    }
+    // Loop 3: accumulate into two neighbouring rh entries selected by ir[k] (indices 0..2048).
+    for (Index_type k=0 ; k< state.range(0) ; k++ ) {
+      rh[ ir[k]-1 ] += 1.0 - rx[k];
+      rh[ ir[k] ] += rx[k];
+    }
+
+  }
+}
+
+BENCHMARK(BM_PIC_1D_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);  // loop lengths 171, 5001, 44217; times reported in microseconds
+// Kernel 18 (2-D explicit hydrodynamics), raw-loop variant.
+static void BM_HYDRO_2D_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing):
+  /*
+   *******************************************************************
+   *   Kernel 18 - 2-D explicit hydrodynamics fragment
+   *******************************************************************
+   *       DO 75  L= 1,Loop
+   *              T= 0.0037
+   *              S= 0.0041
+   *             KN= 6
+   *             JN= n
+   *       DO 70  k= 2,KN
+   *       DO 70  j= 2,JN
+   *        ZA(j,k)= (ZP(j-1,k+1)+ZQ(j-1,k+1)-ZP(j-1,k)-ZQ(j-1,k))
+   *    .            *(ZR(j,k)+ZR(j-1,k))/(ZM(j-1,k)+ZM(j-1,k+1))
+   *        ZB(j,k)= (ZP(j-1,k)+ZQ(j-1,k)-ZP(j,k)-ZQ(j,k))
+   *    .            *(ZR(j,k)+ZR(j,k-1))/(ZM(j,k)+ZM(j-1,k))
+   * 70    CONTINUE
+   *       DO 72  k= 2,KN
+   *       DO 72  j= 2,JN
+   *        ZU(j,k)= ZU(j,k)+S*(ZA(j,k)*(ZZ(j,k)-ZZ(j+1,k))
+   *    .                   -ZA(j-1,k) *(ZZ(j,k)-ZZ(j-1,k))
+   *    .                   -ZB(j,k)   *(ZZ(j,k)-ZZ(j,k-1))
+   *    .                   +ZB(j,k+1) *(ZZ(j,k)-ZZ(j,k+1)))
+   *        ZV(j,k)= ZV(j,k)+S*(ZA(j,k)*(ZR(j,k)-ZR(j+1,k))
+   *    .                   -ZA(j-1,k) *(ZR(j,k)-ZR(j-1,k))
+   *    .                   -ZB(j,k)   *(ZR(j,k)-ZR(j,k-1))
+   *    .                   +ZB(j,k+1) *(ZR(j,k)-ZR(j,k+1)))
+   * 72    CONTINUE
+   *       DO 75  k= 2,KN
+   *       DO 75  j= 2,JN
+   *        ZR(j,k)= ZR(j,k)+T*ZU(j,k)
+   *        ZZ(j,k)= ZZ(j,k)+T*ZV(j,k)
+   * 75    CONTINUE
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(HYDRO_2D);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr* za = loop_data.array_2D_7xN_Real[0];
+  Real_ptr* zb = loop_data.array_2D_7xN_Real[1];
+  Real_ptr* zm = loop_data.array_2D_7xN_Real[2];
+  Real_ptr* zp = loop_data.array_2D_7xN_Real[3];
+  Real_ptr* zq = loop_data.array_2D_7xN_Real[4];
+  Real_ptr* zr = loop_data.array_2D_7xN_Real[5];
+  Real_ptr* zu = loop_data.array_2D_7xN_Real[6];
+  Real_ptr* zv = loop_data.array_2D_7xN_Real[7];
+  Real_ptr* zz = loop_data.array_2D_7xN_Real[8];
+
+  Real_ptr* zrout = loop_data.array_2D_7xN_Real[9];
+  Real_ptr* zzout = loop_data.array_2D_7xN_Real[10];
+
+  const Real_type t = 0.0037;
+  const Real_type s = 0.0041;
+
+  Index_type kn = 6;
+  Index_type jn = state.range(0);
+  Index_type k;
+
+  for (auto _ : state) {
+    // NOTE(review): the text from here to the end of this function (loop nest, closing braces, start of BENCHMARK(BM_HYDRO_2D_RAW)) is missing; the next two lines are kept exactly as found -- restore from the upstream LCALS source.
+    for ( k=1 ; kArg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 19 (general linear recurrence), raw-loop variant; state.range(0) is the length of both sweeps.
+static void BM_GEN_LIN_RECUR_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing):
+  /*
+   *******************************************************************
+   *   Kernel 19 -- general linear recurrence equations
+   *******************************************************************
+   *               KB5I= 0
+   *           DO 194 L= 1,Loop
+   *           DO 191 k= 1,n
+   *         B5(k+KB5I)= SA(k) +STB5*SB(k)
+   *               STB5= B5(k+KB5I) -STB5
+   *191        CONTINUE
+   *192        DO 193 i= 1,n
+   *                  k= n-i+1
+   *         B5(k+KB5I)= SA(k) +STB5*SB(k)
+   *               STB5= B5(k+KB5I) -STB5
+   *193        CONTINUE
+   *194 CONTINUE
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(GEN_LIN_RECUR);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr b5 = loop_data.array_1D_Real[0];
+  Real_ptr sa = loop_data.array_1D_Real[1];
+  Real_ptr sb = loop_data.array_1D_Real[2];
+
+  Real_type stb5 = loop_data.scalar_Real[0];  // declared outside the timing loop, so its value carries over between benchmark iterations
+
+  Index_type kb5i = 0;  // constant offset, always zero in this function
+
+  for (auto _ : state) {
+    // Forward sweep: k = 0 .. n-1; each step depends on the stb5 left by the previous one.
+    for ( Index_type k=0 ; k< state.range(0) ; k++ ) {
+      b5[k+kb5i] = sa[k] + stb5*sb[k];
+      stb5 = b5[k+kb5i] - stb5;
+    }
+    // Backward sweep: k runs from n-1 down to 0 with the same update.
+    for (Index_type i=1 ; i<= state.range(0) ; i++ ) {
+      Index_type k = state.range(0) - i ;
+      b5[k+kb5i] = sa[k] + stb5*sb[k];
+      stb5 = b5[k+kb5i] - stb5;
+    }
+
+  }
+}
+
+BENCHMARK(BM_GEN_LIN_RECUR_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 20 (discrete ordinates transport), raw-loop variant; state.range(0) is the trip count of the kernel loop.
+static void BM_DISC_ORD_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing):
+  /*
+   *******************************************************************
+   *   Kernel 20 -- Discrete ordinates transport, cond recurrence on xx
+   *******************************************************************
+   *       DO 20 L= 1,Loop
+   *       DO 20 k= 1,n
+   *            DI= Y(k)-G(k)/( XX(k)+DK)
+   *            DN= 0.2
+   *            IF( DI.NE.0.0) DN= MAX( S,MIN( Z(k)/DI, T))
+   *          X(k)= ((W(k)+V(k)*DN)* XX(k)+U(k))/(VX(k)+V(k)*DN)
+   *       XX(k+1)= (X(k)- XX(k))*DN+ XX(k)
+   * 20    CONTINUE
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(DISC_ORD);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr x = loop_data.array_1D_Real[0];
+  Real_ptr y = loop_data.array_1D_Real[1];
+  Real_ptr z = loop_data.array_1D_Real[2];
+  Real_ptr u = loop_data.array_1D_Real[3];
+  Real_ptr v = loop_data.array_1D_Real[4];
+  Real_ptr w = loop_data.array_1D_Real[5];
+  Real_ptr g = loop_data.array_1D_Real[6];
+  Real_ptr xx = loop_data.array_1D_Real[7];
+  Real_ptr vx = loop_data.array_1D_Real[9];  // slot 8 is skipped here
+  const Real_type s = loop_data.scalar_Real[0];
+  const Real_type t = loop_data.scalar_Real[1];
+  const Real_type dk = loop_data.scalar_Real[2];
+
+  for (auto _ : state) {
+    // xx[k+1] is written from xx[k], so iterations depend on each other; the last one writes xx[state.range(0)].
+    for (Index_type k=0 ; k< state.range(0) ; k++ ) {
+      Real_type di = y[k] - g[k] / ( xx[k] + dk );
+      Real_type dn = 0.2;
+      if ( di ) {  // same test as DI.NE.0.0 in the reference
+        dn = z[k]/di ;
+        if ( t < dn ) dn = t;  // MIN( Z(k)/DI, T )
+        if ( s > dn ) dn = s;  // MAX( S, ... )
+      }
+      x[k] = ( ( w[k] + v[k]*dn )* xx[k] + u[k] ) / ( vx[k] + v[k]*dn );
+      xx[k+1] = ( x[k] - xx[k] )* dn + xx[k];
+    }
+
+  }
+}
+
+BENCHMARK(BM_DISC_ORD_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 21 (matrix*matrix product), raw-loop variant; state.range(0) is the trip count of the innermost loop.
+static void BM_MAT_X_MAT_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing; the code below uses the transposed index order):
+  /*
+   *******************************************************************
+   *   Kernel 21 -- matrix*matrix product
+   *******************************************************************
+   *       DO 21 L= 1,Loop
+   *       DO 21 k= 1,25
+   *       DO 21 i= 1,25
+   *       DO 21 j= 1,n
+   *       PX(i,j)= PX(i,j) +VY(i,k) * CX(k,j)
+   * 21    CONTINUE
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(MAT_X_MAT);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr* px = loop_data.array_2D_Nx25_Real[0];
+  Real_ptr* cx = loop_data.array_2D_Nx25_Real[1];
+  Real_ptr* vy = loop_data.array_2D_64x64_Real[0];  // only indices 0..24 are used in each dimension
+
+  Index_type k, i;
+
+  for (auto _ : state) {
+    // px is accumulated with += and is not reset inside this function between benchmark iterations.
+    for ( k=0 ; k<25 ; k++ ) {
+      for ( i=0 ; i<25 ; i++ ) {
+        for (Index_type j=0 ; j< state.range(0) ; j++ ) {
+          px[j][i] += vy[k][i] * cx[j][k];
+        }
+      }
+    }
+
+  }
+}
+
+BENCHMARK(BM_MAT_X_MAT_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 22 (Planckian distribution), raw-loop variant; state.range(0) is the trip count of the kernel loop.
+static void BM_PLANCKIAN_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing):
+  /*
+   *******************************************************************
+   *   Kernel 22 -- Planckian distribution
+   *******************************************************************
+   *       EXPMAX= 20.0
+   *       U(n)= 0.99*EXPMAX*V(n)
+   *       DO 22 L= 1,Loop
+   *       DO 22 k= 1,n
+   *       Y(k)= U(k)/V(k)
+   *       W(k)= X(k)/( EXP( Y(k)) -1.0)
+   * 22    CONTINUE
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(PLANCKIAN);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr x = loop_data.array_1D_Real[0];
+  Real_ptr y = loop_data.array_1D_Real[1];
+  Real_ptr u = loop_data.array_1D_Real[2];
+  Real_ptr v = loop_data.array_1D_Real[3];
+  Real_ptr w = loop_data.array_1D_Real[4];
+
+  Real_type expmax = 20.0;
+  u[state.range(0)-1] = 0.99*expmax*v[state.range(0)-1];  // one-time setup of the last element, done outside the timed loop
+
+  for (auto _ : state) {
+    // Each k is independent: one divide and one exp() per element.
+    for (Index_type k=0 ; k< state.range(0) ; k++ ) {
+      y[k] = u[k] / v[k];
+      w[k] = x[k] / ( exp( y[k] ) -1.0 );
+    }
+
+  }
+}
+
+BENCHMARK(BM_PLANCKIAN_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 23 (2-D implicit hydrodynamics), raw-loop variant; state.range(0) bounds the inner k loop.
+static void BM_IMP_HYDRO_2D_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing; the code below uses the transposed index order):
+  /*
+   *******************************************************************
+   *   Kernel 23 -- 2-D implicit hydrodynamics fragment
+   *******************************************************************
+   *       DO 23  L= 1,Loop
+   *       DO 23  j= 2,6
+   *       DO 23  k= 2,n
+   *             QA= ZA(k,j+1)*ZR(k,j) +ZA(k,j-1)*ZB(k,j) +
+   *    .            ZA(k+1,j)*ZU(k,j) +ZA(k-1,j)*ZV(k,j) +ZZ(k,j)
+   * 23  ZA(k,j)= ZA(k,j) +.175*(QA -ZA(k,j))
+   */
+  // All working arrays are taken from the shared LoopData instance.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(IMP_HYDRO_2D);  // defined elsewhere; presumably (re)initializes the data for this kernel -- TODO confirm
+
+  Real_ptr* za = loop_data.array_2D_7xN_Real[0];
+  Real_ptr* zb = loop_data.array_2D_7xN_Real[1];
+  Real_ptr* zr = loop_data.array_2D_7xN_Real[2];
+  Real_ptr* zu = loop_data.array_2D_7xN_Real[3];
+  Real_ptr* zv = loop_data.array_2D_7xN_Real[4];
+  Real_ptr* zz = loop_data.array_2D_7xN_Real[5];
+
+  Index_type j;
+
+  for (auto _ : state) {
+    // za is updated in place, so each point reads neighbours already updated in this sweep (za[j-1][k], za[j][k-1]).
+    for ( j=1 ; j<6 ; j++ ) {
+      for ( Index_type k=1 ; k< state.range(0) ; k++ ) {  // NOTE(review): za[j][k+1] reaches column state.range(0); confirm the rows are allocated with that extra element
+        Real_type qa = za[j+1][k]*zr[j][k] + za[j-1][k]*zb[j][k] +
+                       za[j][k+1]*zu[j][k] + za[j][k-1]*zv[j][k] + zz[j][k];
+        za[j][k] += 0.175*( qa - za[j][k] );
+      }
+    }
+
+  }
+}
+
+BENCHMARK(BM_IMP_HYDRO_2D_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
+// Kernel 24 (location of the first minimum), raw-loop variant; state.range(0) is the number of elements scanned.
+static void BM_FIND_FIRST_MIN_RAW(benchmark::State& state) {
+  // Fortran reference for this kernel (1-based indexing):
+  /*
+   *******************************************************************
+   *   Kernel 24 -- find location of first minimum in array
+   *******************************************************************
+   *       X( n/2)= -1.0E+10
+   *       DO 24  L= 1,Loop
+   *              m= 1
+   *       DO 24  k= 2,n
+   *       IF( X(k).LT.X(m))  m= k
+   * 24    CONTINUE
+   */
+  // NOTE(review): the reference plants a minimum at X(n/2); this function does not, so that is presumably left to loopInit -- TODO confirm.
+  LoopData& loop_data = getLoopData();
+
+  loopInit(FIND_FIRST_MIN);
+
+  Real_ptr x = loop_data.array_1D_Real[0];
+
+  Index_type m = 0;
+  Index_type val = 0;  // NOTE(review): never used after this initialization
+
+  for (auto _ : state) {
+    // m is the index of the smallest element seen so far; the strict '<' keeps the first of equal minima.
+    m = 0;
+    for (Index_type k=1 ; k< state.range(0) ; k++ ) {
+      if ( x[k] < x[m] ) benchmark::DoNotOptimize(m = k);  // DoNotOptimize keeps the otherwise-unused result from being optimized away
+    }
+
+  }
+}
+
+BENCHMARK(BM_FIND_FIRST_MIN_RAW)->Arg(171)->Arg(5001)->
+    Arg(44217)->Unit(benchmark::kMicrosecond);
Index: MicroBenchmarks/LCALS/SubsetDataA.hxx
===================================================================
--- /dev/null
+++
MicroBenchmarks/LCALS/SubsetDataA.hxx @@ -0,0 +1,168 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Header file defining macros, routines, structures used in Loop Subset A. +// + +#ifndef SubsetDataA_HXX +#define SubsetDataA_HXX + +// +// Some macros used in kernels to mimic real code usage. +// +#define NDPTRSET(v,v0,v1,v2,v3,v4,v5,v6,v7) \ + v0 = v ; \ + v1 = v0 + 1 ; \ + v2 = v0 + domain.jp ; \ + v3 = v1 + domain.jp ; \ + v4 = v0 + domain.kp ; \ + v5 = v1 + domain.kp ; \ + v6 = v2 + domain.kp ; \ + v7 = v3 + domain.kp ; + +#define NDSET2D(v,v1,v2,v3,v4) \ + v4 = v ; \ + v1 = v4 + 1 ; \ + v2 = v1 + domain.jp ; \ + v3 = v4 + domain.jp ; + +#define zabs2(z) ( real(z)*real(z)+imag(z)*imag(z) ) + + +// +// Domain structure to mimic structured mesh loops in real codes. +// +struct ADomain +{ + ADomain( int ilen, Index_type ndims ) + : ndims(ndims), NPNL(2), NPNR(1) + { + Index_type rzmax; + switch ( ilen ) { + case LONG : { + if ( ndims == 2 ) { + rzmax = 156 * loop_length_factor; + } else if ( ndims == 3 ) { + rzmax = 28 * loop_length_factor; + } + break; + } + case MEDIUM : { + if ( ndims == 2 ) { + rzmax = 64 * loop_length_factor; + } else if ( ndims == 3 ) { + rzmax = 16 * loop_length_factor; + } + break; + } + case SHORT : { + if ( ndims == 2 ) { + rzmax = 8 * loop_length_factor; + } else if ( ndims == 3 ) { + rzmax = 4 * loop_length_factor; + } + break; + } + + default : { } + } + + imin = NPNL; + jmin = NPNL; + imax = rzmax + NPNR; + jmax = rzmax + NPNR; + jp = imax - imin + 1 + NPNL + NPNR; + + if ( ndims == 2 ) { + kmin = 0; + kmax = 0; + kp = 0; + nnalls = jp * (jmax - jmin + 1 + NPNL + NPNR) ; + } else if ( ndims == 3 ) { + kmin = NPNL; + kmax = rzmax + NPNR; + kp = jp * (jmax - jmin + 1 + NPNL + NPNR); + nnalls = kp * (kmax - kmin + 1 + NPNL + NPNR) ; + } + + fpn = 0; + lpn = nnalls - 1; + frn = fpn + NPNL * (kp + jp) + NPNL; + lrn = lpn - NPNR * (kp + jp) - NPNR; + + fpz = frn - jp - kp - 1; + lpz = lrn; + + 
real_zones = new Index_type[nnalls]; + for (Index_type i = 0; i < nnalls; ++i) real_zones[i] = -1; + + n_real_zones = 0; + + if ( ndims == 2 ) { + + for (Index_type j = jmin; j < jmax; j++) { + for (Index_type i = imin; i < imax; i++) { + Index_type ip = i + j*jp ; + + Index_type id = n_real_zones; + real_zones[id] = ip; + n_real_zones++; + } + } + + } else if ( ndims == 3 ) { + + for (Index_type k = kmin; k < kmax; k++) { + for (Index_type j = jmin; j < jmax; j++) { + for (Index_type i = imin; i < imax; i++) { + Index_type ip = i + j*jp + kp*k ; + + Index_type id = n_real_zones; + real_zones[id] = ip; + n_real_zones++; + } + } + } + + } + + } + + ~ADomain() + { + if (real_zones) delete [] real_zones; + } + + static double loop_length_factor; + + Index_type ndims; + Index_type NPNL; + Index_type NPNR; + + Index_type imin; + Index_type jmin; + Index_type kmin; + Index_type imax; + Index_type jmax; + Index_type kmax; + + Index_type jp; + Index_type kp; + Index_type nnalls; + + Index_type fpn; + Index_type lpn; + Index_type frn; + Index_type lrn; + + Index_type fpz; + Index_type lpz; + + Index_type* real_zones; + Index_type n_real_zones; +}; + + +#endif // closing endif for header file include guard Index: MicroBenchmarks/LCALS/SubsetDataB.hxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/SubsetDataB.hxx @@ -0,0 +1,30 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Header file defining macros, routines, structures used in Loop Subset B. +// + +#ifndef SubsetDataB_HXX +#define SubsetDataB_HXX + +namespace { + +// +// Function used in TRAP_INT loop. 
+// +LCALS_INLINE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + +} // closing brace for unnamed namespace + +#endif // closing endif for header file include guard Index: MicroBenchmarks/LCALS/main.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/main.cxx @@ -0,0 +1,407 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// +// This code is under continuing development. Go to http://codesign.llnl.gov +// to acquire the latest released version. +// + +// +// Main program for LCALS suite. +// + +#include + +#include +#include +#include + +#include + +#include "LCALSSuite.hxx" +#include + +int main(int argc, char *argv[]) +{ + + // + // Define some variables used to define part of suite execution. + // + bool do_fom = true; + bool run_misc = false; + bool input_error = false; + std::string output_dirname; + + // + // Process command line args and report correct usage if necessary. + // + // if (argc == 1) no args to check... +#ifdef TESTSUITE + if (argc > 1) { + std::string arg = argv[1]; + if ( arg == "-misc" ) { + run_misc = true; + } else { + output_dirname = argv[1]; + } + } + + if (argc > 2) { + if ( !run_misc ) { + input_error = true; + } else { + output_dirname = argv[2]; + } + } + + if ( argc > 3) { + input_error = true; + } + + if ( !input_error ) { + + if ( !output_dirname.empty() && !recursiveMkdir(output_dirname) ) { + std::cout << "Problem with given output directory name." << std::endl; + std::cout << "No file output will be generated." 
<< std::endl; + } + + } else { + std::cout << "ERROR RUNNING EXECUTABLE!\n\n"; + std::cout << "CORRECT USAGE:\n"; + std::cout << "\t" << argv[0] + << " -misc , both args optional\n\n" + << "\tIf '-misc' option is given, " + << "benchmark variants plus others may be run.\n" + << "\tActual loop variants to run are set below using the\n" + << "\tvector 'run_variants'. Note that the compiler switch\n" + << "\tin the Makefile may be required for full compilation.\n\n" + << "\tWhen no output directory is provided,\n" + << "\trun summary will be printed to standard output\n" + << "\tIf directory name is provided, execution summary and\n" + << "\ttext files suitable for importing into MS Excel will\n" + << "\tbe written there." << std::endl; + exit(-1); + return -1; + } +#endif + + // + // Define some parameters specifying how suite of loops will execute. + // + // See README-LCALS_instructions.txt file for additional description of how + // to control compilation and execution of loop suite. + // + unsigned num_suite_passes = 1; +#if defined(LCALS_VERIFY_CHECKSUM_ABBREVIATED) + // + // When verifying checksums, we only take one pass through the suite of loops + // as this is sufficient. + // + num_suite_passes = num_checksum_suite_passes; +#endif + + // + // Specify fraction of pre-defined loop sample counts to use. + // Smaller value reduces total run time. However, a value too + // small will result in inaccurate timings. + // + double sample_frac = 1.0; + + // + // Specify multiplication factor used to deviate from pre-defined loop + // lengths to use. For example, setting factor to 'a' will roughly + // multiply the length of "1D" loops by a and will multiply total number + // of iterations of "domain-based" loops by a^N, where N is the + // spatial dimension of the domain used by the loop. + // + double loop_length_factor = 1.0; + + // + // Specify which loops lengths to run by true/false + // value in 'run_loop_length' array. 
+ // + bool run_loop_length[NUM_LENGTHS]; + run_loop_length[LONG] = true; + run_loop_length[MEDIUM] = true; + run_loop_length[SHORT] = true; + + + // + // Specify loop kernels to run by true/false value in 'run_loop' array. + // + // NOTE: If COMPILE_* macro constant associated with each lernel + // is not defined, then those kernels will not be compiled + // and thus will not be run. + // + bool run_loop[NUM_LOOP_KERNELS]; + for (unsigned iloop = 0; iloop < NUM_LOOP_KERNELS; ++iloop) { + run_loop[iloop] = false; + } + + +#if defined(LCALS_DO_OMP_ONLY) + + // Loop Subset A: Loops extracted from LLNL app codes. + run_loop[PRESSURE_CALC ] = true; + run_loop[PRESSURE_CALC_ALT ] = true; + run_loop[ENERGY_CALC ] = true; + run_loop[ENERGY_CALC_ALT ] = true; + run_loop[VOL3D_CALC ] = true; + run_loop[DEL_DOT_VEC_2D] = true; + run_loop[COUPLE ] = true; + run_loop[FIR ] = true; + + // Loop Subset B: "Basic" Loops. + run_loop[INIT3 ] = true; + run_loop[MULADDSUB ] = true; + run_loop[IF_QUAD ] = true; + run_loop[TRAP_INT ] = true; + + // Loop Subset C: Loops from older Livermore Loops in "C" suite. + run_loop[PIC_2D ] = true; + +#else // else run all loop kernels + + // Loop Subset A: Loops extracted from LLNL app codes. + run_loop[PRESSURE_CALC ] = true; + run_loop[ENERGY_CALC ] = true; + run_loop[VOL3D_CALC ] = true; + run_loop[DEL_DOT_VEC_2D] = true; + run_loop[COUPLE ] = true; + run_loop[FIR ] = true; + + // Loop Subset B: "Basic" Loops. + run_loop[INIT3 ] = true; + run_loop[MULADDSUB ] = true; + run_loop[IF_QUAD ] = true; + run_loop[TRAP_INT ] = true; + + // Loop Subset C: Loops from older Livermore Loops in "C" suite. 
+ run_loop[HYDRO_1D ] = true; + run_loop[ICCG ] = true; + run_loop[INNER_PROD ] = true; + run_loop[BAND_LIN_EQ ] = true; + run_loop[TRIDIAG_ELIM ] = true; + run_loop[EOS ] = true; + run_loop[ADI ] = true; + run_loop[INT_PREDICT ] = true; + run_loop[DIFF_PREDICT ] = true; + run_loop[FIRST_SUM ] = true; + run_loop[FIRST_DIFF ] = true; + run_loop[PIC_2D ] = true; + run_loop[PIC_1D ] = true; + run_loop[HYDRO_2D ] = true; + run_loop[GEN_LIN_RECUR ] = true; + run_loop[DISC_ORD ] = true; + run_loop[MAT_X_MAT ] = true; + run_loop[PLANCKIAN ] = true; + run_loop[IMP_HYDRO_2D ] = true; + run_loop[FIND_FIRST_MIN] = true; + +#endif + + + // + // Specify which loop variants are executed. To run different loop variants, + // change which enum values are pushed onto the run-variants vector here. + // + // IMPORTANT: The first variant added is used as the reference + // variant for reporting relative execution timing data + // and checksum comparisons. + // + std::vector run_variants; + if ( !run_misc ) { + // + // These variants comprose the LCALS benchmark. + // + +#if defined(LCALS_DO_OMP_ONLY) + + run_variants.push_back(RAW_OMP); + run_variants.push_back(FORALL_LAMBDA_OMP); + +#else // run other variants in addition to OMP variants + + run_variants.push_back(RAW); + run_variants.push_back(FORALL_LAMBDA); + run_variants.push_back(RAW_OMP); + run_variants.push_back(FORALL_LAMBDA_OMP); + +#endif + + } else { + // + // These variants are used for miscellaneous studies. + // + +#if defined(LCALS_DO_OMP_ONLY) + + run_variants.push_back(RAW_OMP); + run_variants.push_back(FORALL_LAMBDA_OMP); +#if defined(LCALS_DO_MISC) + run_variants.push_back(FORALL_FUNCTOR_OMP); +// run_variants.push_back(FORALL_LAMBDA_OMP_TYPEFIX); +#endif // if LCALS_DO_MISC + + +#else // run other variants in addition to OMP variants + + // + // Bechmark variants. 
+ // + run_variants.push_back(RAW); + run_variants.push_back(FORALL_LAMBDA); +// run_variants.push_back(RAW_OMP); +// run_variants.push_back(FORALL_LAMBDA_OMP); + + // + // Other available loop variants. + // +#if defined(LCALS_DO_MISC) +// run_variants.push_back(FORALL_HYBRID_LAMBDA); + +// run_variants.push_back(FORALL_FUNCTOR); +// run_variants.push_back(FORALL_FUNCTOR_OMP); + +// run_variants.push_back(RAW_FUNC); + +// run_variants.push_back(FORALL_LAMBDA_TYPEFIX); +// run_variants.push_back(FORALL_LAMBDA_OMP_TYPEFIX); +// run_variants.push_back(FORALL_HYBRID_LAMBDA_TYPEFIX); +#endif // if LCALS_DO_MISC + +#endif + + } + + + // + // Obtain and report hostname. + // + const int host_namelen = 64; + char host[host_namelen]; + gethostname( host, host_namelen ); + std::string host_name(host); + +#ifdef TESTSUITE + std::cout << "\n Running loop suite on " << host_name << std::endl; +#endif + // + // Specify size in bytes of largest data cache level on machine so that + // caches can be properly flushed between execution of different loops. + // + + CacheIndex_type cache_size = 0; + if ( host_name.find("rzalastor") != std::string::npos ) { + cache_size = 12000000; // 12MB on rzalastor + } else if ( host_name.find("rzmerl") != std::string::npos ) { + cache_size = 20000000; // 20MB on rzmerl + } else if ( host_name.find("dawn") != std::string::npos ) { + cache_size = 8000000; // 8MB on dawn/rzdawndev + } else if ( host_name.find("rzuseq") != std::string::npos || + host_name.find("vulcan") != std::string::npos || + host_name.find("sequoia") != std::string::npos ) { + cache_size = 32000000; // 32MB on BG/Q + } +#ifdef TESTSUITE + else { + std::cout << "\n WARNING: unknown system cache size. " + << "Timing results may be suspect!!" << std::endl; + } +#endif + + + // + // Allocate data for running loops and generating execution timings. + // Also, set structures that define how loops will be run. 
+ // + allocateLoopSuiteRunInfo(host_name, + NUM_LOOP_KERNELS, + NUM_LENGTHS, + num_suite_passes, + run_loop_length, + cache_size); + + + defineLoopSuiteRunInfo( run_variants, run_loop, sample_frac, + loop_length_factor ); + + allocateLoopData(); + + + if (do_fom) { + // + // Compute reference times for figure of merit (FOM) calculation. + // + computeReferenceLoopTimes(); + } + + /*************** TEST SUITE **************** + * * + * Using google benchmark as test Runner * + * * + *******************************************/ + + + ::benchmark::Initialize(&argc, argv); + if(::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; + ::benchmark::RunSpecifiedBenchmarks(); + + +#ifdef TESTSUITE + // Run loops, record timings, etc. + // + for (unsigned ipass = 0; ipass < num_suite_passes; ++ipass) { + std::cout << "\n run suite: pass = " << ipass << std::endl; + + for (unsigned ivariant = 0; ivariant < run_variants.size(); ++ivariant) { + + std::string loop_variant_name = + getVariantName(run_variants[ivariant]); + + std::cout << "\t run loop variant ---> " + << loop_variant_name << std::endl; + + for (unsigned ilen = 0; ilen < NUM_LENGTHS; ++ilen) { + + if (run_loop_length[ilen]) { + + LoopLength rilen = static_cast(ilen); + + runLoopVariant(run_variants[ivariant], run_loop, rilen) ; + + } // if loop length is run + + } // iterate over loop lengths + + } // iterate over loop variants + + } // iterate over loop suite passes +#endif + +#ifdef TESTSUITE + // + // Generate report(s). + // + std::cout << "\n generate reports...." << std::endl; + + std::vector run_variant_names = getVariantNames(run_variants); + + generateTimingReport(run_variant_names, output_dirname); + generateChecksumReport(run_variant_names, output_dirname); + generateFOMReport(run_variant_names, output_dirname); +#endif + + // + // Clean up. + // + freeLoopData(); +#ifdef TESTSUITE + std::cout << "\n freeLoopSuiteRunInfo..." 
<< std::endl; +#endif + freeLoopSuiteRunInfo(); +#ifdef TESTSUITE + std::cout << "\n DONE!!! " << std::endl; +#endif + return 0 ; +} + Index: MicroBenchmarks/LCALS/runReferenceLoops.cxx =================================================================== --- /dev/null +++ MicroBenchmarks/LCALS/runReferenceLoops.cxx @@ -0,0 +1,172 @@ +// +// See README-LCALS_license.txt for access and distribution restrictions +// + +// +// Source file with routines to generate reference loop times for +// figure of merit (FOM) calculations. +// + +#include "LCALSSuite.hxx" +#include "LCALSStats.hxx" + +#include +#include + +// +// Prototypes for file scope routines containing reference loops +// + +namespace { + +void runReferenceLoop0(LoopStat& lstat, unsigned ilen); +void runReferenceLoop1(LoopStat& lstat, unsigned ilen); + +} // closing brace for unnamed namespace + + +// +// Define reference loop information. +// +// Note: That this may need to be tweaked in the future. +// +void defineReferenceLoopRunInfo() +{ + LoopSuiteRunInfo& suite_info = getLoopSuiteRunInfo(); + + suite_info.ref_loop_stat = LoopStat(NUM_LENGTHS); + LoopStat& ref_loop_stat = suite_info.ref_loop_stat; + + ref_loop_stat.loop_length[LONG] = 24336; + ref_loop_stat.loop_length[MEDIUM] = 3844; + ref_loop_stat.loop_length[SHORT] = 64; + ref_loop_stat.samples_per_pass[LONG] = 30000; + ref_loop_stat.samples_per_pass[MEDIUM] = 300000; + ref_loop_stat.samples_per_pass[SHORT] = 50000000; +} + + +// +// Execute reference loops. The intent is to generate a time for +// fast loops that any compile should be able to optimize well. +// We run two reference loops and take the min execution time. +// This time is used as a reference against which to compre the +// execution times of other loops for figure of merit computation. +// +// Note: That this may need to be tweaked in the future. +// +void computeReferenceLoopTimes() +{ +#ifdef TESTSUITE + std::cout << "\n computeReferenceLoopTimes..." 
<< std::endl; +#endif + LoopSuiteRunInfo& suite_info = getLoopSuiteRunInfo(); + LoopStat& ref_loop_stat = suite_info.ref_loop_stat; + + + LoopStat lstat0(suite_info.num_loop_lengths); + lstat0 = ref_loop_stat; + for (unsigned ilen = 0; ilen < NUM_LENGTHS; ++ilen) { + runReferenceLoop0(lstat0, ilen); + } + + LoopStat lstat1(suite_info.num_loop_lengths); + lstat1 = ref_loop_stat; + for (unsigned ilen = 0; ilen < NUM_LENGTHS; ++ilen) { + runReferenceLoop1(lstat1, ilen); + } + + for (unsigned ilen = 0; ilen < NUM_LENGTHS; ++ilen) { + ref_loop_stat.loop_run_time[ilen].push_back( + std::min(lstat0.loop_run_time[ilen][0], + lstat1.loop_run_time[ilen][0]) ); +#if 0 // Just for checking... + std::cout << "\t len : " << ilen << " rloop0 time = " + << lstat0.loop_run_time[ilen][0] << std::endl; + std::cout << "\t len : " << ilen << " rloop1 time = " + << lstat1.loop_run_time[ilen][0] << std::endl; + std::cout << "\t ref len, time = " << ilen << " , " + << ref_loop_stat.loop_run_time[ilen][0] << std::endl; +#endif + } +} + + +// +// Prototypes for file scope reference loop routines +// + +namespace { + +// +// Element-wise vector product +// +void runReferenceLoop0(LoopStat& lstat, unsigned ilen) +{ + LoopData& loop_data = getLoopData(); + + Index_type len = lstat.loop_length[ilen]; + int num_samples = lstat.samples_per_pass[ilen]; + LoopTimer ltimer; + + loopInit(REF_LOOP, lstat); + + Real_ptr a = loop_data.array_1D_Real[0]; + Real_ptr b = loop_data.array_1D_Real[1]; + Real_ptr c = loop_data.array_1D_Real[2]; + + TIMER_START(ltimer); + for (SampIndex_type isamp = 0; isamp < num_samples; ++isamp) { + + for (Index_type i=0 ; i