Index: LICENSE.TXT
===================================================================
--- LICENSE.TXT
+++ LICENSE.TXT
@@ -63,6 +63,7 @@
 -------             ---------
 Autoconf:           llvm-test/autoconf
 Benchmark:          llvm-test/libs/benchmark-1.1.0
+LCALS:              llvm-test/MicroBenchmarks/LCALS
 Burg:               llvm-test/MultiSource/Applications/Burg
 Aha:                llvm-test/MultiSource/Applications/aha
 SGEFA:              llvm-test/MultiSource/Applications/sgefa
Index: MicroBenchmarks/CMakeLists.txt
===================================================================
--- MicroBenchmarks/CMakeLists.txt
+++ MicroBenchmarks/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(libs)
 add_subdirectory(XRay)
+add_subdirectory(LCALS)
Index: MicroBenchmarks/LCALS/CMakeLists.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(SubsetARawLoops)
+add_subdirectory(SubsetALambdaLoops)
+add_subdirectory(SubsetBRawLoops)
+add_subdirectory(SubsetBLambdaLoops)
+add_subdirectory(SubsetCRawLoops)
+add_subdirectory(SubsetCLambdaLoops)
Index: MicroBenchmarks/LCALS/LCALSParams.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSParams.hxx
@@ -0,0 +1,836 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file with macros and constants for data types, execution,
+// timing options, etc. used in LCALS
+//
+
+#ifndef LCALSParams_HXX
+#define LCALSParams_HXX
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// This file contains various parameters that control compilation and some
+// aspects of execution of the loop suite.  The macro constants and typedefs in
+// this file provide the ability to make changes that will propagate 
+// throughout the LCALS code when compiled.  Parameters in this file specify:
+//
+// o Timing and checksum output options
+// o Scalar data types and pointer types (e.g., restrict & alignment properties)
+// o Loop variants that can be built by each compiler 
+// o Loop execution policies for traversal templates (used with loop bodies
+//   represented as lambda expressions or functors)
+//
+//
+// IMPORTANT: MANY OF THE MACROS CONTROLLING THESE OPTIONS
+//            ARE SET IN THE LCALS_rules.mk FILE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(LCALS_VERIFY_CHECKSUM_ABBREVIATED)
+static const int num_checksum_suite_passes = 1;
+static const int num_checksum_samples = 3;
+#endif
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Define/undefine macro constants and other parameters used to control data
+// type definitions.
+// 
+////////////////////////////////////////////////////////////////////////////////
+
+//
+//  Parameterized scalar data types.
+//
+
+typedef int     Index_type;
+
+#if defined(LCALS_USE_DOUBLE)
+///
+typedef double  Real_type;
+
+#elif defined(LCALS_USE_FLOAT)
+///
+typedef float  Real_type;
+
+
+#else
+#error LCALS Real_type is undefined!
+
+#endif
+
+#include<complex>
+typedef std::complex<Real_type> Complex_type;
+
+//
+// Use volatile keyword on loop variable for sampling loops to prevent 
+// compilers from potentially optimizing out loops where result is
+// identical for each sample iteration.
+// 
+typedef volatile int SampIndex_type;
+//
+// Use unsigned long for loop variable used in loops to flush cache to
+// allow for large caches with size possibly bigger than what int can address.
+// 
+typedef unsigned long CacheIndex_type;
+
+//
+//  Floating point array data alignment value.  Typically, same as
+//  SIMD vector width.
+//
+const int LCALS_DATA_ALIGN = 32;
+
+
+//
+//  Compiler-specific definitions for inline directives, data alignment
+//  intrinsics, and SIMD vector pragmas
+//
+//  Variables for compiler intrinsics, directives, typedefs
+//
+//     LCALS_INLINE - macro to enforce method inlining
+//
+//     LCALS_ALIGN_DATA(<variable>) - macro to express alignment of data,
+//                              loop bounds, etc.
+//
+//     LCALS_SIMD - macro to express SIMD vectorization pragma to force
+//                 loop vectorization
+//
+
+#if defined(LCALS_COMPILER_ICC)
+//
+// Configuration options for Intel compilers
+//
+
+#define LCALS_INLINE inline  __attribute__((always_inline))
+
+#if __ICC < 1300  // use alignment intrinsic
+#define LCALS_ALIGN_DATA(d) __assume_aligned(d, LCALS_DATA_ALIGN)
+#else
+#define LCALS_ALIGN_DATA(d)  // TODO: Define this...
+#endif
+
+#define LCALS_SIMD  // TODO: Define this...
+
+
+#elif defined(LCALS_COMPILER_GNU)
+//
+// Configuration options for GNU compilers
+//
+
+#define LCALS_INLINE inline  __attribute__((always_inline))
+
+#define LCALS_ALIGN_DATA(d) __builtin_assume_aligned(d, LCALS_DATA_ALIGN)
+
+#define LCALS_SIMD  // TODO: Define this...
+
+
+#elif defined(LCALS_COMPILER_XLC12)
+//
+// Configuration options for xlc v12 compiler (i.e., bgq/sequoia).
+//
+
+#define LCALS_INLINE inline  __attribute__((always_inline))
+
+#define LCALS_ALIGN_DATA(d) __alignx(LCALS_DATA_ALIGN, d)
+
+//#define LCALS_SIMD  _Pragma("simd_level(10)")
+#define LCALS_SIMD   // TODO: Define this...
+
+
+#elif defined(LCALS_COMPILER_CLANG)
+//
+// Configuration options for clang compilers
+//
+
+#define LCALS_INLINE inline  __attribute__((always_inline))
+
+#define LCALS_ALIGN_DATA(d) // TODO: Define this...
+
+#define LCALS_SIMD  // TODO: Define this...
+
+
+#else
+#error LCALS compiler is undefined!
+
+#endif
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// The following items include some setup items for pointer type definitions 
+// that follow.
+// 
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(LCALS_COMPILER_ICC)
+//
+// alignment attribute supported for versions > 12
+//
+#if __ICC >= 1300
+typedef Real_type* __restrict__ __attribute__((align_value(LCALS_DATA_ALIGN))) TDRAReal_ptr;
+
+typedef const Real_type* __restrict__ __attribute__((align_value(LCALS_DATA_ALIGN))) const_TDRAReal_ptr;
+#endif
+
+
+#elif defined(LCALS_COMPILER_GNU)
+//
+// Nothing here for now because alignment attribute is not working...
+//
+
+
+#elif defined(LCALS_COMPILER_XLC12)
+extern
+#ifdef __cplusplus
+"builtin"
+#endif
+void __alignx(int n, const void* addr);
+
+
+#elif defined(LCALS_COMPILER_CLANG)
+typedef Real_type aligned_real_type __attribute__((aligned (LCALS_DATA_ALIGN)));
+typedef aligned_real_type* __restrict__ TDRAReal_ptr;
+
+typedef const aligned_real_type* __restrict__ const_TDRAReal_ptr;
+
+#else
+#error LCALS compiler is undefined!
+
+#endif
+
+
+#if defined(LCALS_USE_PTR_CLASS)
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a restricted Real_type const pointer.
+ *
+ ******************************************************************************
+ */
+class ConstRestrictRealPtr
+{
+public:
+
+   ///
+   /// Ctors and assignment op.  Default construction yields a null pointer.
+   ///
+
+   ConstRestrictRealPtr() : dptr(0) { ; }
+
+   ConstRestrictRealPtr(const Real_type* d) : dptr(d) { ; }
+
+   ConstRestrictRealPtr& operator=(const Real_type* d) {
+      // Direct store; rebinding a raw pointer member needs no std::swap.
+      dptr = d;
+      return *this;
+   }
+
+   ///
+   /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op.
+   ///
+
+   ///
+   ///  Implicit conversion to bare const pointer; const so const objects convert.
+   ///
+   operator const Real_type*() const { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to bare const pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   const Real_type* get() const { return dptr; }
+
+   ///
+   /// Bracket operator: reads element i through a __restrict__ view of dptr.
+   ///
+   const Real_type& operator [] (Index_type i) const
+   {
+      return( (const Real_type* __restrict__) dptr)[i];
+   }
+
+   ///
+   /// + operator for pointer arithmetic.
+   ///
+   const Real_type* operator+ (Index_type i) const { return dptr+i; }
+
+private:
+   const Real_type* dptr;
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a restricted Real_type (non-const) pointer.
+ *
+ ******************************************************************************
+ */
+class RestrictRealPtr
+{
+public:
+
+   ///
+   /// Ctors and assignment op.
+   ///
+
+   RestrictRealPtr() : dptr(0) { ; }
+
+   RestrictRealPtr(Real_type* d) : dptr(d) { ; }
+
+   RestrictRealPtr& operator=(Real_type* d) { 
+      RestrictRealPtr copy(d);
+      std::swap(dptr, copy.dptr);
+      return *this; 
+   }
+
+   ///
+   /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op.
+   ///
+
+   ///
+   ///  Implicit conversion operator to (non-const) bare pointer.
+   ///
+   operator Real_type*() { return dptr; }
+
+   ///
+   ///  Implicit conversion operator to const bare pointer.
+   ///
+   operator const Real_type*() const { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to (non-const) bare pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   Real_type* get() { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to const bare pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   const Real_type* get() const { return dptr; }
+
+   ///
+   ///  Operator that enables implicit conversion from RestrictRealPtr to
+   ///  ConstRestrictRealPtr.
+   ///
+   operator ConstRestrictRealPtr ()
+      { return ConstRestrictRealPtr(dptr); }
+
+
+   ///
+   /// Bracket operator.
+   ///
+   Real_type& operator [] (Index_type i)
+   {
+      return( (Real_type* __restrict__) dptr)[i];
+   }
+
+   ///
+   /// + operator for (non-const) pointer arithmetic.
+   ///
+   Real_type* operator+ (Index_type i) { return dptr+i; }
+
+   ///
+   /// + operator for const pointer arithmetic.
+   ///
+   const Real_type* operator+ (Index_type i) const { return dptr+i; }
+
+private:
+   Real_type* dptr;
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a restricted aligned Real_type const pointer.
+ *
+ ******************************************************************************
+ */
+class ConstRestrictAlignedRealPtr
+{
+public:
+
+   ///
+   /// Ctors and assignment op.  Default construction yields a null pointer.
+   ///
+
+   ConstRestrictAlignedRealPtr() : dptr(0) { ; }
+
+   ConstRestrictAlignedRealPtr(const Real_type* d) : dptr(d) { ; }
+
+   ConstRestrictAlignedRealPtr& operator=(const Real_type* d) {
+      // Direct store; rebinding a raw pointer member needs no std::swap.
+      dptr = d;
+      return *this;
+   }
+
+   ///
+   /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op.
+   ///
+
+   ///
+   ///  Implicit conversion to bare const pointer; const so const objects convert.
+   ///
+   operator const Real_type*() const { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to bare const pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   const Real_type* get() const { return dptr; }
+
+   ///
+   /// Compiler-specific bracket operators (one per LCALS_COMPILER_* macro).
+   ///
+#if defined(LCALS_COMPILER_ICC)
+   ///
+   const Real_type& operator [] (Index_type i) const
+   {
+#if __ICC < 1300 // use alignment intrinsic
+      LCALS_ALIGN_DATA(dptr);
+      return( (const Real_type* __restrict__) dptr)[i];
+#else // use alignment attribute
+      return( (const_TDRAReal_ptr) dptr)[i];
+#endif
+   }
+
+
+#elif defined(LCALS_COMPILER_GNU)
+   ///
+   const Real_type& operator [] (Index_type i) const
+   {
+#if 1 // NOTE: alignment intrinsic not available for older GNU compilers
+      return( (const Real_type* __restrict__) LCALS_ALIGN_DATA(dptr) )[i];
+#else
+      return( (const Real_type* __restrict__) dptr)[i];
+#endif
+   }
+
+
+#elif defined(LCALS_COMPILER_XLC12)
+   const Real_type& operator [] (Index_type i) const
+   {
+      LCALS_ALIGN_DATA(dptr);
+      return( (const Real_type* __restrict__) dptr)[i];
+   }
+
+
+#elif defined(LCALS_COMPILER_CLANG)
+   const Real_type& operator [] (Index_type i) const
+   {
+      return( (const_TDRAReal_ptr) dptr)[i];
+   }
+
+
+#else
+#error LCALS compiler macro is undefined!
+
+#endif
+
+   ///
+   /// + operator for pointer arithmetic.
+   ///
+   const Real_type* operator+ (Index_type i) const { return dptr+i; }
+
+private:
+   const Real_type* dptr;
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a restricted aligned Real_type (non-const) pointer.
+ *
+ ******************************************************************************
+ */
+class RestrictAlignedRealPtr
+{
+public:
+
+   ///
+   /// Ctors and assignment op.
+   ///
+
+   RestrictAlignedRealPtr() : dptr(0) { ; }
+
+   RestrictAlignedRealPtr(Real_type* d) : dptr(d) { ; }
+
+   RestrictAlignedRealPtr& operator=(Real_type* d) { RestrictAlignedRealPtr copy(d);
+                                                   std::swap(dptr, copy.dptr);
+                                                   return *this; }
+
+   ///
+   /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op.
+   ///
+
+   ///
+   ///  Implicit conversion operator to (non-const) bare pointer.
+   ///
+   operator Real_type*() { return dptr; }
+
+   ///
+   ///  Implicit conversion operator to const bare pointer.
+   ///
+   operator const Real_type*() const { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to (non-const) bare pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   Real_type* get() { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to const bare pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   const Real_type* get() const { return dptr; }
+
+   ///
+   ///  Operator that enables implicit conversion from RestrictAlignedRealPtr to
+   ///  ConstRestrictAlignedRealPtr.
+   ///
+   operator ConstRestrictAlignedRealPtr ()
+      { return ConstRestrictAlignedRealPtr(dptr); }
+
+
+   ///
+   /// Compiler-specific bracket operators.
+   ///
+
+#if defined(LCALS_COMPILER_ICC)
+   ///
+   Real_type& operator [] (Index_type i)
+   {
+#if __ICC < 1300 // use alignment intrinsic
+      LCALS_ALIGN_DATA(dptr);
+      return( (Real_type* __restrict__) dptr)[i];
+#else // use alignment attribute
+      return( (TDRAReal_ptr) dptr)[i];
+#endif
+   }
+
+   ///
+   const Real_type& operator [] (Index_type i) const
+   {
+#if __ICC < 1300 // use alignment intrinsic
+      LCALS_ALIGN_DATA(dptr);
+      return( (Real_type* __restrict__) dptr)[i];
+#else // use alignment attribute
+      return( (TDRAReal_ptr) dptr)[i];
+#endif
+   }
+
+#elif defined(LCALS_COMPILER_GNU)
+   ///
+   Real_type& operator [] (Index_type i)
+   {
+#if 1 // NOTE: alignment intrinsic not available for older GNU compilers
+      return( (Real_type* __restrict__) LCALS_ALIGN_DATA(dptr) )[i];
+#else
+      return( (Real_type* __restrict__) dptr)[i];
+#endif
+   }
+
+   ///
+   const Real_type& operator [] (Index_type i) const
+   {
+#if 1 // NOTE: alignment intrinsic not available for older GNU compilers
+      return( (Real_type* __restrict__) LCALS_ALIGN_DATA(dptr) )[i];
+#else
+      return( (Real_type* __restrict__) dptr)[i];
+#endif
+   }
+
+
+#elif defined(LCALS_COMPILER_XLC12)
+   ///
+   Real_type& operator [] (Index_type i)
+   {
+      LCALS_ALIGN_DATA(dptr);
+      return( (Real_type* __restrict__) dptr)[i];
+   }
+
+   ///
+   const Real_type& operator [] (Index_type i) const
+   {
+      LCALS_ALIGN_DATA(dptr);
+      return( (Real_type* __restrict__) dptr)[i];
+   }
+
+
+#elif defined(LCALS_COMPILER_CLANG)
+   ///
+   Real_type& operator [] (Index_type i)
+   {
+      return( (TDRAReal_ptr) dptr)[i];
+   }
+
+   ///
+   const Real_type& operator [] (Index_type i) const
+   {
+      return( (TDRAReal_ptr) dptr)[i];
+   }
+
+
+#else
+#error LCALS compiler macro is undefined!
+
+#endif
+
+   ///
+   /// + operator for (non-const) pointer arithmetic.
+   ///
+   Real_type* operator+ (Index_type i) { return dptr+i; }
+
+   ///
+   /// + operator for const pointer arithmetic.
+   ///
+   const Real_type* operator+ (Index_type i) const { return dptr+i; }
+
+private:
+   Real_type* dptr;
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a restricted Complex_type const pointer.
+ *
+ ******************************************************************************
+ */
+class ConstRestrictComplexPtr
+{
+public:
+
+   ///
+   /// Ctors and assignment op.
+   ///
+
+   ConstRestrictComplexPtr() : dptr(0) { ; }
+
+   ConstRestrictComplexPtr(const Complex_type* d) : dptr(d) { ; }
+
+   ConstRestrictComplexPtr& operator=(const Complex_type* d) {
+      ConstRestrictComplexPtr copy(d);
+      std::swap(dptr, copy.dptr);
+      return *this;
+   }
+
+   ///
+   /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op.
+   ///
+
+   ///
+   ///  Implicit conversion operator to bare const pointer.
+   ///
+   operator const Complex_type*() const { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to bare const pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   const Complex_type* get() const { return dptr; }
+
+   ///
+   ///  Bracket operator.
+   ///
+   const Complex_type& operator [] (Index_type i) const
+   {
+      return( (const Complex_type* __restrict__) dptr)[i];
+   }
+
+   ///
+   /// + operator for pointer arithmetic.
+   ///
+   const Complex_type* operator+ (Index_type i) const { return dptr+i; }
+
+private:
+   const Complex_type* dptr;
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief Class representing a restricted Complex_type (non-const) pointer.
+ *
+ ******************************************************************************
+ */
+class RestrictComplexPtr
+{
+public:
+
+   ///
+   /// Ctors and assignment op.
+   ///
+
+   RestrictComplexPtr() : dptr(0) { ; }
+
+   RestrictComplexPtr(Complex_type* d) : dptr(d) { ; }
+
+   RestrictComplexPtr& operator=(Complex_type* d) { RestrictComplexPtr copy(d);
+                                                    std::swap(dptr, copy.dptr);
+                                                    return *this; }
+
+   ///
+   /// NOTE: Using compiler-generated copy ctor, dtor, and copy assignment op.
+   ///
+
+   ///
+   ///  Implicit conversion operator to (non-const) bare pointer.
+   ///
+   operator Complex_type*() { return dptr; }
+
+   ///
+   ///  Implicit conversion operator to const bare pointer.
+   ///
+   operator const Complex_type*() const { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to (non-const) bare pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   Complex_type* get() { return dptr; }
+
+   ///
+   ///  "Explicit conversion operator" to const bare pointer,
+   ///  consistent with boost shared ptr.
+   ///
+   const Complex_type* get() const { return dptr; }
+
+   ///
+   ///  Operator that enables implicit conversion from RestrictComplexPtr to
+   ///  ConstRestrictComplexPtr.
+   ///
+   operator ConstRestrictComplexPtr ()
+      { return ConstRestrictComplexPtr(dptr); }
+
+   ///
+   ///  (Non-const) bracket operator.
+   ///
+   Complex_type& operator [] (Index_type i)
+   {
+      return( (Complex_type* __restrict__) dptr)[i];
+   }
+
+   ///
+   ///  Const bracket operator.
+   ///
+   const Complex_type& operator [] (Index_type i) const
+   {
+      return( (Complex_type* __restrict__) dptr)[i];
+   }
+
+   ///
+   /// + operator for (non-const) pointer arithmetic.
+   ///
+   Complex_type* operator+ (Index_type i) { return dptr+i; }
+
+   ///
+   /// + operator for const pointer arithmetic.
+   ///
+   const Complex_type* operator+ (Index_type i) const { return dptr+i; }
+
+private:
+   Complex_type* dptr;
+};
+#endif  // defined(LCALS_USE_PTR_CLASS)
+
+
+/*
+ ******************************************************************************
+ *
+ * Finally, we define data pointer types based on definitions above and
+ * -D value given at compile time.
+ *
+ ******************************************************************************
+ */
+#if defined(LCALS_USE_BARE_PTR)
+typedef Real_type* Real_ptr;
+typedef const Real_type* const_Real_ptr;
+typedef Complex_type* Complex_ptr;
+typedef const Complex_type* const_Complex_ptr;
+
+typedef Real_type* UnalignedReal_ptr;
+typedef const Real_type* const_UnalignedReal_ptr;
+
+
+#elif defined(LCALS_USE_RESTRICT_PTR)
+typedef Real_type* __restrict__ Real_ptr;
+typedef const Real_type* __restrict__ const_Real_ptr;
+typedef Complex_type* __restrict__ Complex_ptr;
+typedef const Complex_type* __restrict__ const_Complex_ptr;
+
+typedef Real_type* __restrict__ UnalignedReal_ptr;
+typedef const Real_type* __restrict__ const_UnalignedReal_ptr;
+
+
+#elif defined(LCALS_USE_RESTRICT_ALIGNED_PTR)
+typedef TDRAReal_ptr Real_ptr;
+typedef const_TDRAReal_ptr const_Real_ptr;
+typedef Complex_type* __restrict__ Complex_ptr;
+typedef const Complex_type* __restrict__ const_Complex_ptr;
+
+typedef Real_type* __restrict__ UnalignedReal_ptr;
+typedef const Real_type* __restrict__ const_UnalignedReal_ptr;
+
+
+#elif defined(LCALS_USE_PTR_CLASS)
+typedef RestrictAlignedRealPtr Real_ptr;
+typedef ConstRestrictAlignedRealPtr const_Real_ptr;
+typedef RestrictComplexPtr Complex_ptr;
+typedef ConstRestrictComplexPtr const_Complex_ptr;
+
+typedef RestrictRealPtr UnalignedReal_ptr;
+typedef ConstRestrictRealPtr const_UnalignedReal_ptr;
+
+
+#else
+#error LCALS pointer type is undefined!
+
+#endif
+
+
+//
+// By default, all loop variants defined as supported in the
+// compiler-specific sections above are turned on here (for
+// both compilation and execution).
+//
+// Loop variants can be turned off via the #define/#undef macros below.
+// It's a bit cheesy, but it's simple and it works!
+//
+// Execution of individual loop variants can also be controlled
+// by modifying the strings that get added to the run_loop_variants
+// vector in main.cxx
+//
+
+
+//
+//  Execution policies applicable to "forall" loop variants.  
+//  Traversal method templates are defined in LCALSTraversalMethods.hxx 
+//  header files.
+//
+
+// Tag struct types define available forall method execution policies
+struct seq_exec {};
+struct simd_exec {};
+struct omp_parallel_for_exec {};
+struct omp_for_nowait_exec {};
+
+// 
+// Execution policy used in (non-OpenMP) "forall" loop variants.  
+// To use another policy in all such loops, change this typedef.
+//
+typedef simd_exec exec_policy;
+
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/LCALSStats.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSStats.hxx
@@ -0,0 +1,306 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file defining routines and structures to gather and report
+// LCALS loop suite execution information.
+//
+
+#ifndef LCALSStats_HXX
+#define LCALSStats_HXX
+
+#include "LCALSParams.hxx"
+
+#include <vector>
+#include <map>
+#include <string>
+#include <iostream>
+
+#if defined(LCALS_USE_CYCLE)
+#include "cycle.h"
+typedef ticks LoopTime;
+
+#elif defined(LCALS_USE_CLOCK)
+#include <time.h>
+typedef clock_t LoopTime;
+
+#else
+#error LCALS_TIMER_TYPE is undefined!
+
+#endif
+
+
+class LoopStat;
+
+//
+// Loop timing should follow the implementation described here.
+// (See files containing loop implementations for details.)
+//
+// 1) Execute loop (identified by integer variable "iloop"):
+//
+//    LoopTimer ltimer;
+//
+//    flushCache(); 
+//
+//    TIMER_START(ltimer);
+//
+//        ...CODE FOR LOOP "iloop" GOES HERE...
+//
+//    TIMER_STOP(ltimer);
+//
+// 2) At some point after loop is run, but before next loop in file is run, 
+//    copy timing information to appropriate loop stat object:
+//
+//    copyTimer(loop_stat, ilength, ltimer);
+//
+
+struct LoopTimer
+{
+   LoopTime start;   // timer reading assigned by TIMER_START
+   LoopTime stop;    // timer reading assigned by TIMER_STOP
+   bool was_run;     // initially false; the clock() flavor of TIMER_STOP sets it true
+
+   LoopTimer() : start(0), stop(0), was_run(false) { ; }  // zero readings, not yet run
+};
+
+void flushCache();
+void copyTimer(LoopStat& loop_stat, int ilength, 
+               const LoopTimer& loop_timer);
+
+#if defined(LCALS_USE_CYCLE)
+// Cycle-counter timing; TIMER_STOP records the reading and marks the timer as run.
+#define TIMER_START(lt)  lt.start = getticks();
+#define TIMER_STOP(lt)  lt.stop = getticks(); \
+                        lt.was_run = true;
+
+#elif defined(LCALS_USE_CLOCK)
+// clock()-based timing; TIMER_STOP records the reading and marks the timer as run.
+#define TIMER_START(lt) lt.start = clock();
+#define TIMER_STOP(lt)  lt.stop = clock(); \
+                        lt.was_run = true;
+
+#else
+#error LCALS_TIMER_TYPE is undefined!
+#endif
+
+
+
+//////////////////////////////////////////////////////////////////
+//
+//  Routines to set up loop data, run loops, generate output, etc.
+//
+//////////////////////////////////////////////////////////////////
+
+//
+// Forward declarations; the class-key matches the class definitions below.
+//
+class LoopStat;
+class LoopSuiteRunInfo;
+
+//
+// Routines for accessing loop suite run info.
+//
+LoopSuiteRunInfo& getLoopSuiteRunInfo();
+
+
+//
+// Routine to allocate and setup basic structures used to run loop suite
+// and free them when done.
+//
+void allocateLoopSuiteRunInfo(const std::string& host_name,
+                              unsigned num_loops,
+                              unsigned num_loop_lengths,
+                              unsigned num_suite_passes,
+                              bool run_loop_length[],
+                              CacheIndex_type cache_size);
+void freeLoopSuiteRunInfo();
+
+
+//
+// Routine to generate loop execution timing report.
+// 
+// Also write output files if non-empty directory name is given.
+//
+void generateTimingReport(const std::vector< std::string >& run_loop_variants,
+                          const std::string& output_dirname);
+
+//
+// Routine to generate report about loop checksums.
+// 
+// Also write output files if non-empty directory name is given.
+//
+void generateChecksumReport(const std::vector< std::string >& run_loop_variants,
+                            const std::string& output_dirname);
+
+//
+// Routine to generate FOM report.
+// 
+// Also write output files if non-empty directory name is given.
+//
+void generateFOMReport(const std::vector< std::string >& run_loop_variants,
+                       const std::string& output_dirname);
+
+
+
+//////////////////////////////////////////////////////////////////
+//
+//  Structures holding parameters defining execution of loop suite 
+//  and loop timing statistic information.
+//
+//////////////////////////////////////////////////////////////////
+
+class LoopStat
+{
+public:
+
+   bool loop_is_run; 
+
+   double loop_weight;
+
+   //
+   // The following vectors are indexed by loop length ID.
+   //
+   // The second vector index for loop_run_time
+   // is number of suite pass.
+   //
+
+   std::vector< std::vector<long double> > loop_run_time;
+   std::vector< unsigned long > loop_run_count;
+
+   std::vector< long double > mean;
+   std::vector< long double > std_dev;
+   std::vector< long double > min;
+   std::vector< long double > max;
+   std::vector< long double > harm_mean;
+   std::vector< long double > meanrel2ref;
+
+   std::vector< int > loop_length;
+   std::vector< int > samples_per_pass;
+
+   std::vector< long double > loop_chksum;
+
+   explicit LoopStat(unsigned num_loop_lengths)
+   : loop_is_run(false),
+     loop_weight(0.0),
+     loop_run_time(num_loop_lengths),
+     loop_run_count(num_loop_lengths, 0),
+     mean(num_loop_lengths, 0.0),
+     std_dev(num_loop_lengths, 0.0),
+     min(num_loop_lengths, 0.0),
+     max(num_loop_lengths, 0.0),
+     harm_mean(num_loop_lengths, 0.0),
+     meanrel2ref(num_loop_lengths, 0.0),
+     loop_length(num_loop_lengths, 0),
+     samples_per_pass(num_loop_lengths, 0),
+     loop_chksum(num_loop_lengths, 0.0)
+   { ; } 
+
+   //
+   // Print routine for debugging.
+   //
+   void print(std::ostream& os) const;
+
+private:
+   //
+   // The following methods are not implemented.
+   //
+   LoopStat();
+};
+
+class LoopSuiteRunInfo
+{
+public:
+
+   std::string host_name;
+
+   //
+   // The following vectors are indexed by loop ID.
+   //
+   unsigned num_loops;
+   std::vector<std::string> loop_names;
+
+   //
+   // The following vectors are indexed by loop length ID.
+   //
+   unsigned num_loop_lengths;
+   std::vector<bool> run_loop_length;
+   std::vector<std::string> loop_length_names;
+
+   unsigned num_suite_passes;
+   double loop_samp_frac;
+
+   LoopStat ref_loop_stat;
+   //
+   // The following vectors are indexed by loop WeightGroup
+   //
+   std::vector<double> loop_weights;
+
+   //
+   // The following vectors are indexed first by loop variant 
+   // (according to order in LoopStatMap, which is the same as 
+   // run_loop_variants vector in main.cxx) and then by loop length.  
+   // So we have NUM_LENGTHS values for each variant.
+   //
+   std::vector< std::vector< int > > num_loops_run;
+   std::vector< std::vector< long double > > tot_time;
+   std::vector< std::vector< long double > > fom_rel;
+   std::vector< std::vector< long double > > fom_rate;
+
+   // Cache-flush buffer state; this class's destructor does not release the buffer.
+   CacheIndex_type cache_flush_data_len;
+   double* cache_flush_data;
+   long double cache_flush_data_sum;
+
+   LoopSuiteRunInfo()   // initializers listed in member declaration order
+   : num_loops(0),
+     num_loop_lengths(0),
+     num_suite_passes(0),
+     loop_samp_frac(0.0),
+     ref_loop_stat(static_cast<unsigned>(0)),
+     cache_flush_data_len(0),
+     cache_flush_data(0),
+     cache_flush_data_sum(0.0)
+   { ; }
+
+   // Maps a loop test name to its heap-allocated vector of loop stats.
+   typedef std::map< std::string, std::vector<LoopStat>* > LoopStatMap;
+
+   ~LoopSuiteRunInfo() 
+   {
+      LoopStatMap::iterator lsi = loop_test_stats.begin();
+      for (  ; lsi != loop_test_stats.end(); ++ lsi ) {
+         delete (*lsi).second; 
+      }
+   }
+
+   //
+   // Add an empty loop stats vector for the named test; no-op if already present.
+   //
+   void addLoopStats(const std::string& name)
+   {
+      if ( loop_test_stats.find(name) != loop_test_stats.end() ) { return; }
+      loop_test_stats.insert( LoopStatMap::value_type( name, new std::vector<LoopStat>() ) );
+   }
+
+   //
+   // Return the stats vector for a name already added; unknown names are not checked.
+   //
+   std::vector<LoopStat>& getLoopStats( const std::string& name )
+   {
+      LoopStatMap::iterator lsi = loop_test_stats.find(name);
+      return( *( (*lsi).second ) );
+   }
+
+private:
+   //
+   // The following methods are not implemented (copying is disabled).
+   //
+   LoopSuiteRunInfo(const LoopSuiteRunInfo&);
+   LoopSuiteRunInfo& operator=(const LoopSuiteRunInfo&); 
+
+   LoopStatMap loop_test_stats;
+};
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/LCALSStats.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSStats.cxx
@@ -0,0 +1,1024 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing routines used to gather and report
+// performance data for LCALS suite
+//
+
+#include "LCALSStats.hxx"
+
+#include<string>
+#include<iostream>
+#include<iomanip>
+#include<sstream>
+#include<fstream>
+#include<limits>
+#include<cstdlib>
+#include<cmath>
+
+using namespace std;
+
+//
+// LoopStat print routine for debugging.
+//
+void LoopStat::print(ostream& os) const
+{
+   os << "\nLoopStat::print..." << endl;
+   os << "\tloop_is_run = " << loop_is_run << endl;
+   os << "\tnum loop lengths = " << loop_length.size() << endl;
+   for (unsigned ilen = 0; ilen < loop_length.size(); ++ilen) {
+      os << "\t\t ilength = " << ilen << " --> " << endl;
+      os << "\t\t\t loop_length = " << loop_length[ilen] << endl;
+      os << "\t\t\t samples_per_pass = " << samples_per_pass[ilen] << endl;
+      os << "\t\t\t loop_run_count = " << loop_run_count[ilen] << endl;
+      if ( !(loop_run_count[ilen] > 0) ) { continue; }
+      // Raw samples are listed twice: before and after the summary stats.
+      for (int pass = 0; pass < 2; ++pass) {
+         for (unsigned is = 0; is < loop_run_time[ilen].size(); ++is) {
+            os << "\t\t\t\t sample time = " << loop_run_time[ilen][is] << endl;
+         }
+         if ( pass == 1 ) { break; }
+         os << "\t\t\t\t mean = " << mean[ilen] << endl;
+         os << "\t\t\t\t std_dev = " << std_dev[ilen] << endl;
+         os << "\t\t\t\t min = " << min[ilen] << endl;
+         os << "\t\t\t\t max = " << max[ilen] << endl;
+         os << "\t\t\t\t harm_mean = " << harm_mean[ilen] << endl;
+         os << "\t\t\t\t meanrel2ref = " << meanrel2ref[ilen] << endl;
+         os << endl;
+      }
+   }
+   os << endl;
+}
+
+
+// File scope data holding structures needed to execute and time loops.
+// Stays null until allocateLoopSuiteRunInfo() creates the object.
+static LoopSuiteRunInfo* s_loop_suite_run_info = 0;
+
+// Accessor routine for suite run info; the pointer is dereferenced unchecked.
+LoopSuiteRunInfo& getLoopSuiteRunInfo()
+{
+   return *s_loop_suite_run_info;
+}
+
+
+//
+// Define how suite will run and initialize timing structures for loops.
+// May be called more than once: state from an earlier call is replaced.
+void allocateLoopSuiteRunInfo(const string& host_name,
+                              unsigned num_loops,
+                              unsigned num_loop_lengths,
+                              unsigned num_suite_passes,
+                              bool run_loop_length[],
+                              CacheIndex_type cache_size)
+{
+#ifdef TESTSUITE
+   cout << "\n allocateLoopSuiteRunInfo..." << endl;
+#endif
+   if ( s_loop_suite_run_info == 0 ) {
+      s_loop_suite_run_info = new LoopSuiteRunInfo();
+   }
+   LoopSuiteRunInfo& info = *s_loop_suite_run_info;
+
+   info.host_name = host_name;
+   info.num_loops = num_loops;
+   info.num_loop_lengths = num_loop_lengths;
+   info.num_suite_passes = num_suite_passes;
+
+   info.run_loop_length.clear();  // do not append to flags of an earlier call
+   for (unsigned ilen = 0; ilen < num_loop_lengths; ++ilen) {
+      info.run_loop_length.push_back(run_loop_length[ilen]);
+   }
+
+   //
+   // To make sure all data cache levels are flushed completely, we
+   // define a data buffer with length equal to twice given cache size.
+   // A buffer left from an earlier call is released first so it cannot leak.
+   delete [] info.cache_flush_data;
+   info.cache_flush_data = 0;
+   info.cache_flush_data_len = (cache_size*2)/sizeof(Real_type);
+   info.cache_flush_data = new double[info.cache_flush_data_len];
+   for (CacheIndex_type i = 0; i < info.cache_flush_data_len; ++i) {
+      info.cache_flush_data[i] = drand48() + 0.1;
+   }
+
+}
+
+//
+// Free data structures defining loop suite execution.
+//
+void freeLoopSuiteRunInfo()
+{
+   if ( s_loop_suite_run_info == 0 ) { return; }
+
+   // delete [] on a null pointer is a no-op, so no separate check is needed.
+   delete [] s_loop_suite_run_info->cache_flush_data;
+
+   delete s_loop_suite_run_info;
+   s_loop_suite_run_info = 0;
+}
+
+//////////////////////////////////////////////////////////////////
+//
+//  Routines used for loop timing...
+//
+//////////////////////////////////////////////////////////////////
+
+//
+// Flush cache before each loop is run to minimize impact of one
+// loop on another's execution.
+//
+void flushCache()
+{
+   LoopSuiteRunInfo& info = *s_loop_suite_run_info;
+   const CacheIndex_type len = info.cache_flush_data_len;
+
+   for (CacheIndex_type idx = 0; idx < len; ++idx) {
+      info.cache_flush_data_sum += info.cache_flush_data[idx];
+   }
+   info.cache_flush_data_sum /= len;
+}
+
+
+//
+// Copy loop run time to LoopStat.
+//
+void copyTimer(LoopStat& loop_stat, int ilength,
+               const LoopTimer& loop_timer)
+{
+   // A timer that never ran contributes no sample.
+   if ( !loop_timer.was_run ) { return; }
+
+#if defined(LCALS_USE_CYCLE)
+   const long double run_time = elapsed(loop_timer.stop, loop_timer.start);
+#elif defined(LCALS_USE_CLOCK)
+   const long double ticks =
+      static_cast<long double>(loop_timer.stop - loop_timer.start);
+   const long double run_time = ticks / CLOCKS_PER_SEC;
+#else
+#error LCALS_TIMER_TYPE is undefined! 
+#endif
+
+   loop_stat.loop_run_time[ilength].push_back( run_time );
+}
+
+
+
+// Compute per-loop timing statistics for the variant with index ilv and,
+// when do_fom is set, its figure-of-merit totals.  std_dev holds the
+// population standard deviation (square root of the mean squared deviation).
+void computeStats( unsigned ilv, vector<LoopStat>& loop_stats,
+                   bool do_fom )
+{
+
+   // compute stats for each loop...
+   for ( unsigned iloop = 0; iloop < loop_stats.size(); ++iloop ) {
+
+      LoopStat& stat = loop_stats[iloop];
+
+      if ( stat.loop_is_run ) {
+
+         // compute stats for each length loop is run...
+         for ( unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen )  {
+
+            if ( stat.loop_run_count[ilen] > 0 ) {
+
+               vector<long double>& time_sample =
+                                         stat.loop_run_time[ilen];
+               unsigned sample_size = time_sample.size();
+
+               long double mean = 0.0;
+               long double sdev = 0.0;
+               long double max = -std::numeric_limits<long double>::max();
+               long double min = std::numeric_limits<long double>::max();
+               long double harm = 0.0;
+
+               for (unsigned is = 0; is < sample_size; ++is) {
+                  mean += time_sample[is];
+                  max = std::max(max, time_sample[is]);
+                  min = std::min(min, time_sample[is]);
+                  if ( time_sample[is] > 0.0 ) {
+                     harm += 1.0/time_sample[is];
+                  }
+               }
+
+               mean /= sample_size;
+
+               if ( harm > 0.0 ) { harm = sample_size/harm; }
+
+               for (unsigned is = 0; is < sample_size; ++is) {
+                  sdev += (time_sample[is] - mean)*(time_sample[is] - mean);
+               }
+               // Mean squared deviation -> standard deviation (not variance).
+               sdev = std::sqrt(sdev / sample_size);
+
+               stat.mean[ilen]      = mean;
+               stat.std_dev[ilen]   = sdev;
+               stat.min[ilen]       = min;
+               stat.max[ilen]       = max;
+               stat.harm_mean[ilen] = harm;
+
+            }  // if loop length was run  
+
+         }  // iterate over loop lengths
+
+      }  // if loop is run
+
+   }  // iterate over loops
+
+   //
+   // FOM calculations (done separately for simplicity)
+   //
+   if ( do_fom ) {
+
+      LoopSuiteRunInfo& suite_info = getLoopSuiteRunInfo();
+      LoopStat& ref_loop_stat = suite_info.ref_loop_stat;
+
+      std::vector<int> num_loops_run(suite_info.num_loop_lengths, 0);
+      std::vector< long double > tot_weight(suite_info.num_loop_lengths, 0.0);
+      std::vector< long double > tot_time(suite_info.num_loop_lengths, 0.0);
+      std::vector< long double > fom_rel(suite_info.num_loop_lengths, 0.0);
+      std::vector< long double > fom_rate(suite_info.num_loop_lengths, 0.0);
+
+      for ( unsigned iloop = 0; iloop < loop_stats.size(); ++iloop ) {
+
+         LoopStat& stat = loop_stats[iloop];
+
+         if ( stat.loop_is_run ) {
+
+            for ( unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen )  {
+
+               if ( stat.loop_run_count[ilen] > 0 ) {
+
+                  num_loops_run[ilen]++;
+                  tot_weight[ilen]   += stat.loop_weight; 
+                  tot_time[ilen]     += stat.mean[ilen]; 
+
+                  //
+                  // sum weighted loop time 
+                  //
+                  fom_rel[ilen] += stat.loop_weight * stat.mean[ilen];
+
+                  //
+                  // sum weighted loop iteration rate
+                  //
+                  fom_rate[ilen] += (stat.loop_weight * stat.mean[ilen]) /
+                      (stat.loop_length[ilen] * stat.samples_per_pass[ilen]);
+
+               }  // if loop length was run
+
+            }  // iterate over loop lengths
+
+         }  // if loop is run
+
+      }  // iterate over loops
+
+      for (unsigned ilen = 0; ilen < suite_info.num_loop_lengths; ++ilen) {
+         suite_info.num_loops_run[ilv][ilen] = num_loops_run[ilen];
+         suite_info.tot_time[ilv][ilen] = tot_time[ilen];
+         // Guard the reference sample: operator[] on an empty vector is UB.
+         const vector<long double>& ref_samp = ref_loop_stat.loop_run_time[ilen];
+         long double ref_time = ref_samp.empty() ? 0.0L : ref_samp[0];
+         if ( num_loops_run[ilen] > 0 ) {
+#if 0  // this makes 0 <= fom_rel <= 1/tot_time
+            suite_info.fom_rel[ilv][ilen] = 
+               ref_time * tot_weight[ilen] / ( tot_time[ilen] * fom_rel[ilen] );
+#else  // this makes 0 <= fom_rel <= 1
+            suite_info.fom_rel[ilv][ilen] = 
+               ref_time * tot_weight[ilen] / fom_rel[ilen] ;
+#endif
+            suite_info.fom_rate[ilv][ilen] = 1.0 / fom_rate[ilen];
+         }
+      }
+
+   }
+
+}
+
+
+//
+// Forward declarations for routines that write loop reports.
+// Definitions are in the unnamed namespace later in this file.
+namespace {
+
+   void writeTimingSummaryReport(
+        const vector< string >& run_loop_variants,
+        ostream& os);
+
+   void writeChecksumReport(
+        const vector< string >& run_loop_variants,
+        ostream& os);
+
+   void writeFOMReport(
+        const vector< string >& run_loop_variants,
+        ostream& os);
+
+   void writeMeanTimeReport(const string& variant_name, 
+                            const string& output_dirname);
+
+   void writeRelativeTimeReport(const string& variant_name, 
+                                const string& output_dirname);
+
+   std::string buildVersionInfo();
+
+};  // unnamed namespace
+
+
+//
+// Routine called from main() to generate timing report(s).  Statistics
+// for every variant are computed before any report is written.
+//
+void generateTimingReport(const vector< string >& run_loop_variants,
+                          const string& output_dirname)
+{
+   if ( run_loop_variants.size() == 0 ) return;
+
+   // Figure-of-merit data is always computed along with the statistics.
+   bool do_fom = true;
+
+   //
+   // Compute statistics for all loops.
+   //
+   LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo();
+   const unsigned nvariants = run_loop_variants.size();
+   for (unsigned ilv = 0; ilv < nvariants; ++ilv) {
+      computeStats( ilv, suite_run_info.getLoopStats(run_loop_variants[ilv]),
+                    do_fom );
+   }
+
+   //
+   // If output directory name is given, write files in that directory.
+   // Else, write only summary to standard output.  
+   //
+   if (!output_dirname.empty()) {
+
+      string timing_fname(output_dirname + "/" + "timing.txt");
+      ofstream file(timing_fname.c_str(), ios::out | ios::trunc);
+      if ( !file ) {
+         cout << " ERROR: Can't open output file " 
+                   << timing_fname << endl;
+      }
+      cout << "\n writeTimingSummaryReport...   " << timing_fname << endl;
+      writeTimingSummaryReport(run_loop_variants, file);
+
+      //
+      // Write mean run time file for each loop variant.
+      //
+      for (unsigned ilv = 0; ilv < nvariants; ++ilv) {
+         writeMeanTimeReport( run_loop_variants[ilv], output_dirname );
+      }
+
+      //
+      // Write relative run time file for each loop variant.
+      //
+      // NOTE: We assume variant "zero" is reference.
+      //
+      for (unsigned ilv = 1; ilv < nvariants; ++ilv) {
+         writeRelativeTimeReport( run_loop_variants[ilv], output_dirname );
+      }
+
+   } else {
+
+      writeTimingSummaryReport(run_loop_variants, cout);
+
+   }
+}
+
+//
+// Routine called from main() to generate checksum report.
+//
+void generateChecksumReport(
+   const vector< string >& run_loop_variants,
+   const string& output_dirname)
+{
+#if defined(LCALS_VERIFY_CHECKSUM)
+   if ( run_loop_variants.empty() ) { return; }
+
+   // Without an output directory, write the summary to standard output.
+   if ( output_dirname.empty() ) {
+      writeChecksumReport(run_loop_variants, cout);
+      return;
+   }
+
+   // Otherwise write the report to the file checksum.txt in that directory.
+   const string checksum_fname = output_dirname + "/" + "checksum.txt";
+   ofstream file(checksum_fname.c_str(), ios::out | ios::trunc);
+   if ( !file ) {
+      cout << " ERROR: Can't open output file "                   
+                << checksum_fname << endl;
+   }
+
+   cout << "\n writeChecksumReport...    " << checksum_fname << endl;
+   writeChecksumReport(run_loop_variants, file);
+#endif
+}
+
+//
+// Routine called from main() to generate FOM report.
+//
+void generateFOMReport(
+   const vector< string >& run_loop_variants,
+   const string& output_dirname)
+{
+   if ( run_loop_variants.empty() ) { return; }
+
+   // Without an output directory, only write the summary to standard output.
+   if ( output_dirname.empty() ) {
+      writeFOMReport(run_loop_variants, cout);
+      return;
+   }
+
+   // Otherwise write the report to the file fom.txt in that directory.
+   const string fom_fname = output_dirname + "/" + "fom.txt";
+   ofstream file(fom_fname.c_str(), ios::out | ios::trunc);
+   if ( !file ) {
+      cout << " ERROR: Can't open output file "
+                << fom_fname << endl;
+   }
+
+   cout << "\n writeFOMReport... " << fom_fname << endl;
+   writeFOMReport(run_loop_variants, file);
+}
+ 
+
+//
+// Implementation of file-scope routines that write loop reports.
+//
+namespace {
+
+// Write report about loop execution timings to given output stream.
+// As a side effect, stores each non-reference variant's relative mean
+// time in stat.meanrel2ref.
+void writeTimingSummaryReport(const vector< string >& run_loop_variants,
+                              ostream& os)
+{
+   LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo();
+   const unsigned nvariants = run_loop_variants.size();
+   // Variant zero serves as the reference for relative timings.
+   const string& ref_variant = run_loop_variants[0];
+   vector<string>& loop_names = suite_run_info.loop_names;
+
+   //
+   //  Define some strings used to print summary table.
+   //
+   string equal_line("===========================================================================================================\n");
+   string dash_line("------------------------------------------------------------------------------------------------------------\n");
+   string dash_line_part("-------------------------------------------------------\n");
+   string dot_line_part("............................................\n");
+   vector<string> len_id(suite_run_info.loop_length_names.size());
+   for (unsigned ilen = 0; ilen < len_id.size(); ++ilen) {
+      len_id[ilen] = suite_run_info.loop_length_names[ilen][0];
+   }
+
+   std::string ver_info = buildVersionInfo();
+
+   //
+   //  Print compilation summary information.
+   //
+   os << "\n\n\n";
+   os << equal_line;
+   os << equal_line;
+
+   os << "LCALS compilation summary: " << endl;
+   os << ver_info << endl;
+
+   //
+   //  Print basic run summary information.
+   //
+   os << "\n\n";
+   os << equal_line;
+   os << equal_line;
+
+   os << "LCALS run summary: " << endl;
+   os << "sizeof(Real_type) = " << sizeof(Real_type) << endl;
+   os << "     num suite passes = " << suite_run_info.num_suite_passes << endl;
+   os << "     loop sample fraction = " << suite_run_info.loop_samp_frac << endl;
+   os << "     loop variants run : ";
+   for (unsigned ilv = 0; ilv < nvariants; ++ilv) {
+      string last_char;
+      if ( ilv+1 < run_loop_variants.size() ) last_char = string(" , ");
+      os << run_loop_variants[ilv] << last_char;
+   }
+   os << "\n     reference variant : " << ref_variant << endl;
+   os << equal_line;
+   os << equal_line;
+
+   //
+   // Set basic table formatting.
+   // NOTE(review): max_name_len and max_var_name_len are computed but unused.
+   size_t max_name_len = 0;
+   for (size_t iloop = 0; iloop < loop_names.size(); ++iloop) {
+      max_name_len = max(max_name_len, loop_names[iloop].size()); 
+   }
+
+   size_t max_var_name_len = 0;
+   for (size_t ilv = 0; ilv < nvariants; ++ilv) {
+      max_var_name_len = 
+         max(max_var_name_len, run_loop_variants[ilv].size()); 
+   }
+
+   string var_field("Variant(length id)");
+   size_t var_field_len = var_field.size(); 
+   unsigned prec = 10;
+   unsigned prec_buf = prec + 8;
+   unsigned reldiff_prec = 6;
+
+   //
+   // Print table column headers.
+   //
+   os << "Loop name(Loop ID) -->   <length id>:(length, samples/pass), etc." 
+      << endl;
+   os <<left<< setw(var_field_len+1) << var_field;
+   os <<right<< setw(prec_buf) << "   Mean Time ";
+   os <<left<< setw(prec_buf) << "        Min Time";
+   os <<left<< setw(prec_buf) << "      Max Time";
+   os <<left<< setw(prec_buf) << "    Std. Dev.";
+   os <<left<< setw(prec_buf) << "Mean time rel to ref variant" << endl;
+   os << dash_line;
+
+
+   //
+   // Print timing results for all loops in a table.
+   //
+   for (unsigned iloop = 0; iloop < loop_names.size(); ++iloop) {
+
+      LoopStat& ref_variant_stat = suite_run_info.
+                                   getLoopStats(run_loop_variants[0])[iloop]; 
+      vector<long double> ref_mean(ref_variant_stat.mean); 
+
+      if ( !loop_names[iloop].empty() && ref_variant_stat.loop_is_run ) {
+
+         if ( iloop > 1 ) {  // magic numbers are bad!!
+            os << endl << dash_line_part;
+         }
+         os <<left << loop_names[iloop] << " (" << iloop << ") --> ";
+
+         for (unsigned ilv = 0; ilv < nvariants; ++ilv) {
+
+            LoopStat& stat = suite_run_info.
+                             getLoopStats(run_loop_variants[ilv])[iloop];
+
+            if ( stat.loop_is_run ) {
+
+               //
+               // Print separator line for new loop or new variant.
+               //
+               if ( ilv == 0 ) {
+
+                  for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) {
+                     os << "   " << len_id[ilen] << ":(" 
+                        << stat.loop_length[ilen] << ", "
+                        << stat.samples_per_pass[ilen] << ")";
+                  }
+                  os << endl; 
+
+               } else {
+
+                  os << dot_line_part;
+
+               }
+
+               //
+               // Print statistics for each length of loop run.
+               //
+               for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) {
+
+                  if ( stat.loop_run_count[ilen] > 0 ) {
+
+                     string var_string(run_loop_variants[ilv] + 
+                                         "(" + len_id[ilen] + ")");
+                  
+                     os << showpoint << setprecision(prec)
+                          <<left<< setw(var_field_len+1) << var_string;
+
+                     os <<right<< setw(prec_buf) << stat.mean[ilen];
+                     os <<right<< setw(prec_buf) << stat.min[ilen];
+                     os <<right<< setw(prec_buf) << stat.max[ilen];
+                     os <<right<< setw(prec_buf) << stat.std_dev[ilen];
+
+                     if ( ilv > 0 ) {
+                        // compare mean run time to reference variant 
+                        // (printed value is this mean divided by the ref mean)
+                        long double rel_mean_diff = 0;
+                        if ( ref_mean[ilen] != 0.0 ) {
+                           rel_mean_diff = 1.0 + 
+                              (stat.mean[ilen]-ref_mean[ilen])/ref_mean[ilen];
+                        }
+                        os <<right<< setprecision(reldiff_prec) << setw(prec_buf) 
+                             << rel_mean_diff << endl;
+                        stat.meanrel2ref[ilen] = rel_mean_diff;
+                     } else {
+                        os << endl;
+                     }
+
+                  }  // if loop length was run
+
+               } // iterate over loop lengths
+
+            } // if loop is run
+
+         }  // iterate over variants of loop run
+
+      }  // if loop name is not empty
+      
+   }  // iterate over loops
+
+
+   os << dash_line;
+   os << "\n\n\n"; 
+
+   os.flush();
+}
+
+
+//
+// Write report about loop chksums to given output stream.  For variants
+// other than variant zero, the absolute difference from it is also printed.
+void writeChecksumReport(const vector< string >& run_loop_variants,
+                         ostream& os)
+{
+   LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo();
+   const unsigned nvariants = run_loop_variants.size();
+   const string& ref_variant = run_loop_variants[0];
+   vector<string>& loop_names = suite_run_info.loop_names;
+   // NOTE(review): ref_variant is not referenced again in this routine.
+   //
+   //  Define some strings used to print summary table.
+   //
+   string equal_line("===========================================================================================================\n");
+   string dash_line("------------------------------------------------------------------------------------------------------------\n");
+   string dash_line_part("-------------------------------------------------------\n");
+   string dot_line_part("............................................\n");
+   vector<string> len_id(suite_run_info.loop_length_names.size());
+   for (unsigned ilen = 0; ilen < len_id.size(); ++ilen) {
+      len_id[ilen] = suite_run_info.loop_length_names[ilen][0];
+   }
+
+   std::string ver_info = buildVersionInfo();
+
+   //
+   //  Print compilation summary information.
+   //
+   os << "\n\n\n";
+   os << equal_line;
+   os << equal_line;
+
+   os << "LCALS compilation summary: " << endl;
+   os << ver_info << endl;
+
+   //
+   //  Print checksum information.
+   //
+   os << "\n\n";
+   os << equal_line;
+   os << equal_line;
+
+   //
+   // Set basic table formatting.
+   // NOTE(review): max_name_len and max_var_name_len are computed but unused.
+   size_t max_name_len = 0;
+   for (size_t iloop = 0; iloop < loop_names.size(); ++iloop) {
+      max_name_len = max(max_name_len, loop_names[iloop].size());
+   }
+
+   size_t max_var_name_len = 0;
+   for (size_t ilv = 0; ilv < nvariants; ++ilv) {
+      max_var_name_len =
+         max(max_var_name_len, run_loop_variants[ilv].size());
+   }
+
+   string var_field("Variant(length #)");
+   size_t var_field_len = var_field.size();
+   unsigned prec = 32;
+   unsigned prec_buf = prec + 8;
+
+   //
+   // Print table column headers.
+   //
+   os << "Loop name -->" << endl;
+   os <<left<< setw(var_field_len+1) << var_field;
+   os <<right<< setw(prec_buf) << "Check Sum    ";
+   os <<left<< setw(prec_buf) << "        Delta from reference" << endl;
+   os << dash_line;
+
+   //
+   // Print check sums for all loops in a table.
+   //
+   for (unsigned iloop = 0; iloop < loop_names.size(); ++iloop) {
+
+      LoopStat& ref_variant_stat = suite_run_info.
+                                   getLoopStats(run_loop_variants[0])[iloop];
+      vector<long double> ref_chksum(ref_variant_stat.loop_chksum);
+
+      if ( !loop_names[iloop].empty() && ref_variant_stat.loop_is_run ) {
+
+         if ( iloop > 1 ) {  // magic numbers are bad!!
+            os << endl << dash_line_part;
+         }
+         os <<left << loop_names[iloop] << " (" << iloop << ") --> ";
+
+         for (unsigned ilv = 0; ilv < nvariants; ++ilv) {
+
+            LoopStat& stat = suite_run_info.
+                             getLoopStats(run_loop_variants[ilv])[iloop];
+
+            if ( stat.loop_is_run ) {
+
+               //
+               // Print separator line for new loop or new variant.
+               //
+               if ( ilv == 0 ) {
+                  os << endl;
+               } else {
+                  os << dot_line_part;
+               }
+
+               //
+               // Print checksum for each length of loop run.
+               //
+               for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) {
+
+                  if ( stat.loop_run_count[ilen] > 0 ) {
+
+                     string var_string(run_loop_variants[ilv] +
+                                         "(" + len_id[ilen] + ")");
+
+                     os << showpoint << setprecision(prec)
+                          <<left<< setw(var_field_len+1) << var_string;
+
+                     os <<right<< setw(prec_buf) << stat.loop_chksum[ilen];
+
+                     if ( ilv > 0 ) {
+                        // compare checksum to reference variant
+                        long double chksum_diff = fabs(
+                           stat.loop_chksum[ilen]-ref_chksum[ilen] );
+                        os <<right<< setw(prec_buf)
+                             << chksum_diff << endl;
+                     } else {
+                        os << endl;
+                     }
+
+                  }  // if loop length was run
+
+               } // iterate over loop lengths
+
+            } // if loop is run
+
+         }  // iterate over variants of loop run
+
+      }  // if loop name is not empty
+
+   }  // iterate over loops
+
+
+   os << dash_line;
+   os << "\n\n\n";
+
+   os.flush();
+}
+
+//
+// Generate FOM report file to given output stream.
+//
+void writeFOMReport(const vector< string >& run_loop_variants,
+                    ostream& os)
+{
+   LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo();
+   const unsigned nvariants = run_loop_variants.size();
+
+   //
+   //  Define some strings used to print FOM summary table.
+   //
+   string equal_line("===========================================================================================================\n");
+   string dash_line_part("-------------------------------------------------------\n");
+   string dot_line_part("............................................\n");
+
+   std::string ver_info = buildVersionInfo();
+
+   //
+   //  Print compilation summary information.
+   //
+   os << "\n\n\n";
+   os << equal_line;
+   os << equal_line;
+
+   os << "LCALS compilation summary: " << endl;
+   os << ver_info << endl;
+
+   //
+   //  Print FOM results header.
+   //
+   os << "\n\n";
+   os << equal_line;
+   os << equal_line;
+
+   os << "LCALS FOM results: " << endl;
+   os << equal_line;
+
+   vector<string>& len_name = suite_run_info.loop_length_names;
+
+   unsigned prec = 32;
+   //
+   // Output FOM for each loop variant (and loop lengths)
+   //
+   for (unsigned ilv = 0; ilv < nvariants; ++ilv) {
+
+      vector< int >& num_loops_run    = suite_run_info.num_loops_run[ilv];
+      vector< long double >& tot_time = suite_run_info.tot_time[ilv];
+      vector< long double >& fom_rel  = suite_run_info.fom_rel[ilv];
+      vector< long double >& fom_rate = suite_run_info.fom_rate[ilv];
+      // fom_rate is only referenced by the disabled (#if 0) output below.
+      os <<left << "Loop variant -- " << run_loop_variants[ilv] << endl;
+
+      for (unsigned ilen = 0; ilen < len_name.size(); ++ilen) {
+         os << "\t" << len_name[ilen]
+                    << " :   # loops run = " << num_loops_run[ilen];
+         os << showpoint << setprecision(prec)
+                    << " ,   total exec time = " << tot_time[ilen] << endl;
+         os << "\t\tFOM_relative = " << fom_rel[ilen] << endl;
+#if 0  // It's not clear what this FOM rate means...
+         os << "\t\tFOM_rate     = " << fom_rate[ilen] << endl;
+#endif
+         // Separator between lengths, omitted after the last one.
+         if ( ilen < len_name.size() - 1 ) {
+            os << dot_line_part;
+         }
+      }
+      // Separator between variants, omitted after the last one.
+      if ( ilv < nvariants - 1 ) {
+         os << endl << dash_line_part;
+      }
+
+   }
+
+   os << equal_line;
+   os << "\n\n\n";
+
+   os.flush();
+}
+
+//
+// Write mean run time report file: one row per loop that was run, one
+// column per loop length, fields separated by " , ".
+//
+void writeMeanTimeReport(const string& variant_name, 
+                         const string& output_dirname)
+{
+   // Report file is <output_dirname>/<variant_name>-meantime.txt.
+   const string rept_fname =
+      output_dirname + "/" + variant_name + string("-meantime.txt");
+
+   // Any existing report of the same name is truncated.
+   ofstream file(rept_fname.c_str(), ios::out | ios::trunc);
+   if ( !file ) {
+      cout << " ERROR: Can't open output file " << rept_fname << endl;
+   }
+   cout << "\n writeMeanTimeReport...   " << rept_fname << endl;
+
+   LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo();
+   vector<string>& loop_names = suite_run_info.loop_names;
+   vector<string>& len_names = suite_run_info.loop_length_names;
+
+   const string sepchr(" , ");
+   const unsigned prec = 8;
+
+   //
+   // Title line: variant name followed by one separator per length column.
+   //
+   file << variant_name << " Mean Run Times ";
+   for (unsigned icol = 0; icol < len_names.size(); ++icol) {
+      file << sepchr;
+   }
+   file << endl;
+
+   //
+   // Column header line: the name of each loop length.
+   //
+   for (unsigned icol = 0; icol < len_names.size(); ++icol) {
+      file << sepchr << len_names[icol];
+   }
+   file << endl;
+
+   //
+   // One row of mean times for each named loop that was run.
+   //
+   for (unsigned iloop = 0; iloop < loop_names.size(); ++iloop) {
+
+      LoopStat& stat =
+         suite_run_info.getLoopStats(variant_name)[iloop];
+
+      // Skip unnamed loops and loops that were not run.
+      if ( loop_names[iloop].empty() || !stat.loop_is_run ) { continue; }
+
+      file << loop_names[iloop];
+      for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) {
+         file << sepchr << setprecision(prec) << stat.mean[ilen];
+      }
+      file << endl;
+
+   }
+
+   file.flush();
+}
+
+//
+// Write relative run time report file: one row per loop that was run, one
+// column per loop length, fields separated by " , ".
+//
+void writeRelativeTimeReport(const string& variant_name, 
+                             const string& output_dirname)
+{
+   // Report file is <output_dirname>/<variant_name>-reltime.txt.
+   const string rept_fname =
+      output_dirname + "/" + variant_name + string("-reltime.txt");
+
+   // Any existing report of the same name is truncated.
+   ofstream file(rept_fname.c_str(), ios::out | ios::trunc);
+   if ( !file ) {
+      cout << " ERROR: Can't open output file " << rept_fname << endl;
+   }
+   cout << "\n writeRelativeTimeReport...   " << rept_fname << endl;
+
+   LoopSuiteRunInfo& suite_run_info = getLoopSuiteRunInfo();
+   vector<string>& loop_names = suite_run_info.loop_names;
+   vector<string>& len_names = suite_run_info.loop_length_names;
+
+   const string sepchr(" , ");
+   const unsigned prec = 6;
+
+   //
+   // Title line: variant name followed by one separator per length column.
+   //
+   file << variant_name << " Relative Run Times ";
+   for (unsigned icol = 0; icol < len_names.size(); ++icol) {
+      file << sepchr;
+   }
+   file << endl;
+
+   //
+   // Column header line: the name of each loop length.
+   //
+   for (unsigned icol = 0; icol < len_names.size(); ++icol) {
+      file << sepchr << len_names[icol];
+   }
+   file << endl;
+
+   //
+   // One row of relative times for each named loop that was run.
+   //
+   for (unsigned iloop = 0; iloop < loop_names.size(); ++iloop) {
+
+      LoopStat& stat =
+         suite_run_info.getLoopStats(variant_name)[iloop];
+
+      // Skip unnamed loops and loops that were not run.
+      if ( loop_names[iloop].empty() || !stat.loop_is_run ) { continue; }
+
+      file << loop_names[iloop];
+      for (unsigned ilen = 0; ilen < stat.loop_length.size(); ++ilen) {
+         file << sepchr << setprecision(prec) << stat.meanrel2ref[ilen];
+      }
+      file << endl;
+
+   }
+
+   file.flush();
+}
+
+//
+// Build string containing LCALS compilation information from
+// file created when make is invoked.  Returns an empty string when the
+// file cannot be opened.
+std::string buildVersionInfo()
+{
+   std::ifstream infile("lcalsversioninfo.txt", std::ios::in);
+   std::string ver_info;
+
+   // tellg() yields -1 on a failed stream; handing that to reserve() would
+   // request a near-SIZE_MAX capacity and throw std::length_error.
+   if ( !infile ) { return ver_info; }
+   infile.seekg(0, std::ios::end);
+   const std::streamoff nchars = infile.tellg();
+   infile.seekg(0, std::ios::beg);
+   if ( nchars > 0 ) { ver_info.reserve(static_cast<std::string::size_type>(nchars)); }
+   ver_info.assign((std::istreambuf_iterator<char>(infile)),
+                    std::istreambuf_iterator<char>());
+#if 0
+   std::string ver_info = "LCALS compilation info: \n" 
+              << "\tUser = " << VER_PERSON << "\n"
+              << "\tDate, Time = " << VER_DATE << " , " << VER_TIME << "\n"
+              << "\tMachine = " << VER_MACHINE << "\n"
+              << "\tOS = " << VER_OS << "\n"
+              << "\t-----------------------------------------------" << "\n"
+              << "\tCompiler + options = " << lcals_ver_info_values[0] << "\n"
+              << "\tLCALS rules (defines) = " << lcals_ver_info_values[1] << "\n";
+#endif
+   return ver_info;
+}
+
+};  // unnamed namespace
+
+
+
Index: MicroBenchmarks/LCALS/LCALSSuite.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSSuite.hxx
@@ -0,0 +1,586 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file with enums, macros, routines and structures used to
+// compile and run loops in LCALS suite and to generate execution
+// statistics.
+//
+
+#ifndef LCALSSuite_HXX
+#define LCALSSuite_HXX
+
+#include "LCALSParams.hxx" 
+#include "LCALSStats.hxx"
+
+#include <vector>
+#include <string>
+
+
+//
+// Enumeration defining unique id for each loop KERNEL in suite.
+//
+// IMPORTANT: Generally, this should not need modification unless
+//            new loops (i.e., kernels) are added to the suite.
+//
+// Note: To keep output understandable, keep this consistent with
+//       routine defineLoopSuiteRunInfo().
+//
+enum LoopKernelID {
+
+   // Keep this one first and don't comment out (!!)
+   // This ensures loop ids start at zero so that indexing into
+   // arrays and other data structures is correct.  Also, this loop
+   // is not executed the same way the others are.
+   REF_LOOP = 0,
+
+   //
+   // Loop Subset A: Loops extracted from LLNL app codes.
+   // They are implemented in runA<variant>Loops.cxx files.
+   //
+   PRESSURE_CALC,
+   PRESSURE_CALC_ALT,
+   ENERGY_CALC,
+   ENERGY_CALC_ALT,
+   VOL3D_CALC,
+   DEL_DOT_VEC_2D,
+   COUPLE,
+   FIR,
+
+   //
+   // Loop Subset B: "Basic" Loops.
+   // They are implemented in runB<variant>Loops.cxx files.
+   //
+   INIT3,
+   MULADDSUB,
+   IF_QUAD,
+   TRAP_INT,
+
+   //
+   // Loop Subset C: Loops from older Livermore Loops in "C" suite.
+   // They are implemented in runC<variant>Loops.cxx files.
+   //
+   HYDRO_1D,
+   ICCG,
+   INNER_PROD,
+   BAND_LIN_EQ,
+   TRIDIAG_ELIM,
+   EOS,
+   ADI,
+   INT_PREDICT,
+   DIFF_PREDICT,
+   FIRST_SUM,
+   FIRST_DIFF,
+   PIC_2D,
+   PIC_1D,
+   HYDRO_2D,
+   GEN_LIN_RECUR,
+   DISC_ORD,
+   MAT_X_MAT,
+   PLANCKIAN,
+   IMP_HYDRO_2D,
+   FIND_FIRST_MIN,
+
+   NUM_LOOP_KERNELS // Keep this one last and NEVER comment out (!!)
+
+};
+
+
+//
+// Enumeration defining unique id for each loop VARIANT in suite.
+//
+// IMPORTANT: Generally, this should not need modification unless
+//            new loop variants are added to the suite.
+//
+enum LoopVariantID {
+   //
+   // These variants define the LCALS benchmark
+   //
+   RAW,
+   RAW_OMP,
+   FORALL_LAMBDA,
+   FORALL_LAMBDA_OMP,
+
+#if defined(LCALS_DO_MISC)
+
+   //
+   // These variants are used in miscellaneous LCALS studies and
+   // exist only when LCALS_DO_MISC is defined
+   FORALL_HYBRID_LAMBDA,
+#if 0  // THESE ARE NOT AVAILABLE YET!!!
+   FORALL_HYBRID_LAMBDA_OMP,
+#endif
+   FORALL_FUNCTOR,
+   FORALL_FUNCTOR_OMP,
+#if 0  // THESE ARE NOT AVAILABLE YET!!!
+   FORALL_HYBRID_FUNCTOR,
+   FORALL_HYBRID_FUNCTOR_OMP,
+#endif
+   RAW_FUNC,
+   FORALL_LAMBDA_TYPEFIX,
+   FORALL_LAMBDA_OMP_TYPEFIX,
+   FORALL_HYBRID_LAMBDA_TYPEFIX,
+
+#endif // if LCALS_DO_MISC 
+
+};
+
+
+//
+// Enumeration defining possible loop lengths to run.  The values are used
+// as vector indices and NUM_LENGTHS as the vector size, so keep them dense.
+enum LoopLength {
+
+   LONG = 0,
+   MEDIUM,
+   SHORT,
+
+   NUM_LENGTHS // Keep this one last (!!)
+
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// The following macro constants define which loop VARIANTS can be compiled 
+// (and potentially) run for a given compiler.
+//
+// NOTE: The Makefile sets the LCALS_COMPILER_* macro constant.
+//
+//  --> IMPORTANT: Actual selection of which loop variants are run is done
+//                 in main.cxx via the vector 'run_variants'.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(LCALS_COMPILER_ICC)
+//
+// Configuration options for Intel compilers
+//
+
+#define COMPILE_RAW_VARIANTS
+#define COMPILE_LAMBDA_VARIANTS
+#define COMPILE_FUNCTOR_VARIANTS
+#define COMPILE_OMP_VARIANTS
+
+
+#elif defined(LCALS_COMPILER_GNU)
+//
+// Configuration options for GNU compilers
+//
+
+#define COMPILE_RAW_VARIANTS
+#define COMPILE_LAMBDA_VARIANTS
+#define COMPILE_FUNCTOR_VARIANTS
+#define COMPILE_OMP_VARIANTS
+
+
+#elif defined(LCALS_COMPILER_XLC12)
+//
+// Configuration options for IBM xlC compilers
+//
+
+//
+// xlC compilers DO NOT support lambda functions currently!!
+//
+#define COMPILE_RAW_VARIANTS
+#undef COMPILE_LAMBDA_VARIANTS
+#define COMPILE_FUNCTOR_VARIANTS
+#define COMPILE_OMP_VARIANTS
+
+
+#elif defined(LCALS_COMPILER_CLANG)
+//
+// Configuration options for clang compilers
+//
+
+//
+// Clang compilers DO NOT support OpenMP currently!!
+//
+#define COMPILE_RAW_VARIANTS
+#define COMPILE_LAMBDA_VARIANTS
+#define COMPILE_FUNCTOR_VARIANTS
+#undef COMPILE_OMP_VARIANTS
+
+
+#else
+#error LCALS compiler is undefined!
+
+#endif
+
+
+//
+// The following macro constants are used to turn on/off compilation of
+// individual loop KERNELS in suite.  Names are consistent with the
+// LoopKernelID enum above.
+//
+
+#if defined (LCALS_DO_OMP_ONLY)
+//
+// Only these loops have OpenMP implementations.  The implementations are
+// found in runOMP<variant>Loops.cxx files.
+//
+
+// Loop Subset A: Loops extracted from LLNL app codes.
+#define COMPILE_PRESSURE_CALC
+#define COMPILE_PRESSURE_CALC_ALT
+#define COMPILE_ENERGY_CALC
+#define COMPILE_ENERGY_CALC_ALT
+#define COMPILE_VOL3D_CALC
+#define COMPILE_DEL_DOT_VEC_2D
+#define COMPILE_COUPLE
+#define COMPILE_FIR
+
+// Loop Subset B: "Basic" Loops.
+#define COMPILE_INIT3
+#define COMPILE_MULADDSUB
+#define COMPILE_IF_QUAD
+#define COMPILE_TRAP_INT
+
+// Loop Subset C: Loops from older Livermore Loops in "C" suite.
+#define COMPILE_PIC_2D
+
+#else // compile all loop kernels
+//
+// Loop Subset A: Loops extracted from LLNL app codes.
+// They are implemented in runA<variant>Loops.cxx files.
+//
+#define COMPILE_PRESSURE_CALC
+#define COMPILE_PRESSURE_CALC_ALT
+#define COMPILE_ENERGY_CALC
+#define COMPILE_ENERGY_CALC_ALT
+#define COMPILE_VOL3D_CALC
+#define COMPILE_DEL_DOT_VEC_2D
+#define COMPILE_COUPLE
+#define COMPILE_FIR
+
+//
+// Loop Subset B: "Basic" Loops.
+// They are implemented in runB<variant>Loops.cxx files.
+//
+#define COMPILE_INIT3
+#define COMPILE_MULADDSUB
+#define COMPILE_IF_QUAD
+#define COMPILE_TRAP_INT
+
+//
+// Loop Subset C: Loops from older Livermore Loops in "C" suite.
+// They are implemented in runC<variant>Loops.cxx files.
+//
+#define COMPILE_HYDRO_1D
+#define COMPILE_ICCG
+#define COMPILE_INNER_PROD
+#define COMPILE_BAND_LIN_EQ
+#define COMPILE_TRIDIAG_ELIM
+#define COMPILE_EOS
+#define COMPILE_ADI
+#define COMPILE_INT_PREDICT
+#define COMPILE_DIFF_PREDICT
+#define COMPILE_FIRST_SUM
+#define COMPILE_FIRST_DIFF
+#define COMPILE_PIC_2D
+#define COMPILE_PIC_1D
+#define COMPILE_HYDRO_2D
+#define COMPILE_GEN_LIN_RECUR
+#define COMPILE_DISC_ORD
+#define COMPILE_MAT_X_MAT
+#define COMPILE_PLANCKIAN
+#define COMPILE_IMP_HYDRO_2D
+#define COMPILE_FIND_FIRST_MIN
+
+#endif
+
+
+
+//////////////////////////////////////////////////////////////////
+//
+//  Structure holding double arrays and scalars used in loops.
+//
+//  Note: These are initialized in allocateLoopData().
+//
+///////////////////////////////////////////////////////////////////
+
+struct LoopData
+{
+
+   //
+   // Structures to hold data for easy reinitialization
+   // (useful for verifying result checksums, etc.)
+   //  
+   struct RealArray 
+   {
+      int id;
+      Real_ptr data; 
+      Index_type len;
+   };
+
+   struct IndxArray
+   {
+      int id;
+      Index_type* data;
+      Index_type  len;
+   };
+
+   struct ComplexArray
+   {
+      int id;
+      Complex_ptr data;
+      Index_type  len;
+   };
+
+
+   Index_type max_loop_length;
+
+   //
+   // Static values indicating number of data arrays 
+   // of various forms used in loop suite.
+   //
+   // NOTE: These numbers may need to change to accommodate new loops.
+   //       Also, other arrays may need to be added. 
+   //
+   static const unsigned s_num_1D_Real_arrays = 16;
+   static const unsigned s_num_1D_Nx4_Real_arrays = 2;
+   static const unsigned s_num_1D_Indx_arrays = 5;
+   static const unsigned s_num_1D_Complex_arrays = 5;
+
+   static const unsigned s_num_2D_Nx25_Real_arrays = 4;
+   static const unsigned s_num_2D_7xN_Real_arrays = 11;
+   static const unsigned s_num_2D_64x64_Real_arrays = 1;
+
+   static const unsigned s_num_3D_2xNx4_Real_arrays = 3;
+
+   static const unsigned s_num_Real_scalars   = 10;
+
+   //
+   // NOTE: To see how the following data structures are related,
+   //       please see the routine allocateLoopData() in the 
+   //       file LCALSSuite.cxx.
+   //
+   //       The reason that we hold on to the same data in two 
+   //       different ways is two-fold:
+   //          1) The first set of arrays below makes it easy to 
+   //             access pointers to data based on what is used in 
+   //             each loop kernel; e.g., arrays of various dimensions.
+   //          2) The second set of arrays makes it easy to process
+   //             arrays for (re)initialization and checksum 
+   //             computation to verify results; e.g., we simply
+   //             iterate through 1-dim arrays without having to
+   //             know their lengths, if they are really being used 
+   //             as 2- or 3-dimensional arrays, for example. 
+   //
+
+   //
+   // Data arrays and scalars used in loop execution.
+   //
+   Real_ptr  array_1D_Real[s_num_1D_Real_arrays];
+   Real_ptr  array_1D_Nx4_Real[s_num_1D_Nx4_Real_arrays];
+   Index_type* array_1D_Indx[s_num_1D_Indx_arrays]; 
+   Complex_ptr array_1D_Complex[s_num_1D_Complex_arrays]; 
+
+   Real_ptr* array_2D_Nx25_Real[s_num_2D_Nx25_Real_arrays];
+   Real_ptr* array_2D_7xN_Real[s_num_2D_7xN_Real_arrays];
+   Real_ptr* array_2D_64x64_Real[s_num_2D_64x64_Real_arrays];
+
+   Real_ptr** array_3D_2xNx4_Real[s_num_3D_2xNx4_Real_arrays];
+
+   Real_type scalar_Real[s_num_Real_scalars];
+
+   //
+   // Arrays of structs holding data arrays used for data initialization
+   // and checksum verification.
+   //
+   RealArray RealArray_1D[s_num_1D_Real_arrays];
+   RealArray RealArray_1D_Nx4[s_num_1D_Nx4_Real_arrays];
+   IndxArray IndxArray_1D[s_num_1D_Indx_arrays];
+   ComplexArray ComplexArray_1D[s_num_1D_Complex_arrays];
+
+   RealArray RealArray_2D_Nx25[s_num_2D_Nx25_Real_arrays];
+   RealArray RealArray_2D_7xN[s_num_2D_7xN_Real_arrays];
+   RealArray RealArray_2D_64x64[s_num_2D_64x64_Real_arrays];
+
+   RealArray RealArray_3D_2xNx4[s_num_3D_2xNx4_Real_arrays];
+
+   RealArray RealArray_scalars;
+
+};
+
+//
+// Routine to access data structure that holds data needed to execute loops.
+// 
+LoopData& getLoopData();
+
+
+//
+//  Routine that generates vector of loop variant names string 
+//  from vector of LoopVariantID enum values.
+//
+std::vector<std::string> getVariantNames(
+   const std::vector<LoopVariantID>& lvids);
+
+//
+//  Routine that maps LoopVariantID enum value (used in main to help
+//  ensure correctness) to string (used in loop framework for flexibility).
+//
+std::string getVariantName(LoopVariantID lvid);
+
+
+//////////////////////////////////////////////////////////////////
+//
+//  Routines to define how loop suite will be run and
+//  to set up data for loop suite.
+//
+//////////////////////////////////////////////////////////////////
+
+//
+// Routines to define specific details about how to run loop suite.
+//
+// Note:  Individual loop lengths and sampling parameters 
+//        are defined in this routine.
+//
+void defineLoopSuiteRunInfo(const std::vector<LoopVariantID>& run_variants,
+                            bool run_loop[], 
+                            double sample_frac,
+                            double loop_length_factor );
+
+
+//
+// Routines to allocate and initialize arrays (and scalars) for 
+// loops in suite and to free those arrays when done.
+//
+void allocateLoopData();
+void freeLoopData();
+
+
+//
+// Routines to initialize and finalize loop data, statistics, timers, etc.  
+//
+// loopInit() must be called before, and loopFinalize() after, the
+// execution of each loop.
+//
+void loopInit(unsigned iloop, LoopStat& stat);
+void loopInit(unsigned iloop); //, LoopStat& stat);
+//
+void loopFinalize(unsigned iloop, LoopStat& stat, LoopLength ilength);
+
+
+//
+// Routines to run reference loops for figure of merit (FOM) calculations.
+//
+void defineReferenceLoopRunInfo();
+void computeReferenceLoopTimes();
+
+//
+// Routine called in main to execute loops corresponding to given 
+// variant ID and length.  The run_loop boolean array indicates which 
+// loop kernels in suite to execute
+//
+void runLoopVariant( LoopVariantID lvid,
+                     bool run_loop[],
+                     LoopLength ilength );
+                 
+
+//
+// Routines to run specific loop variants for suite.
+//
+// THESE SHOULD BE CALLED BY THE ROUTINE ABOVE, NOT DIRECTLY!!!
+//
+// loop_stats is vector of LoopStat objects corresponding to loop variant.
+// run_loop boolean array indicates which loop kernels in suite to execute.
+// ilength indicates which loop length to run (see LoopLength enum).
+//
+void runARawLoops( std::vector<LoopStat>& loop_stats,
+                   bool run_loop[],
+                   LoopLength ilength );
+void runBRawLoops( std::vector<LoopStat>& loop_stats,
+                   bool run_loop[],
+                   LoopLength ilength );
+void runCRawLoops( std::vector<LoopStat>& loop_stats,
+                   bool run_loop[],
+                   LoopLength ilength );
+
+void runARawFuncLoops( std::vector<LoopStat>& loop_stats,
+                       bool run_loop[],
+                       LoopLength ilength );
+void runBRawFuncLoops( std::vector<LoopStat>& loop_stats,
+                       bool run_loop[],
+                       LoopLength ilength );
+void runCRawFuncLoops( std::vector<LoopStat>& loop_stats,
+                       bool run_loop[],
+                       LoopLength ilength );
+
+void runOMPRawLoops( std::vector<LoopStat>& loop_stats,
+                     bool run_loop[],
+                     LoopLength ilength );
+
+
+void runAForallLambdaLoops( std::vector<LoopStat>& loop_stats,
+                            bool run_loop[],
+                            LoopLength ilength );
+void runBForallLambdaLoops( std::vector<LoopStat>& loop_stats,
+                            bool run_loop[],
+                            LoopLength ilength );
+void runCForallLambdaLoops( std::vector<LoopStat>& loop_stats,
+                            bool run_loop[],
+                            LoopLength ilength );
+void runOMPForallLambdaLoops( std::vector<LoopStat>& loop_stats,
+                              bool run_loop[],
+                              LoopLength ilength );
+
+void runAForallLambdaLoops_TYPEFIX( std::vector<LoopStat>& loop_stats,
+                                    bool run_loop[],
+                                    LoopLength ilength );
+void runBForallLambdaLoops_TYPEFIX( std::vector<LoopStat>& loop_stats,
+                                    bool run_loop[],
+                                    LoopLength ilength );
+void runCForallLambdaLoops_TYPEFIX( std::vector<LoopStat>& loop_stats,
+                                    bool run_loop[],
+                                    LoopLength ilength );
+void runOMPForallLambdaLoops_TYPEFIX( std::vector<LoopStat>& loop_stats,
+                                      bool run_loop[],
+                                      LoopLength ilength );
+
+
+void runAForallFunctorLoops( std::vector<LoopStat>& loop_stats,
+                             bool run_loop[],
+                             LoopLength ilength );
+void runBForallFunctorLoops( std::vector<LoopStat>& loop_stats,
+                             bool run_loop[],
+                             LoopLength ilength );
+void runCForallFunctorLoops( std::vector<LoopStat>& loop_stats,
+                             bool run_loop[],
+                             LoopLength ilength );
+void runOMPForallFunctorLoops( std::vector<LoopStat>& loop_stats,
+                               bool run_loop[],
+                               LoopLength ilength );
+
+void runAForallHybridLambdaLoops( std::vector<LoopStat>& loop_stats,
+                                  bool run_loop[],
+                                  LoopLength ilength );
+void runBForallHybridLambdaLoops( std::vector<LoopStat>& loop_stats,
+                                  bool run_loop[],
+                                  LoopLength ilength );
+void runCForallHybridLambdaLoops( std::vector<LoopStat>& loop_stats,
+                                  bool run_loop[],
+                                  LoopLength ilength );
+
+void runAForallHybridLambdaLoops_TYPEFIX( std::vector<LoopStat>& loop_stats,
+                                          bool run_loop[],
+                                          LoopLength ilength );
+void runBForallHybridLambdaLoops_TYPEFIX( std::vector<LoopStat>& loop_stats,
+                                          bool run_loop[],
+                                          LoopLength ilength );
+void runCForallHybridLambdaLoops_TYPEFIX( std::vector<LoopStat>& loop_stats,
+                                          bool run_loop[],
+                                          LoopLength ilength );
+
+//
+// Recursively construct directories based on a relative or
+// absolute path name.  Return true if directory created
+// successfully, else false.
+// 
+bool recursiveMkdir(const std::string& path);
+
+
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/LCALSSuite.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSSuite.cxx
@@ -0,0 +1,2519 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file with routines to allocate data for LCALS suite
+// and define parameters controlling execution of each loop.
+//
+
+#include "LCALSSuite.hxx"
+#include "LCALSStats.hxx"
+
+#include "SubsetDataA.hxx"
+
+#include<cstdlib>
+#include<string>
+#include<iostream>
+
+#include<sys/types.h>
+#include<sys/stat.h>
+
+//#define LCALS_OMP_MEM_INIT
+#undef LCALS_OMP_MEM_INIT
+
+//
+// File scope data holding structures used in loop suite
+//
+static LoopData* s_loop_data = 0;
+
+
+//
+// Default value for static ADomain member;
+//
+double ADomain::loop_length_factor = 1.0;
+
+//
+// Prototypes for file scope routines used to manage loop data and checksums
+//
+
+namespace {
+
+Real_ptr allocAndInitData(LoopData::RealArray& ra, Index_type len);
+Index_type* allocAndInitData(LoopData::IndxArray& ia, Index_type len);
+Complex_ptr allocAndInitData(LoopData::ComplexArray& ca, Index_type len);
+void initData(LoopData::RealArray& ra);
+void initData(LoopData::IndxArray& ia);
+void initData(LoopData::ComplexArray& ca);
+
+void initChksum(LoopStat& stat, LoopLength ilength);
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  const LoopData::RealArray& ra, Real_type scale_factor = 1.0);
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  Real_type val);
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  const LoopData::ComplexArray& ca, Real_type scale_factor = 1.0);
+
+}  // closing brace for unnamed namespace
+
+
+
+
+//
+// Accessor routine for suite kernel data.  Dereferences s_loop_data without
+// a null check; defineLoopSuiteRunInfo() allocates it if it is still null.
+LoopData& getLoopData() { return *s_loop_data; }
+
+
+//
+// Define how suite will run and initialize stat structures for loops.
+//
+// NOTE: Loop lengths, loop sample counts (and weights for optimization
+//       evaluation) are defined here!
+//
+// These values should be set large enough to accurately generate 
+// execution timings (i.e., not too small to be masked by CPU timing 
+// resolution and overhead).  The values set here were manually determined 
+// so that O(1) seconds of execution time is required to sample each loop 
+// on some of our fastest Intel machines.
+//
+void defineLoopSuiteRunInfo(const std::vector<LoopVariantID>& run_variants,
+                            bool run_loop[], 
+                            double sample_frac,
+                            double loop_length_factor)
+{
+#ifdef TESTSUITE
+   std::cout << "\n defineLoopSuiteRunInfo..." << std::endl;
+#endif
+   std::vector<std::string> run_variant_names = getVariantNames(run_variants);
+
+   if ( s_loop_data == 0 ) { 
+      s_loop_data = new LoopData(); 
+   }
+
+   //
+   //
+   // Enumeration defining loop groups for relative weighting of
+   // execution timing based on what we think is most important.
+   //
+   // In computation of figures of merit (FOM), loops with higher 
+   // weights will reduce FOM value more for higher run-time than 
+   // those with lower weights.
+   //
+   enum WeightGroup {
+
+      DATA_PARALLEL = 0,
+      ORDER_DEPENDENT,
+      TRANSCENDENTAL,
+      DATA_DEPENDENT,
+      POINTER_NEST,
+      COMPLEX,
+
+      NUM_WEIGHT_GROUPS  // Keep this one last and NEVER comment out (!!)
+   };
+
+
+   //
+   // Initialize structure holding loop suite execution data.
+   //
+   LoopSuiteRunInfo& suite_info = getLoopSuiteRunInfo();
+
+   suite_info.loop_samp_frac = sample_frac;
+
+   suite_info.loop_weights.resize(NUM_WEIGHT_GROUPS);
+   suite_info.loop_weights[DATA_PARALLEL]   = 2.0;
+   suite_info.loop_weights[ORDER_DEPENDENT] = 1.8;
+   suite_info.loop_weights[TRANSCENDENTAL]  = 1.7;
+   suite_info.loop_weights[DATA_DEPENDENT]  = 1.7;
+   suite_info.loop_weights[POINTER_NEST]    = 1.4;
+   suite_info.loop_weights[COMPLEX]         = 1.0;
+
+   suite_info.loop_length_names.resize(NUM_LENGTHS);
+   suite_info.loop_length_names[LONG] = std::string("LONG");
+   suite_info.loop_length_names[MEDIUM] = std::string("MEDIUM");
+   suite_info.loop_length_names[SHORT] = std::string("SHORT");
+
+   suite_info.num_loops_run.resize( run_variant_names.size() );
+   suite_info.tot_time.resize( run_variant_names.size() );
+   suite_info.fom_rel.resize( run_variant_names.size() );
+   suite_info.fom_rate.resize( run_variant_names.size() );
+
+   for (unsigned ilv = 0; ilv < run_variant_names.size(); ++ilv) {
+      suite_info.addLoopStats(run_variant_names[ilv]);
+
+      suite_info.num_loops_run[ilv].resize(NUM_LENGTHS, 0);
+      suite_info.tot_time[ilv].resize(NUM_LENGTHS, 0.0);
+      suite_info.fom_rel[ilv].resize(NUM_LENGTHS, 0.0);
+      suite_info.fom_rate[ilv].resize(NUM_LENGTHS, 0.0);
+   }
+
+
+   //
+   // Define common loop lengths for LONG, MEDIUM, SHORT loops.
+   //
+   // The values assigned here are propagated across all kernels 
+   // (with a few exceptions) to simplify suite configuration en masse.  
+   // These can also be set per-kernel below. 
+   //
+   std::vector< int > shared_loop_length(NUM_LENGTHS);
+   shared_loop_length[LONG]   = static_cast<int>(44217 * loop_length_factor);
+   shared_loop_length[MEDIUM] = static_cast<int>(5001 * loop_length_factor);
+   shared_loop_length[SHORT]  = static_cast<int>(171 * loop_length_factor);
+
+   ADomain::loop_length_factor = loop_length_factor;
+
+
+   std::vector<double>& weight = suite_info.loop_weights;
+
+   Index_type max_loop_length = 0;
+
+   for (unsigned iloop = 0 ; iloop < suite_info.num_loops; ++iloop) {
+
+      std::string loop_name;
+      LoopStat loop_stat(suite_info.num_loop_lengths);
+
+      Index_type max_loop_indx = 0;
+
+      if ( run_loop[iloop] ) {
+
+         switch ( iloop ) {
+   
+            case REF_LOOP : {
+               loop_name = std::string("REF_LOOP"); 
+               //
+               // Note: Reference loop stats are not used in
+               //       in suite.  Parameters are defined in 
+               //       defineReferenceLoopRunInfo( ) routine.
+               //
+               break;
+            }
+
+
+            //
+            // Parameters defining how loops in Subset A are run...
+            //
+            case PRESSURE_CALC   :
+            case PRESSURE_CALC_ALT   : {
+
+               if ( static_cast<LoopKernelID>(iloop) == PRESSURE_CALC ) {
+                  loop_name = std::string("PRESSURE_CALC");
+               } else {
+                  loop_name = std::string("PRESSURE_CALC_ALT");
+               }
+
+               loop_stat.loop_weight = weight[DATA_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 15000;
+               loop_stat.samples_per_pass[MEDIUM] = 200000;
+               loop_stat.samples_per_pass[SHORT]  = 10000000;
+
+               break;
+            }
+
+            case ENERGY_CALC   :
+            case ENERGY_CALC_ALT   : {
+
+               if ( static_cast<LoopKernelID>(iloop) == ENERGY_CALC ) {
+                  loop_name = std::string("ENERGY_CALC");
+               } else {
+                  loop_name = std::string("ENERGY_CALC_ALT");
+               } 
+
+               loop_stat.loop_weight = weight[DATA_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 3000;
+               loop_stat.samples_per_pass[MEDIUM] = 30000;
+               loop_stat.samples_per_pass[SHORT]  = 1000000;
+
+               break;
+            }
+
+            case VOL3D_CALC   : {
+               loop_name = std::string("VOL3D_CALC");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               Index_type ndims = 3;
+
+               ADomain Ldomain(LONG, ndims);
+               loop_stat.loop_length[LONG]   = Ldomain.lpz - Ldomain.fpz + 1;
+               ADomain Mdomain(MEDIUM, ndims);
+               loop_stat.loop_length[MEDIUM] = Mdomain.lpz - Mdomain.fpz + 1;
+               ADomain Sdomain(SHORT, ndims);
+               loop_stat.loop_length[SHORT]  = Sdomain.lpz - Sdomain.fpz + 1;
+
+               max_loop_indx = Ldomain.lpn;
+
+               loop_stat.samples_per_pass[LONG]   = 6500;
+               loop_stat.samples_per_pass[MEDIUM] = 30000;
+               loop_stat.samples_per_pass[SHORT]  = 800000;
+
+               break;
+            }
+
+            case DEL_DOT_VEC_2D : {
+               loop_name = std::string("DEL_DOT_VEC_2D");
+
+               loop_stat.loop_weight = weight[DATA_PARALLEL]; 
+
+               Index_type ndims = 2;
+
+               ADomain Ldomain(LONG, ndims);
+               loop_stat.loop_length[LONG]   = Ldomain.n_real_zones;
+               ADomain Mdomain(MEDIUM, ndims);
+               loop_stat.loop_length[MEDIUM] = Mdomain.n_real_zones;
+               ADomain Sdomain(SHORT, ndims);
+               loop_stat.loop_length[SHORT]  = Sdomain.n_real_zones;
+
+               max_loop_indx = Ldomain.lrn;
+
+               loop_stat.samples_per_pass[LONG]   = 4000;
+               loop_stat.samples_per_pass[MEDIUM] = 25000;
+               loop_stat.samples_per_pass[SHORT]  = 2000000;
+
+               break;
+            }
+
+            case COUPLE   : {
+               loop_name = std::string("COUPLE");
+
+               loop_stat.loop_weight = weight[TRANSCENDENTAL]; 
+
+               Index_type ndims = 3;
+
+               ADomain Ldomain(LONG, ndims);
+               loop_stat.loop_length[LONG]   = Ldomain.lpz - Ldomain.fpz + 1;
+               ADomain Mdomain(MEDIUM, ndims);
+               loop_stat.loop_length[MEDIUM] = Mdomain.lpz - Mdomain.fpz + 1;
+               ADomain Sdomain(SHORT, ndims);
+               loop_stat.loop_length[SHORT]  = Sdomain.lpz - Sdomain.fpz + 1;
+
+               max_loop_indx = Ldomain.lrn;
+
+               loop_stat.samples_per_pass[LONG]   = 2000;
+               loop_stat.samples_per_pass[MEDIUM] = 10000;
+               loop_stat.samples_per_pass[SHORT]  = 600000;
+
+               break;
+            }
+
+            case FIR   : {
+               loop_name = std::string("FIR");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 10000;
+               loop_stat.samples_per_pass[MEDIUM] = 80000;
+               loop_stat.samples_per_pass[SHORT]  = 3000000;
+
+               break;
+            }
+
+
+            //
+            // Parameters defining how loops in Subset B are run...
+            //
+            case INIT3 : {
+               loop_name = std::string("INIT3");
+
+               loop_stat.loop_weight = weight[DATA_PARALLEL]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 10000;
+               loop_stat.samples_per_pass[MEDIUM] = 110000;
+               loop_stat.samples_per_pass[SHORT]  = 12000000;
+
+               break;
+            }
+
+            case MULADDSUB : { 
+               loop_name = std::string("MULADDSUB"); 
+
+               loop_stat.loop_weight = weight[DATA_PARALLEL]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 12000;
+               loop_stat.samples_per_pass[MEDIUM] = 140000;
+               loop_stat.samples_per_pass[SHORT]  = 15000000;
+
+               break; 
+            }
+
+            case IF_QUAD : {
+               loop_name = std::string("IF_QUAD");
+
+               loop_stat.loop_weight = weight[DATA_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 3000;
+               loop_stat.samples_per_pass[MEDIUM] = 30000;
+               loop_stat.samples_per_pass[SHORT]  = 1000000;
+
+               break;
+            }
+
+            case TRAP_INT : {
+               loop_name = std::string("TRAP_INT");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 4000;
+               loop_stat.samples_per_pass[MEDIUM] = 32000;
+               loop_stat.samples_per_pass[SHORT]  = 1000000;
+
+               break;
+            }
+
+
+            //
+            // Parameters defining how loops in Subset C are run...
+            //
+            case HYDRO_1D : { 
+               loop_name = std::string("HYDRO_1D"); 
+
+               loop_stat.loop_weight = weight[DATA_PARALLEL]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 30000;
+               loop_stat.samples_per_pass[MEDIUM] = 320000;
+               loop_stat.samples_per_pass[SHORT]  = 15000000;
+
+               break; 
+            }
+
+            case ICCG : {
+               loop_name = std::string("ICCG"); 
+
+               loop_stat.loop_weight = weight[COMPLEX]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 20000;
+               loop_stat.samples_per_pass[MEDIUM] = 200000;
+               loop_stat.samples_per_pass[SHORT]  = 6000000;
+
+               break; 
+            }
+
+            case INNER_PROD : {
+               loop_name = std::string("INNER_PROD"); 
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 50000;
+               loop_stat.samples_per_pass[MEDIUM] = 600000;
+               loop_stat.samples_per_pass[SHORT]  = 30000000;
+
+               break; 
+            }
+
+            case BAND_LIN_EQ : {
+               loop_name = std::string("BAND_LIN_EQ"); 
+
+               loop_stat.loop_weight = weight[COMPLEX]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 40000;
+               loop_stat.samples_per_pass[MEDIUM] = 600000;
+               loop_stat.samples_per_pass[SHORT]  = 20000000;
+
+               break; 
+            }
+
+            case TRIDIAG_ELIM : {
+               loop_name = std::string("TRIDIAG_ELIM"); 
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 10000;
+               loop_stat.samples_per_pass[MEDIUM] = 100000;
+               loop_stat.samples_per_pass[SHORT]  = 3000000;
+
+               break; 
+            }
+
+            case EOS   : { 
+               loop_name = std::string("EOS");
+
+               loop_stat.loop_weight = weight[DATA_PARALLEL]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 18000;
+               loop_stat.samples_per_pass[MEDIUM] = 140000;
+               loop_stat.samples_per_pass[SHORT]  = 5000000;
+            
+               break; 
+            }
+
+            case ADI   : {
+               loop_name = std::string("ADI");
+
+               loop_stat.loop_weight = weight[COMPLEX]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 1000;
+               loop_stat.samples_per_pass[MEDIUM] = 9000;
+               loop_stat.samples_per_pass[SHORT]  = 300000;
+            
+               break; 
+            }
+
+            case INT_PREDICT   : {
+               loop_name = std::string("INT_PREDICT");
+
+               loop_stat.loop_weight = weight[POINTER_NEST]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 3000;
+               loop_stat.samples_per_pass[MEDIUM] = 30000;
+               loop_stat.samples_per_pass[SHORT]  = 2000000;
+            
+               break; 
+            }
+
+            case DIFF_PREDICT   : {
+               loop_name = std::string("DIFF_PREDICT");
+
+               loop_stat.loop_weight = weight[POINTER_NEST]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 2000;
+               loop_stat.samples_per_pass[MEDIUM] = 22000;
+               loop_stat.samples_per_pass[SHORT]  = 1800000;
+            
+               break; 
+            }
+
+            case FIRST_SUM   : { 
+               loop_name = std::string("FIRST_SUM");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 30000;
+               loop_stat.samples_per_pass[MEDIUM] = 250000;
+               loop_stat.samples_per_pass[SHORT]  = 8000000;
+
+               break; 
+            }
+
+            case FIRST_DIFF   : { 
+               loop_name = std::string("FIRST_DIFF");
+
+               loop_stat.loop_weight = weight[DATA_PARALLEL]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 30000;
+               loop_stat.samples_per_pass[MEDIUM] = 500000;
+               loop_stat.samples_per_pass[SHORT]  = 30000000;
+
+               break; 
+            }
+
+            case PIC_2D   : {
+               loop_name = std::string("PIC_2D");
+
+               loop_stat.loop_weight = weight[COMPLEX]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 2000;
+               loop_stat.samples_per_pass[MEDIUM] = 18000;
+               loop_stat.samples_per_pass[SHORT]  = 700000;
+
+               break; 
+            }
+
+            case PIC_1D   : {
+               loop_name = std::string("PIC_1D");
+
+               loop_stat.loop_weight = weight[DATA_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 3000;
+               loop_stat.samples_per_pass[MEDIUM] = 24000;
+               loop_stat.samples_per_pass[SHORT]  = 1000000;
+
+               break; 
+            }
+
+            case HYDRO_2D   : {   
+               loop_name = std::string("HYDRO_2D");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 300;
+               loop_stat.samples_per_pass[MEDIUM] = 2000;
+               loop_stat.samples_per_pass[SHORT]  = 50000;
+
+               break; 
+            }
+
+            case GEN_LIN_RECUR   : { 
+               loop_name = std::string("GEN_LIN_RECUR");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 4000;
+               loop_stat.samples_per_pass[MEDIUM] = 36000;
+               loop_stat.samples_per_pass[SHORT]  = 1000000;
+
+               break; 
+            }
+
+            case DISC_ORD   : { 
+               loop_name = std::string("DISC_ORD");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 1000;
+               loop_stat.samples_per_pass[MEDIUM] = 8000;
+               loop_stat.samples_per_pass[SHORT]  = 200000;
+
+               break; 
+            }
+
+            case MAT_X_MAT   : {   
+               loop_name = std::string("MAT_X_MAT");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 8;
+               loop_stat.samples_per_pass[MEDIUM] = 70;
+               loop_stat.samples_per_pass[SHORT]  = 8000;
+
+               break; 
+            }
+
+            case PLANCKIAN   : { 
+               loop_name = std::string("PLANCKIAN");
+
+               loop_stat.loop_weight = weight[TRANSCENDENTAL]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 4000;
+               loop_stat.samples_per_pass[MEDIUM] = 30000;
+               loop_stat.samples_per_pass[SHORT]  = 1000000;
+
+               break; 
+            }
+
+            case IMP_HYDRO_2D   : {
+               loop_name = std::string("IMP_HYDRO_2D");
+
+               loop_stat.loop_weight = weight[ORDER_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 800;
+               loop_stat.samples_per_pass[MEDIUM] = 6000;
+               loop_stat.samples_per_pass[SHORT]  = 150000;
+
+               break; 
+            }
+
+            case FIND_FIRST_MIN   : { 
+               loop_name = std::string("FIND_FIRST_MIN");
+
+               loop_stat.loop_weight = weight[DATA_DEPENDENT]; 
+
+               for (int i = 0; i < NUM_LENGTHS; ++i) {
+                  loop_stat.loop_length[i] = shared_loop_length[i];
+               }
+               max_loop_indx = loop_stat.loop_length[LONG];
+
+               loop_stat.samples_per_pass[LONG]   = 50000;
+               loop_stat.samples_per_pass[MEDIUM] = 330000;
+               loop_stat.samples_per_pass[SHORT]  = 8000000;
+
+               break; 
+            }
+
+            default : { 
+               std::cout << "\n Unknown loop id = " << iloop << std::endl; 
+            }
+
+         }  // switch statement on loop id
+
+      }  // if loop with id is to be run
+
+
+      suite_info.loop_names.push_back(loop_name); 
+
+      //
+      // Set max loop length to be largest loop index used over all loops.
+      // 
+      max_loop_length = 
+         std::max(max_loop_length, max_loop_indx);
+            
+      //
+      // Set number of times each loop length will be run.
+      // 
+      for (unsigned i = 0; i < suite_info.num_loop_lengths; ++i) {
+
+         loop_stat.samples_per_pass[i] = static_cast<int>(
+            loop_stat.samples_per_pass[i] * suite_info.loop_samp_frac / 
+            loop_length_factor); 
+
+         if ( suite_info.run_loop_length[i] ) {
+            loop_stat.loop_run_count[i] = 
+               loop_stat.samples_per_pass[i] * suite_info.num_suite_passes;
+         } else {
+            loop_stat.loop_run_count[i] = 0;
+         }
+
+      }
+
+      //
+      // We add loop stat for each loop to maintain consistent array indexing.
+      // However, only loops specified to be run will be executed.
+      // 
+      for (unsigned ilv = 0; ilv < run_variant_names.size(); ++ilv) {
+         suite_info.getLoopStats(run_variant_names[ilv]).push_back(loop_stat);
+      }
+
+   }  // loop over loop IDs
+
+
+   defineReferenceLoopRunInfo();
+
+   s_loop_data->max_loop_length = 
+      std::max(max_loop_length, suite_info.ref_loop_stat.loop_length[LONG]); 
+}
+
+//
+// Translate a list of LoopVariantID values into their name strings,
+// preserving order; each name is produced by getVariantName().
+//
+std::vector<std::string> getVariantNames(
+   const std::vector<LoopVariantID>& lvids)
+{
+   std::vector<std::string> names;
+   typedef std::vector<LoopVariantID>::const_iterator VariantIter;
+   for (VariantIter it = lvids.begin(); it != lvids.end(); ++it) {
+      names.push_back(getVariantName(*it));
+   }
+   return names;
+}
+
+//
+// Map a LoopVariantID enum value to its printable variant name.
+// An id with no matching case is reported on stdout and yields "".
+std::string getVariantName(LoopVariantID lvid)
+{
+   // Every recognized id returns its name straight from the switch.
+
+   switch ( lvid ) {
+
+// Benchmark variants
+//
+      case RAW:
+         return "Raw";
+
+      case RAW_OMP:
+         return "Raw_OMP";
+
+      case FORALL_LAMBDA:
+         return "Forall_Lambda";
+
+      case FORALL_LAMBDA_OMP:
+         return "Forall_Lambda_OMP";
+
+
+#if defined(LCALS_DO_MISC)
+
+// Misc variants
+//
+      case FORALL_HYBRID_LAMBDA:
+         return "Hybrid_Lambda";
+
+#if 0  // THESE ARE NOT AVAILABLE YET!!!
+      case FORALL_HYBRID_LAMBDA_OMP:
+         return "Hybrid_Lambda_OMP";
+
+#endif
+      case FORALL_FUNCTOR:
+         return "Forall_Functor";
+
+      case FORALL_FUNCTOR_OMP:
+         return "Forall_Functor_OMP";
+
+#if 0  // THESE ARE NOT AVAILABLE YET!!!
+      case FORALL_HYBRID_FUNCTOR:
+         return "Hybrid_Functor";
+
+      case FORALL_HYBRID_FUNCTOR_OMP:
+         return "Hybrid_Functor_OMP";
+
+#endif
+      case RAW_FUNC:
+         return "Raw_Func";
+
+      case FORALL_LAMBDA_TYPEFIX:
+         return "Forall_Lambda_TYPEFIX";
+
+      case FORALL_LAMBDA_OMP_TYPEFIX:
+         return "Forall_Lambda_OMP_TYPEFIX";
+
+      case FORALL_HYBRID_LAMBDA_TYPEFIX:
+         return "Hybrid_Lambda_TYPEFIX";
+
+
+#endif // if LCALS_DO_MISC
+
+      default:
+         std::cout << "\n Unknown loop variant id = " << lvid << std::endl; 
+         break;
+
+   }
+
+   return std::string();
+}
+
+
+//
+// Dispatch to the loop-runner routines for the variant given by 'lvid',
+// passing along that variant's LoopStat vector, run_loop, and ilength.
+void runLoopVariant( LoopVariantID lvid,
+                     bool run_loop[],
+                     LoopLength ilength )
+{
+   LoopSuiteRunInfo& suite_info = getLoopSuiteRunInfo();
+
+   // The variant's statistics vector is looked up by its name string.
+   std::string variant_name = getVariantName(lvid);
+   std::vector<LoopStat>& stats = suite_info.getLoopStats(variant_name);
+
+   switch ( lvid ) {
+
+// Benchmark variants
+//
+      case RAW:
+         runARawLoops(stats, run_loop, ilength);
+         runBRawLoops(stats, run_loop, ilength);
+         runCRawLoops(stats, run_loop, ilength);
+         break;
+
+      case FORALL_LAMBDA:
+         runAForallLambdaLoops(stats, run_loop, ilength);
+         runBForallLambdaLoops(stats, run_loop, ilength);
+         runCForallLambdaLoops(stats, run_loop, ilength);
+         break;
+
+      case RAW_OMP:
+         runOMPRawLoops(stats, run_loop, ilength);
+         break;
+
+      case FORALL_LAMBDA_OMP:
+         runOMPForallLambdaLoops(stats, run_loop, ilength);
+         break;
+
+
+#if defined(LCALS_DO_MISC)
+
+// Misc variants
+//
+      case FORALL_HYBRID_LAMBDA:
+         runAForallHybridLambdaLoops(stats, run_loop, ilength);
+         runBForallHybridLambdaLoops(stats, run_loop, ilength);
+         runCForallHybridLambdaLoops(stats, run_loop, ilength);
+         break;
+
+#if 0  // THESE ARE NOT DEFINED YET!!!
+      case FORALL_HYBRID_LAMBDA_OMP:
+         break;
+
+#endif
+      case FORALL_FUNCTOR:
+         runAForallFunctorLoops(stats, run_loop, ilength);
+         runBForallFunctorLoops(stats, run_loop, ilength);
+         runCForallFunctorLoops(stats, run_loop, ilength);
+         break;
+
+      case FORALL_FUNCTOR_OMP:
+         runOMPForallFunctorLoops(stats, run_loop, ilength);
+         break;
+
+#if 0  // THESE ARE NOT DEFINED YET!!!
+      case FORALL_HYBRID_FUNCTOR:
+         break;
+
+      case FORALL_HYBRID_FUNCTOR_OMP:
+         break;
+
+#endif
+      case RAW_FUNC:
+         runARawFuncLoops(stats, run_loop, ilength);
+         runBRawFuncLoops(stats, run_loop, ilength);
+         runCRawFuncLoops(stats, run_loop, ilength);
+         break;
+
+      case FORALL_LAMBDA_TYPEFIX:
+         runAForallLambdaLoops_TYPEFIX(stats, run_loop, ilength);
+         runBForallLambdaLoops_TYPEFIX(stats, run_loop, ilength);
+         runCForallLambdaLoops_TYPEFIX(stats, run_loop, ilength);
+         break;
+
+      case FORALL_LAMBDA_OMP_TYPEFIX:
+         runOMPForallLambdaLoops_TYPEFIX(stats, run_loop, ilength);
+         break;
+
+      case FORALL_HYBRID_LAMBDA_TYPEFIX:
+         runAForallHybridLambdaLoops_TYPEFIX(stats, run_loop, ilength);
+         runBForallHybridLambdaLoops_TYPEFIX(stats, run_loop, ilength);
+         runCForallHybridLambdaLoops_TYPEFIX(stats, run_loop, ilength);
+         break;
+
+
+#endif // if LCALS_DO_MISC
+
+      default:
+         std::cout << "\n Unknown loop variant id = " << lvid << std::endl; 
+         break;
+
+   }
+
+}
+
+
+//
+// Initialize data to run loop with given ID.  Note that this routine 
+// assumes that it is called before the loop with given ID is run and
+// that data initialization calls in here are consistent with what is
+// needed to execute the loop.
+//
+// Loop data is initialized in this routine so all variants of a loop
+// run the same way.  Note that data arrays are initialized for each
+// loop only when they are actually required.
+//
+//
+
+void loopInit(unsigned iloop, LoopStat& stat)
+{
+   LoopData& loop_data = getLoopData();
+   // Order: flushCache(), mark 'stat' as run, then initialize arrays for 'iloop'.
+   flushCache();
+
+   stat.loop_is_run = true;
+
+   // Only the arrays named in the matching case are initialized.
+   switch ( iloop ) {
+   
+      case REF_LOOP : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         break;
+      }
+      // Loop Subset A.
+      case PRESSURE_CALC   :
+      case PRESSURE_CALC_ALT   : {
+         // Both ids share one initialization sequence.
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case ENERGY_CALC   :
+      case ENERGY_CALC_ALT   : {
+         // Both ids share one initialization sequence.
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+         initData(loop_data.RealArray_1D[5]);
+         initData(loop_data.RealArray_1D[6]);
+         initData(loop_data.RealArray_1D[7]);
+         initData(loop_data.RealArray_1D[8]);
+         initData(loop_data.RealArray_1D[9]);
+         initData(loop_data.RealArray_1D[10]);
+         initData(loop_data.RealArray_1D[11]);
+         initData(loop_data.RealArray_1D[12]);
+         initData(loop_data.RealArray_1D[13]);
+         initData(loop_data.RealArray_1D[14]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case VOL3D_CALC   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+
+         break;
+      }
+
+      case DEL_DOT_VEC_2D : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case COUPLE   : {
+
+         initData(loop_data.ComplexArray_1D[0]);
+         initData(loop_data.ComplexArray_1D[1]);
+         initData(loop_data.ComplexArray_1D[2]);
+         initData(loop_data.ComplexArray_1D[3]);
+         initData(loop_data.ComplexArray_1D[4]);
+
+         break;
+      }
+
+      case FIR   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break;
+      }
+      // Loop Subset B.
+      case INIT3 : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case MULADDSUB : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case IF_QUAD : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case TRAP_INT : {
+
+         initData(loop_data.IndxArray_1D[0]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      // Loop Subset C.
+      case HYDRO_1D : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case ICCG : {
+
+         initData(loop_data.RealArray_1D_Nx4[0]);
+         initData(loop_data.RealArray_1D_Nx4[1]);
+
+         break;
+      }
+
+      case INNER_PROD : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break;
+      }
+
+      case BAND_LIN_EQ : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break;
+      }
+
+      case TRIDIAG_ELIM : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         break;
+      }
+
+      case EOS   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case ADI   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         initData(loop_data.RealArray_3D_2xNx4[0]);
+         initData(loop_data.RealArray_3D_2xNx4[1]);
+         initData(loop_data.RealArray_3D_2xNx4[2]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case INT_PREDICT   : {
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case DIFF_PREDICT   : {
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+         initData(loop_data.RealArray_2D_Nx25[1]);
+
+         break;
+      }
+
+      case FIRST_SUM   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break;
+      }
+
+      case FIRST_DIFF   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break;
+      }
+
+      case PIC_2D   : {
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+         initData(loop_data.RealArray_2D_Nx25[1]);
+         initData(loop_data.RealArray_2D_Nx25[2]);
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         initData(loop_data.IndxArray_1D[0]);
+         initData(loop_data.IndxArray_1D[1]);
+
+         initData(loop_data.RealArray_2D_64x64[0]);
+
+         break;
+      }
+
+      case PIC_1D   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+         initData(loop_data.RealArray_1D[5]);
+         initData(loop_data.RealArray_1D[6]);
+         initData(loop_data.RealArray_1D[7]);
+         initData(loop_data.RealArray_1D[8]);
+
+         initData(loop_data.RealArray_scalars);
+
+         initData(loop_data.IndxArray_1D[2]);
+         initData(loop_data.IndxArray_1D[3]);
+         initData(loop_data.IndxArray_1D[4]);
+
+         break;
+      }
+
+      case HYDRO_2D   : {
+
+         initData(loop_data.RealArray_2D_7xN[0]);
+         initData(loop_data.RealArray_2D_7xN[1]);
+         initData(loop_data.RealArray_2D_7xN[2]);
+         initData(loop_data.RealArray_2D_7xN[3]);
+         initData(loop_data.RealArray_2D_7xN[4]);
+         initData(loop_data.RealArray_2D_7xN[5]);
+         initData(loop_data.RealArray_2D_7xN[6]);
+         initData(loop_data.RealArray_2D_7xN[7]);
+         initData(loop_data.RealArray_2D_7xN[8]);
+         initData(loop_data.RealArray_2D_7xN[9]);
+         initData(loop_data.RealArray_2D_7xN[10]);
+
+         break;
+      }
+
+      case GEN_LIN_RECUR   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case DISC_ORD   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+         initData(loop_data.RealArray_1D[5]);
+         initData(loop_data.RealArray_1D[6]);
+         initData(loop_data.RealArray_1D[7]);
+         initData(loop_data.RealArray_1D[8]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case MAT_X_MAT   : {
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+         initData(loop_data.RealArray_2D_Nx25[1]);
+         initData(loop_data.RealArray_2D_64x64[0]);
+
+         break;
+      }
+
+      case PLANCKIAN   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case IMP_HYDRO_2D   : {
+
+         initData(loop_data.RealArray_2D_7xN[0]);
+         initData(loop_data.RealArray_2D_7xN[1]);
+         initData(loop_data.RealArray_2D_7xN[2]);
+         initData(loop_data.RealArray_2D_7xN[3]);
+         initData(loop_data.RealArray_2D_7xN[4]);
+         initData(loop_data.RealArray_2D_7xN[5]);
+
+         break;
+      }
+
+      case FIND_FIRST_MIN   : {
+
+         initData(loop_data.RealArray_1D[0]);
+
+         break;
+      }
+
+      // Unrecognized id: report it on stdout; no arrays are initialized.
+      default : {
+         std::cout << "\n Unknown loop id = " << iloop << std::endl;
+      }
+   }
+}
+
+/* *********** LLVM Test Suite ************* *
+ *                                           *
+ *   Overloaded for use in the test suite.   *
+ *   Removes LoopStat argument and setting   *
+ *   the loop as run.  Benchmark library     *
+ *   replaces the stat object for timing     *
+ *   statistics.                             *
+ *                                           *
+ * ***************************************** */
+
+void loopInit(unsigned iloop) //, LoopStat& stat)
+{
+   LoopData& loop_data = getLoopData();
+
+   flushCache();
+
+//   stat.loop_is_run = true;
+
+
+   switch ( iloop ) {
+   
+      case REF_LOOP : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         break;
+      }
+
+
+      //
+      // Initialize data for Loop Subset A...
+      //
+      case PRESSURE_CALC   :
+      case PRESSURE_CALC_ALT   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         initData(loop_data.RealArray_scalars); 
+
+         break;
+      }
+
+      case ENERGY_CALC   :
+      case ENERGY_CALC_ALT   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+         initData(loop_data.RealArray_1D[5]);
+         initData(loop_data.RealArray_1D[6]);
+         initData(loop_data.RealArray_1D[7]);
+         initData(loop_data.RealArray_1D[8]);
+         initData(loop_data.RealArray_1D[9]);
+         initData(loop_data.RealArray_1D[10]);
+         initData(loop_data.RealArray_1D[11]);
+         initData(loop_data.RealArray_1D[12]);
+         initData(loop_data.RealArray_1D[13]);
+         initData(loop_data.RealArray_1D[14]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+      case VOL3D_CALC   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+
+         break;
+      }
+
+      case DEL_DOT_VEC_2D : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case COUPLE   : {
+
+         initData(loop_data.ComplexArray_1D[0]);
+         initData(loop_data.ComplexArray_1D[1]);
+         initData(loop_data.ComplexArray_1D[2]);
+         initData(loop_data.ComplexArray_1D[3]);
+         initData(loop_data.ComplexArray_1D[4]);
+
+         break;
+      }
+
+      case FIR   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break;
+      }
+
+
+      //
+      // Initialize data for Loop Subset B...
+      //
+      case INIT3 : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case MULADDSUB : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break; 
+      }
+
+      case IF_QUAD : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case TRAP_INT : {
+
+         initData(loop_data.IndxArray_1D[0]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break;
+      }
+
+
+      //
+      // Initialize data for Loop Subset C...
+      //
+      case HYDRO_1D : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break; 
+      }
+
+      case ICCG : {
+
+         initData(loop_data.RealArray_1D_Nx4[0]);
+         initData(loop_data.RealArray_1D_Nx4[1]);
+
+         break; 
+      }
+
+      case INNER_PROD : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break; 
+      }
+
+      case BAND_LIN_EQ : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break; 
+      }
+
+      case TRIDIAG_ELIM : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         break; 
+      }
+
+      case EOS   : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+
+         initData(loop_data.RealArray_scalars);
+      
+         break; 
+      }
+
+      case ADI   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         initData(loop_data.RealArray_3D_2xNx4[0]);
+         initData(loop_data.RealArray_3D_2xNx4[1]);
+         initData(loop_data.RealArray_3D_2xNx4[2]);
+
+         initData(loop_data.RealArray_scalars);
+      
+         break; 
+      }
+
+      case INT_PREDICT   : {
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+
+         initData(loop_data.RealArray_scalars);
+      
+         break; 
+      }
+
+      case DIFF_PREDICT   : {
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+         initData(loop_data.RealArray_2D_Nx25[1]);
+      
+         break; 
+      }
+
+      case FIRST_SUM   : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break; 
+      }
+
+      case FIRST_DIFF   : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         break; 
+      }
+
+      case PIC_2D   : {
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+         initData(loop_data.RealArray_2D_Nx25[1]);
+         initData(loop_data.RealArray_2D_Nx25[2]);
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+
+         initData(loop_data.IndxArray_1D[0]);
+         initData(loop_data.IndxArray_1D[1]);
+
+         initData(loop_data.RealArray_2D_64x64[0]);
+
+         break; 
+      }
+
+      case PIC_1D   : {
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+         initData(loop_data.RealArray_1D[5]);
+         initData(loop_data.RealArray_1D[6]);
+         initData(loop_data.RealArray_1D[7]);
+         initData(loop_data.RealArray_1D[8]);
+
+         initData(loop_data.RealArray_scalars);
+
+         initData(loop_data.IndxArray_1D[2]);
+         initData(loop_data.IndxArray_1D[3]);
+         initData(loop_data.IndxArray_1D[4]);
+
+         break; 
+      }
+
+      case HYDRO_2D   : {   
+
+         initData(loop_data.RealArray_2D_7xN[0]);
+         initData(loop_data.RealArray_2D_7xN[1]);
+         initData(loop_data.RealArray_2D_7xN[2]);
+         initData(loop_data.RealArray_2D_7xN[3]);
+         initData(loop_data.RealArray_2D_7xN[4]);
+         initData(loop_data.RealArray_2D_7xN[5]);
+         initData(loop_data.RealArray_2D_7xN[6]);
+         initData(loop_data.RealArray_2D_7xN[7]);
+         initData(loop_data.RealArray_2D_7xN[8]);
+         initData(loop_data.RealArray_2D_7xN[9]);
+         initData(loop_data.RealArray_2D_7xN[10]);
+
+         break; 
+      }
+
+      case GEN_LIN_RECUR   : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break; 
+      }
+
+      case DISC_ORD   : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+         initData(loop_data.RealArray_1D[5]);
+         initData(loop_data.RealArray_1D[6]);
+         initData(loop_data.RealArray_1D[7]);
+         initData(loop_data.RealArray_1D[8]);
+
+         initData(loop_data.RealArray_scalars);
+
+         break; 
+      }
+
+      case MAT_X_MAT   : {   
+
+         initData(loop_data.RealArray_2D_Nx25[0]);
+         initData(loop_data.RealArray_2D_Nx25[1]);
+         initData(loop_data.RealArray_2D_64x64[0]);
+
+         break; 
+      }
+
+      case PLANCKIAN   : { 
+
+         initData(loop_data.RealArray_1D[0]);
+         initData(loop_data.RealArray_1D[1]);
+         initData(loop_data.RealArray_1D[2]);
+         initData(loop_data.RealArray_1D[3]);
+         initData(loop_data.RealArray_1D[4]);
+
+         break; 
+      }
+
+      case IMP_HYDRO_2D   : {
+
+         initData(loop_data.RealArray_2D_7xN[0]);
+         initData(loop_data.RealArray_2D_7xN[1]);
+         initData(loop_data.RealArray_2D_7xN[2]);
+         initData(loop_data.RealArray_2D_7xN[3]);
+         initData(loop_data.RealArray_2D_7xN[4]);
+         initData(loop_data.RealArray_2D_7xN[5]);
+
+         break; 
+      }
+
+      case FIND_FIRST_MIN   : { 
+
+         initData(loop_data.RealArray_1D[0]);
+
+         break; 
+      }
+
+
+      default : { 
+         std::cout << "\n Unknown loop id = " << iloop << std::endl; 
+      }
+
+   }  // switch statement on loop id
+
+}
+
+
+//
+// Finalize data for loop with given ID.  Note that this routine assumes
+// that it is called after the loop with given ID is run and that checksum
+// calls in here are consistent with what is needed for the loop.
+//
+void loopFinalize(unsigned iloop, LoopStat& stat, LoopLength ilength)
+{  // Body is compiled only when LCALS_VERIFY_CHECKSUM is defined.
+#if defined(LCALS_VERIFY_CHECKSUM)
+   initChksum(stat, ilength);
+   // Checksum for ilength starts at zero; each case below adds to it.
+   LoopData& loop_data = getLoopData();
+
+   switch ( iloop ) {
+   
+      case REF_LOOP : {
+
+         // Nothing to do for REF_LOOP case...
+
+         break;
+      }
+
+
+      //
+      // Update checksums for Loop Subset A...
+      //
+      case PRESSURE_CALC   :
+      case PRESSURE_CALC_ALT   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[2]);
+
+         break;
+      }
+
+      case ENERGY_CALC   :
+      case ENERGY_CALC_ALT     : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[5]);
+
+         break;
+      }
+
+      case VOL3D_CALC   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[3]);
+
+         break;
+      }
+
+      case DEL_DOT_VEC_2D : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case COUPLE   : {
+
+         updateChksum(stat, ilength, loop_data.ComplexArray_1D[0]);
+         updateChksum(stat, ilength, loop_data.ComplexArray_1D[1]);
+         updateChksum(stat, ilength, loop_data.ComplexArray_1D[2]);
+
+         break;
+      }
+
+      case FIR   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+
+         break;
+      }
+
+
+      //
+      // Update checksums for Loop Subset B...
+      //
+      case INIT3 : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[1]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[2]);
+
+         break;
+      }
+
+      case MULADDSUB : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[1]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[2]);
+
+         break; 
+      }
+
+      case IF_QUAD : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[3]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[4]);
+
+         break;
+      }
+
+      case TRAP_INT : {
+
+         updateChksum(stat, ilength, loop_data.scalar_Real[0]);  // scalar value
+
+         break;
+      }
+
+
+      //
+      // Update checksums for Loop Subset C...
+      //
+      case HYDRO_1D : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+
+         break; 
+      }
+
+      case ICCG : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D_Nx4[0]);
+
+         break; 
+      }
+
+      case INNER_PROD : {
+
+         updateChksum(stat, ilength, loop_data.scalar_Real[0]);  // scalar value
+
+         break; 
+      }
+
+      case BAND_LIN_EQ : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+
+         break; 
+      }
+
+      case TRIDIAG_ELIM : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]); 
+
+         break; 
+      }
+
+      case EOS   : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+
+         break; 
+      }
+
+      case ADI   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_3D_2xNx4[0]);
+         updateChksum(stat, ilength, loop_data.RealArray_3D_2xNx4[1]);
+         updateChksum(stat, ilength, loop_data.RealArray_3D_2xNx4[2]);
+
+         break; 
+      }
+
+      case INT_PREDICT   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]);
+
+         break; 
+      }
+
+      case DIFF_PREDICT   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]);
+
+         break; 
+      }
+
+      case FIRST_SUM   : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+
+         break; 
+      }
+
+      case FIRST_DIFF   : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+
+         break; 
+      }
+
+      case PIC_2D   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]);
+         updateChksum(stat, ilength, loop_data.RealArray_2D_64x64[0]); 
+
+         break; 
+      }
+
+      case PIC_1D   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[6]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[1]);
+         updateChksum(stat, ilength, loop_data.RealArray_1D[7]);
+
+         break; 
+      }
+
+      case HYDRO_2D   : {   
+
+         updateChksum(stat, ilength, loop_data.RealArray_2D_7xN[9]);
+         updateChksum(stat, ilength, loop_data.RealArray_2D_7xN[10]);
+
+         break; 
+      }
+
+      case GEN_LIN_RECUR   : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[0]);
+
+         break; 
+      }
+
+      case DISC_ORD   : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[7]);
+
+         break; 
+      }
+
+      case MAT_X_MAT   : {   
+
+         updateChksum(stat, ilength, loop_data.RealArray_2D_Nx25[0]); 
+
+         break; 
+      }
+
+      case PLANCKIAN   : { 
+
+         updateChksum(stat, ilength, loop_data.RealArray_1D[4]);
+
+         break; 
+      }
+
+      case IMP_HYDRO_2D   : {
+
+         updateChksum(stat, ilength, loop_data.RealArray_2D_7xN[0]);
+
+         break; 
+      }
+
+      case FIND_FIRST_MIN   : { 
+
+         updateChksum(stat, ilength, loop_data.scalar_Real[0]);  // scalar value
+
+         break; 
+      }
+
+
+      default : { 
+         std::cout << "\n Unknown loop id = " << iloop << std::endl; 
+      }
+
+   }  // switch statement on loop id
+
+#endif  // if LCALS_VERIFY_CHECKSUM
+}
+
+
+//
+// Allocate and initialize arrays (and scalars) used to execute loops in suite.
+//
+void allocateLoopData()
+{
+#ifdef TESTSUITE
+   std::cout << "\n allocateLoopData..." << std::endl;
+#endif
+   unsigned num_aligned_segments = 
+      (s_loop_data->max_loop_length + 20)/LCALS_DATA_ALIGN + 1;
+   unsigned aligned_chunksize = num_aligned_segments * LCALS_DATA_ALIGN;
+
+   //
+   //  Allocate and initialize 1D loop length Real arrays.  Every array is
+   //  aligned_chunksize long, a multiple of LCALS_DATA_ALIGN.
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Real_arrays; ++i) { 
+      Index_type data_len = aligned_chunksize;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_1D;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len); 
+
+      s_loop_data->array_1D_Real[i] = data;
+   }
+
+   //
+   //  Allocate and initialize 1D loop length X 4 Real arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Nx4_Real_arrays; ++i) {
+      Index_type data_len = aligned_chunksize*4;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_1D_Nx4;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len);
+
+      s_loop_data->array_1D_Nx4_Real[i] = data;
+   }
+
+   //
+   //  Allocate and initialize 1D loop length Indx arrays.  Ids here are
+   //  0-based, unlike the other array kinds, which use i+1.
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Indx_arrays; ++i) {
+      Index_type data_len = aligned_chunksize;
+
+      LoopData::IndxArray* iarray = s_loop_data->IndxArray_1D;
+      iarray[i].id = i;
+      Index_type* data = allocAndInitData(iarray[i], data_len);
+
+      s_loop_data->array_1D_Indx[i] = data;
+   }
+
+   //
+   //  Allocate and initialize 1D loop length Complex arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Complex_arrays; ++i) {
+      Index_type data_len = aligned_chunksize;
+
+      LoopData::ComplexArray* carray = s_loop_data->ComplexArray_1D;
+      carray[i].id = i+1;
+      Complex_ptr data = allocAndInitData(carray[i], data_len);
+
+      s_loop_data->array_1D_Complex[i] = data;
+   }
+
+   //
+   //  Allocate and initialize 2D loop length X 25 Real arrays.  Row k of
+   //  the pointer table addresses element k*25 of the flat buffer.
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_Nx25_Real_arrays; ++i) {
+      Index_type data_len = aligned_chunksize*25;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_2D_Nx25;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len);
+
+      s_loop_data->array_2D_Nx25_Real[i] = new Real_ptr[aligned_chunksize];
+      for (Index_type k = 0; k < aligned_chunksize; ++k) {
+         s_loop_data->array_2D_Nx25_Real[i][k] = &data[k*25];
+      }
+   }
+
+   //
+   //  Allocate and initialize 2D 7 X loop length Real arrays.  Row k of
+   //  the pointer table addresses element k*aligned_chunksize.
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_7xN_Real_arrays; ++i) {
+      Index_type data_len = 7*aligned_chunksize;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_2D_7xN;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len);
+
+      s_loop_data->array_2D_7xN_Real[i] = new Real_ptr[7];
+      for (Index_type k = 0; k < 7; ++k) {
+         s_loop_data->array_2D_7xN_Real[i][k] = &data[k*aligned_chunksize];
+      }
+   }
+
+   //
+   //  Allocate and initialize 2D 64 X 64 Real arrays.
+   //
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_64x64_Real_arrays; ++i) {
+      Index_type data_len = 64*64;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_2D_64x64;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len);
+
+      s_loop_data->array_2D_64x64_Real[i] = new Real_ptr[64];
+      for (Index_type k = 0; k < 64; ++k) {
+         s_loop_data->array_2D_64x64_Real[i][k] = &data[k*64];
+      }
+   }
+
+   //  Allocate and initialize 3D 2 X loop length X 4 Real arrays.
+   //  NOTE(review): the row offset below is k*l*4, so every k == 0 row
+   //  aliases data[0]; (k*aligned_chunksize + l)*4 may have been intended.
+   for (unsigned i = 0; i < s_loop_data->s_num_3D_2xNx4_Real_arrays; ++i) {
+      Index_type data_len = 2*aligned_chunksize*4;
+
+      LoopData::RealArray* rarray = s_loop_data->RealArray_3D_2xNx4;
+      rarray[i].id = i+1;
+      Real_ptr data = allocAndInitData(rarray[i], data_len);
+
+      s_loop_data->array_3D_2xNx4_Real[i] = new Real_ptr*[2];
+      for (Index_type k = 0; k < 2; ++k) {
+         s_loop_data->array_3D_2xNx4_Real[i][k] = new Real_ptr[aligned_chunksize];
+      }
+
+      for (Index_type k = 0; k < 2; ++k) {
+         for (Index_type l = 0; l < aligned_chunksize; ++l) {
+            s_loop_data->array_3D_2xNx4_Real[i][k][l] = &data[k*l*4];
+         }
+      }
+   }
+
+
+   //
+   //  Initialize Real scalars.  RealArray_scalars wraps the scalar_Real
+   //  storage so that initData can fill it like any other Real array.
+   s_loop_data->RealArray_scalars.id = 21;
+   s_loop_data->RealArray_scalars.data = s_loop_data->scalar_Real;
+   s_loop_data->RealArray_scalars.len = s_loop_data->s_num_Real_scalars;
+   initData(s_loop_data->RealArray_scalars);
+
+}
+
+//
+// Free arrays used in loop suite loop execution (allocated in routine above).
+//
+void freeLoopData()
+{
+   // Nothing to release when no loop data exists.  (The guard must test
+   // for null: returning on non-null would skip every free below.)
+   if ( s_loop_data == 0 )  return;
+#ifdef TESTSUITE
+   std::cout << "\n freeLoopData..." << std::endl;
+#endif
+   // De-allocate 1D loop length Real arrays (posix_memalign storage).
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_1D_Real[i].get() );
+#else
+      free( s_loop_data->array_1D_Real[i] );
+#endif
+   }
+
+   // De-allocate 1D loop length X 4 Real arrays.
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Nx4_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_1D_Nx4_Real[i].get() );
+#else
+      free( s_loop_data->array_1D_Nx4_Real[i] );
+#endif
+   }
+
+   // De-allocate 1D loop length Indx arrays.
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Indx_arrays; ++i) {
+      free( s_loop_data->array_1D_Indx[i] );
+   }
+
+   // De-allocate 1D loop length Complex arrays.  They are created with
+   // new [] in allocAndInitData, so delete [] is the matching release.
+   for (unsigned i = 0; i < s_loop_data->s_num_1D_Complex_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      delete [] s_loop_data->array_1D_Complex[i].get();
+#else
+      delete [] s_loop_data->array_1D_Complex[i];
+#endif
+   }
+
+   // De-allocate 2D loop length X 25 Real arrays: row 0 is the start of
+   // the flat buffer; the row table itself was created with new [].
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_Nx25_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_2D_Nx25_Real[i][0].get() );
+#else
+      free( s_loop_data->array_2D_Nx25_Real[i][0] );
+#endif
+      delete [] s_loop_data->array_2D_Nx25_Real[i];
+   }
+
+   // De-allocate 2D 7 X loop length Real arrays.
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_7xN_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_2D_7xN_Real[i][0].get() );
+#else
+      free( s_loop_data->array_2D_7xN_Real[i][0] );
+#endif
+      delete [] s_loop_data->array_2D_7xN_Real[i];
+   }
+
+   // De-allocate 2D 64 X 64 Real arrays.
+   for (unsigned i = 0; i < s_loop_data->s_num_2D_64x64_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_2D_64x64_Real[i][0].get() );
+#else
+      free( s_loop_data->array_2D_64x64_Real[i][0] );
+#endif
+      delete [] s_loop_data->array_2D_64x64_Real[i];
+   }
+
+   // De-allocate 3D 2 X loop length X 4 Real arrays.
+   for (unsigned i = 0; i < s_loop_data->s_num_3D_2xNx4_Real_arrays; ++i) {
+#if defined(USE_PTR_CLASS)
+      free( s_loop_data->array_3D_2xNx4_Real[i][0][0].get() );
+#else
+      free( s_loop_data->array_3D_2xNx4_Real[i][0][0] );
+#endif
+      for (Index_type k = 0; k < 2; ++k) {
+         delete [] s_loop_data->array_3D_2xNx4_Real[i][k];
+      }
+      delete [] s_loop_data->array_3D_2xNx4_Real[i];
+   }
+
+   delete s_loop_data;
+   s_loop_data = 0;
+}
+
+
+//
+// Implementations of file scope routines used to manage loop data 
+// and checksums
+//
+
+namespace {
+
+
+//
+// Helpers that allocate one array, record it in its descriptor and fill
+// it with the reproducible pattern the checksums rely on.
+//
+Real_ptr allocAndInitData(LoopData::RealArray& ra, Index_type len)
+{
+   // Aligned storage for len Reals; the return code is not checked.
+   Real_ptr storage = 0;
+   posix_memalign( (void **)&storage, LCALS_DATA_ALIGN,
+                   len*sizeof(Real_type) );
+   ra.len = len;
+   ra.data = storage;
+   initData(ra);
+   return storage;
+}
+
+Index_type* allocAndInitData(LoopData::IndxArray& ia, Index_type len)
+{
+   // Aligned storage for len indices; the return code is not checked.
+   Index_type* storage = 0;
+   posix_memalign( (void **)&storage, LCALS_DATA_ALIGN,
+                   len*sizeof(Index_type) );
+   ia.len = len;
+   ia.data = storage;
+   initData(ia);
+   return storage;
+}
+
+Complex_ptr allocAndInitData(LoopData::ComplexArray& ca, Index_type len)
+{
+   // Complex arrays come from new [] rather than posix_memalign.
+   Complex_ptr storage = new Complex_type[len];
+
+   ca.len = len;
+   ca.data = storage;
+   initData(ca);
+   return storage;
+}
+
+void initData(LoopData::RealArray& ra)
+{
+   // Odd ids use a scale factor of 0.1, even ids use 0.2; element n is
+   // then scale*(n + 1.1)/(n + 1.12345).
+   const int array_id = ra.id;
+   Real_type scale;
+   if ( array_id % 2 ) { scale = 0.1; } else { scale = 0.2; }
+
+   Real_ptr values = ra.data;
+   const Index_type count = ra.len;
+#if defined(LCALS_OMP_MEM_INIT)
+#pragma omp parallel for
+#endif
+   for (Index_type n = 0; n < count; ++n) {
+      values[n] = scale*(n + 1.1)/(n + 1.12345);
+   }
+}
+
+void initData(LoopData::IndxArray& ia)
+{
+   // Index arrays are zero-filled regardless of their id.
+   Index_type* values = ia.data;
+   const Index_type count = ia.len;
+
+   // The fill loop is the same with and without LCALS_OMP_MEM_INIT; only
+   // the OpenMP pragma is conditional.
+#if defined(LCALS_OMP_MEM_INIT)
+#pragma omp parallel for
+#endif
+   for (Index_type n = 0; n < count; ++n) {
+      values[n] = 0;
+   }
+}
+
+void initData(LoopData::ComplexArray& ca)
+{
+   // Same fill pattern as the Real arrays, with a complex scale factor
+   // chosen by the parity of the array id.
+   const int array_id = ca.id;
+   const Complex_type odd_scale(0.1,0.2);
+   const Complex_type even_scale(0.2,0.3);
+   Complex_type scale = ( array_id % 2 ? odd_scale : even_scale );
+
+   Complex_ptr values = ca.data;
+   const Index_type count = ca.len;
+#if defined(LCALS_OMP_MEM_INIT)
+#pragma omp parallel for
+#endif
+   for (Index_type n = 0; n < count; ++n) {
+      values[n] = scale*(n + 1.1)/(n + 1.12345);
+   }
+}
+
+
+//
+// Reset the check sum stored for the given loop length.
+//
+void initChksum(LoopStat& stat, LoopLength ilength)
+{
+   stat.loop_chksum[ilength] = 0.0;
+}
+
+//
+// Fold an array (weighted by 1-based position) or a scalar into the
+// check sum stored for the given loop length.
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  const LoopData::RealArray& ra,
+                  Real_type scale_factor)
+{
+   long double running = stat.loop_chksum[ilength];   // long double total
+   Real_ptr values = ra.data;
+   const Index_type count = ra.len;
+   for (Index_type n = 0; n < count; ++n) {
+      running += (n+1)*values[n]*scale_factor;
+   }
+   stat.loop_chksum[ilength] = running;
+}
+
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  Real_type val)
+{
+   stat.loop_chksum[ilength] += val;   // scalar results are added as-is
+}
+
+void updateChksum(LoopStat& stat, LoopLength ilength,
+                  const LoopData::ComplexArray& ca,
+                  Real_type scale_factor)
+{
+   long double running = stat.loop_chksum[ilength];   // long double total
+   Complex_ptr values = ca.data;
+   const Index_type count = ca.len;
+   for (Index_type n = 0; n < count; ++n) {
+      running += (n+1)*(real(values[n])+imag(values[n]))*scale_factor;
+   }
+   stat.loop_chksum[ilength] = running;
+}
+
+}  // closing brace for unnamed namespace
+
+
+
+// Recursively construct directories for given path name (owner-only rwx
+// permissions).  Returns true if every missing directory was created or
+// the whole path already exists as a directory; returns false otherwise.
+bool recursiveMkdir(const std::string& path)
+{
+   bool retval = true;
+
+   mode_t mode = (S_IRUSR | S_IWUSR | S_IXUSR);
+   const char separator = '/';
+
+   int length = static_cast<int>(path.length());
+   char* path_buf = new char[length + 1];   // scratch copy, cut in place
+   sprintf(path_buf, "%s", path.c_str());
+   struct stat status;
+   int pos = length - 1;
+
+   /* find part of path that has not yet been created */
+   while ((stat(path_buf, &status) != 0) && (pos >= 0)) {
+      /* slide backwards in string until next slash found */
+      bool slash_found = false;
+      while ((!slash_found) && (pos >= 0)) {
+         if (path_buf[pos] == separator) {
+            slash_found = true;
+            /* keep a leading slash: cutting there would leave "" */
+            if (pos > 0) path_buf[pos] = '\0'; else pos = -1;
+         } else pos--;
+      }
+   }
+
+   /*
+    * if there is a part of the path that already exists make sure
+    * it is really a directory
+    */
+   if (pos >= 0) {
+      if (!S_ISDIR(status.st_mode)) {
+         std::cout << "Cannot create directories in path = " << path
+                   << "\n    because some intermediate item in path exists and"
+                   << " is NOT a directory" << std::endl;
+         retval = false;
+      }
+   }
+
+   /*
+    * make all directories that do not already exist
+    *
+    * if (pos < 0), then there is no part of the path that
+    * already exists.  Need to make the first part of the
+    * path before sliding along path_buf.
+    */
+   if ( retval && pos < 0) {
+      if (mkdir(path_buf, mode) != 0) {
+         std::cout << "   Cannot create directory  = "
+                   << path_buf << std::endl;
+         retval = false;
+      }
+      pos = 0;
+   }
+
+   if ( retval ) {
+
+      /* make remaining directories */
+      do {
+
+         /* slide forward in string until next '\0' found */
+         bool null_found = false;
+         while ((!null_found) && (pos < length)) {
+            if (path_buf[pos] == '\0') {
+               null_found = true;
+               path_buf[pos] = separator;   /* re-attach next component */
+            }
+            pos++;
+         }
+
+         /* make directory if not at end of path */
+         if (pos < length) {
+            if (mkdir(path_buf, mode) != 0) {
+              std::cout << "   Cannot create directory  = "
+                        << path_buf << std::endl;
+              retval = false;
+            }
+         }
+      } while (pos < length && retval);
+
+   }
+
+   delete[] path_buf;
+
+   return retval;
+}
Index: MicroBenchmarks/LCALS/LCALSTraversalMethods.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSTraversalMethods.hxx
@@ -0,0 +1,455 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file containing LCALS traversal method templates used with
+// "forall" loop variants.
+//
+// Tag structs for traversal types are located in LCALSParams.hxx. 
+//
+
+#ifndef LCALSTraversalMethods_HXX
+#define LCALSTraversalMethods_HXX
+
+#include "LCALSParams.hxx"
+
+#include <vector>
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Traverse contiguous range of indices using sequential execution.
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(seq_exec, Index_type begin, Index_type end,
+            LOOP_BODY loop_body)
+{
+#pragma novector
+   for (Index_type idx = begin; idx < end; ++idx) {
+      loop_body(idx);   // one call per index, in increasing order
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(seq_exec, Index_type begin, Index_type end,
+            Index_type stride,
+            LOOP_BODY loop_body)
+{
+#pragma novector
+   for (Index_type idx = begin; idx < end; idx += stride) {
+      loop_body(idx);   // visits begin, begin+stride, ... below end
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Traverse contiguous range of indices using SIMD vectorization.
+ *         No assumption made on data alignment.
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(simd_exec, Index_type begin, Index_type end,
+            LOOP_BODY loop_body)
+{
+   for (Index_type idx = begin; idx < end; ++idx) {
+      loop_body(idx);   // plain loop, no pragma attached
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(simd_exec, Index_type begin, Index_type end,
+            Index_type stride,
+            LOOP_BODY loop_body)
+{
+   for (Index_type idx = begin; idx < end; idx += stride) {
+      loop_body(idx);   // plain strided loop, no pragma attached
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Traverse contiguous range of indices using OpenMP parallel for.
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_parallel_for_exec, Index_type begin, Index_type end,
+            LOOP_BODY loop_body)
+{
+//#pragma omp parallel for schedule(static)
+#pragma omp parallel for
+   for (Index_type idx = begin; idx < end; ++idx) {
+      loop_body(idx);   // iterations are divided among the OpenMP team
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_parallel_for_exec, Index_type begin, Index_type end,
+            Index_type stride,
+            LOOP_BODY loop_body)
+{
+//#pragma omp parallel for schedule(static)
+#pragma omp parallel for
+   for (Index_type idx = begin; idx < end; idx += stride) {
+      loop_body(idx);   // strided iterations divided among the team
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Traverse contiguous range of indices using OpenMP for with
+ *         nowait clause.
+ *
+ ******************************************************************************
+ */
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_for_nowait_exec, Index_type begin, Index_type end,
+            LOOP_BODY loop_body)
+{
+//#pragma omp for nowait schedule(static)
+#pragma omp for nowait
+   for (Index_type idx = begin; idx < end; ++idx) {
+      loop_body(idx);   // no barrier follows the loop (nowait)
+   }
+}
+
+/// with stride
+template <typename LOOP_BODY>
+LCALS_INLINE
+void forall(omp_for_nowait_exec, Index_type begin, Index_type end,
+            Index_type stride,
+            LOOP_BODY loop_body)
+{
+//#pragma omp for nowait schedule(static)
+#pragma omp for nowait
+   for (Index_type idx = begin; idx < end; idx += stride) {
+      loop_body(idx);   // no barrier follows the loop (nowait)
+   }
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Class representing a contiguous range of indices.
+ *
+ *         Range is specified by begin and end values.
+ *         Traversal executes as:
+ *            for (i = m_begin; i < m_end; ++i) {
+ *               expression using i as array index.
+ *            }
+ *
+ ******************************************************************************
+ */
+class RangeIndexSet
+{
+public:
+
+   /// Build the range of indices i with begin <= i < end.
+   RangeIndexSet(Index_type begin, Index_type end)
+     : m_begin(begin), m_end(end) {}
+   /// First index of the range.
+   Index_type getBegin() const { return m_begin; }
+   /// Upper bound of the range (not itself traversed).
+   Index_type getEnd() const { return m_end; }
+   /// Number of indices: end minus begin.
+   Index_type getLength() const { return m_end - m_begin; }
+   /// Print routine (declaration only).
+   void print(std::ostream& os) const;
+
+private:
+   // The default ctor is declared but not implemented.
+   RangeIndexSet();
+
+   Index_type m_begin;   // inclusive lower bound
+   Index_type m_end;     // exclusive upper bound
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Class representing a contiguous range of indices with stride.
+ *
+ *         Range is specified by begin and end values.
+ *         Traversal executes as:
+ *            for (i = m_begin; i < m_end; i = i + m_stride) {
+ *               expression using i as array index.
+ *            }
+ *
+ ******************************************************************************
+ */
+class RangeStrideIndexSet
+{
+public:
+
+   /// Build the range begin, begin+stride, ... of indices below end.
+   RangeStrideIndexSet(Index_type begin, Index_type end, Index_type stride)
+     : m_begin(begin), m_end(end), m_stride(stride) {}
+   /// First index of the range.
+   Index_type getBegin() const { return m_begin; }
+   /// Upper bound of the range (not itself traversed).
+   Index_type getEnd() const { return m_end; }
+   /// Step between consecutive indices.
+   Index_type getStride() const { return m_stride; }
+   /// End minus begin; the stride is not taken into account.
+   Index_type getLength() const { return m_end - m_begin; }
+   void print(std::ostream& os) const;
+
+private:
+   // The default ctor is declared but not implemented.
+   RangeStrideIndexSet();
+
+   Index_type m_begin;    // inclusive lower bound
+   Index_type m_end;      // exclusive upper bound
+   Index_type m_stride;   // increment applied after each index
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Traversal methods for index set objects passed as arguments.
+ *
+ ******************************************************************************
+ */
+/// RangeIndexSet object
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(EXEC_T exec, const RangeIndexSet& is, LOOP_BODY loop_body)
+{
+   const Index_type first = is.getBegin();
+   const Index_type last = is.getEnd();
+   forall( exec, first, last, loop_body );   // begin/end overload
+}
+
+/// RangeStrideIndexSet object
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(EXEC_T exec, const RangeStrideIndexSet& is, LOOP_BODY loop_body)
+{
+   const Index_type first = is.getBegin();
+   const Index_type last = is.getEnd();
+   forall( exec, first, last, is.getStride(), loop_body );   // strided
+}
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Class representing a hybrid index set which is a collection
+ *         of index set objects defined above.  Within a hybrid, the
+ *         individual index sets are referred to as segments.
+ *
+ *  NOTE: This class is an abbreviated version of the actual RAJA class.
+ *
+ ******************************************************************************
+ */
+class HybridIndexSet
+{
+public:
+
+   ///
+   /// Enum describing types of segments in hybrid index set.
+   ///
+   enum SegmentType { _Range_, _RangeStride_, _Unknown_ };
+
+   ///
+   /// Class holding a type-erased pointer to a segment together with the
+   /// tag naming the concrete index set class that pointer refers to.
+   ///
+   class Segment
+   {
+   public:
+      Segment()
+         : m_type(_Unknown_), m_segment(0) { ; }
+
+      Segment(SegmentType type,  const void* segment)
+         : m_type(type), m_segment(segment) { ; }
+
+      SegmentType m_type;
+      const void* m_segment;
+   };
+
+   ///
+   /// Construct empty hybrid index set
+   ///
+   HybridIndexSet()
+   : m_len(0) { ; }
+
+   //
+   // Copy-constructor for hybrid index set (deep copy of all segments)
+   //
+   HybridIndexSet(const HybridIndexSet& other)
+   : m_len(0)
+   {
+      copySegments(other);
+   }
+
+   // Copy-assignment for hybrid index set, done as copy-and-swap: the
+   // segments previously held by this object end up in the temporary and
+   // are released by its destructor instead of lingering next to the copies.
+   HybridIndexSet& operator=(const HybridIndexSet& rhs)
+   {
+      if (this != &rhs) {
+         HybridIndexSet tmp(rhs);
+         m_segments.swap(tmp.m_segments);
+         m_len = tmp.m_len;
+      }
+      return *this;
+   }
+
+   ///
+   /// Hybrid index set destructor destroys all index set segments.
+   ///
+   ~HybridIndexSet();
+
+   ///
+   /// Create copy of given RangeIndexSet and add to hybrid index set.
+   ///
+   void addIndexSet(const RangeIndexSet& index_set);
+
+   ///
+   /// Add contiguous range of indices to hybrid index set as a RangeIndexSet.
+   ///
+   void addRangeIndices(Index_type begin, Index_type end);
+
+   ///
+   /// Create copy of given RangeStrideIndexSet and add to hybrid index set.
+   ///
+   void addIndexSet(const RangeStrideIndexSet& index_set);
+
+   ///
+   /// Add contiguous range of indices with stride to hybrid index set
+   /// as a RangeStrideIndexSet.
+   ///
+   void addRangeStrideIndices(Index_type begin, Index_type end, Index_type stride);
+
+   ///
+   /// Return total length of hybrid index set; i.e., sum of lengths
+   /// over all segments.
+   ///
+   Index_type getLength() const { return m_len; }
+
+   ///
+   /// Return total number of segments in hybrid index set.
+   ///
+   int getNumSegments() const { return static_cast<int>(m_segments.size()); }
+
+   /// Return pointer to the array of getNumSegments() segments; a null
+   /// pointer is returned when the hybrid index set holds no segments.
+   ///
+   const Segment* getSegments() const { return m_segments.empty() ? 0 : &m_segments[0]; }
+
+private:
+   //
+   // Copy segments (deep copy) from given HybridIndexSet object.
+   //
+   void copySegments(const HybridIndexSet& other);
+
+   Index_type  m_len;
+   std::vector<Segment> m_segments;
+};
+
+
+/*!
+ ******************************************************************************
+ *
+ * \brief  Iterate over segments sequentially, and use exec policy
+ *         specified by template parameter for individual segments.
+ *
+ ******************************************************************************
+ */
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(EXEC_T exec,
+            const HybridIndexSet& is, LOOP_BODY loop_body)
+{
+   const int num_seg = is.getNumSegments();
+   const HybridIndexSet::Segment* seg = is.getSegments();
+   for ( int isi = 0; isi < num_seg; ++isi ) {
+      // m_segment is type-erased; m_type selects the class to cast it back to.
+      switch ( seg[isi].m_type ) {
+
+         case HybridIndexSet::_Range_ : {
+            forall(exec,
+               *(static_cast<const RangeIndexSet*>(seg[isi].m_segment)),
+               loop_body
+            );
+            break;
+         }
+
+         case HybridIndexSet::_RangeStride_ : {
+            forall(exec,
+               *(static_cast<const RangeStrideIndexSet*>(seg[isi].m_segment)),
+               loop_body
+            );
+            break;
+         }
+         // Segments of any other type are skipped; the loop body is not run.
+         default : {
+         }
+
+      }  // switch on segment type
+
+   } // iterate over segments of hybrid index set
+}
+
+
+
+/*!
+ ******************************************************************************
+ * \brief Generic methods with exec policy specified by template
+ *        parameter.  EXEC_T does not appear in the argument list, so it
+ *        cannot be deduced and must be named by the caller; a
+ *        default-constructed EXEC_T is passed on as the first argument.
+ ******************************************************************************
+ */
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(Index_type begin, Index_type end, LOOP_BODY loop_body)
+{
+   forall( EXEC_T(), begin, end, loop_body );
+}
+
+/// with stride: passes a default-constructed EXEC_T plus begin, end, stride and the loop body to the strided forall
+template <typename EXEC_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(Index_type begin, Index_type end, Index_type stride,
+            LOOP_BODY loop_body)
+{
+   forall( EXEC_T(), begin, end, stride, loop_body );
+}
+
+/// passing index set object: passes a default-constructed EXEC_T plus the index set and the loop body to the matching forall
+template <typename EXEC_T,
+          typename INDEXSET_T, typename LOOP_BODY>
+LCALS_INLINE
+void forall(const INDEXSET_T& is, LOOP_BODY loop_body)
+{
+   forall(EXEC_T(), is, loop_body);
+}
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/LCALSTraversalMethods.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/LCALSTraversalMethods.cxx
@@ -0,0 +1,137 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS traversal method implementations 
+// used in forall-hybrid loop variants.
+//
+
+#include "LCALSTraversalMethods.hxx"
+
+#include <iostream>
+
+
+/*
+*************************************************************************
+*
+* HybridIndexSet class dtor.
+* Releases every segment object held by the hybrid index set according to
+* its segment type tag.
+*
+*************************************************************************
+*/
+HybridIndexSet::~HybridIndexSet()
+{
+   //
+   // Segments are stored type-erased (const void*), so every pointer is
+   // cast back to the class named by its type tag before being released.
+   //
+   typedef std::vector<Segment>::iterator SegmentIter;
+
+   for ( SegmentIter it = m_segments.begin(); it != m_segments.end(); ++it ) {
+
+      const void* const raw = it->m_segment;
+
+      switch ( it->m_type ) {
+
+         case _Range_ : {
+            // delete accepts a pointer-to-const and ignores a null pointer,
+            // so neither a const_cast nor a null check is needed.
+            delete static_cast<const RangeIndexSet*>(raw);
+            break;
+         }
+
+         case _RangeStride_ : {
+            delete static_cast<const RangeStrideIndexSet*>(raw);
+            break;
+         }
+
+         default : {
+            // Unrecognized tag: nothing is freed, a diagnostic is printed.
+            std::cout << "\t HybridIndexSet dtor: case not implemented!!\n";
+         }
+
+      }  // switch on segment type
+
+   }  // iterate over segments of hybrid index set
+}
+
+
+/*
+*************************************************************************
+*
+* Private helper function to copy hybrid index set segments.
+* Appends a deep copy of every segment of 'other' to this object; each
+* addIndexSet() call allocates the copy and adds its length to m_len.
+* 'other' must not be this object: segments are appended while iterating.
+*
+*************************************************************************
+*/
+void HybridIndexSet::copySegments(const HybridIndexSet& other)
+{
+   const int num_segs = static_cast<int>(other.m_segments.size());
+   for ( int isi = 0; isi < num_segs; ++isi ) {
+      const Segment& seg = other.m_segments[isi];
+      switch ( seg.m_type ) {
+         case _Range_ : {
+            addIndexSet(*static_cast<const RangeIndexSet*>(seg.m_segment));
+            break;
+         }
+
+         case _RangeStride_ : {
+            addIndexSet(*static_cast<const RangeStrideIndexSet*>(seg.m_segment));
+            break;
+         }
+
+         default : {
+            std::cout << "\t HybridIndexSet copySegments: case not implemented!!\n";
+         }
+      }  // switch on segment type
+   }  // iterate over segments of hybrid index set
+}
+
+
+/*
+*************************************************************************
+*
+* Methods to add indices to hybrid index set.
+*
+*************************************************************************
+*/
+
+void HybridIndexSet::addIndexSet(const RangeIndexSet& index_set)
+{
+   // Keep a private heap copy of the caller's range as a new segment.
+   const RangeIndexSet* const range_copy =
+      new RangeIndexSet(index_set.getBegin(), index_set.getEnd());
+   m_segments.push_back( Segment(_Range_, range_copy) );
+   m_len = m_len + range_copy->getLength();
+}
+
+void HybridIndexSet::addRangeIndices(Index_type begin, Index_type end)
+{
+   // Build the segment on the heap, record it, then grow the total length.
+   const RangeIndexSet* const range_seg = new RangeIndexSet(begin, end);
+   m_segments.push_back( Segment(_Range_, range_seg) );
+   m_len = m_len + range_seg->getLength();
+}
+
+void HybridIndexSet::addIndexSet(const RangeStrideIndexSet& index_set)
+{
+   const Index_type stride = index_set.getStride();
+   const RangeStrideIndexSet* const strided_copy =
+      new RangeStrideIndexSet(index_set.getBegin(), index_set.getEnd(), stride);
+   m_segments.push_back( Segment(_RangeStride_, strided_copy) );
+   // NOTE(review): truncating division undercounts when (end-begin) % stride != 0 -- confirm intended.
+   m_len = m_len + strided_copy->getLength() / stride;
+}
+
+void HybridIndexSet::addRangeStrideIndices(Index_type begin, Index_type end,
+                                           Index_type stride)
+{
+   const RangeStrideIndexSet* const strided_seg = new RangeStrideIndexSet(begin, end, stride);
+   m_segments.push_back( Segment(_RangeStride_, strided_seg) );
+   // NOTE(review): truncating division undercounts when (end-begin) % stride != 0 -- confirm intended.
+   m_len = m_len + strided_seg->getLength() / stride;
+}
Index: MicroBenchmarks/LCALS/README-LCALS_instructions.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/README-LCALS_instructions.txt
@@ -0,0 +1,312 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+================================================================================
+================================================================================
+LCALS: Livermore Compiler Analysis Loop Suite
+ by Rich Hornung (hornung1@llnl.gov), 
+    Center for Applied Scientific Computing,
+    Lawrence Livermore National Laboratory 
+================================================================================
+================================================================================
+
+ o This code is under continuing development.  Go to http://codesign.llnl.gov
+   to acquire the latest released version.
+
+ o This loop suite is designed to measure performance for a variety of loops
+   using different compilers and platforms.  In particular, the suite 
+   helps to understand compiler optimization, run-time performance issues,
+   and platform capabilities.  The suite is also useful as a source of
+   example code snippets for interactions with compiler developers.
+
+ o The loops in the suite are partitioned into three subsets based on their 
+   origins (and also to avoid having them all in a single source file).  Each 
+   loop is implemented using multiple software constructs (i.e., referred 
+   to herein as "variants").  The three loop subsets are:
+
+   -  Subset A: Loops representative of those found in application codes.
+      They are implemented in source files named runA<variant>Loops.cxx.  
+
+   -  Subset B: Basic loops that help to illustrate compiler optimization 
+      issues. They are implemented in source files named runB<variant>Loops.cxx
+
+   -  Subset C: Loops extracted from "Livermore Loops coded in C" developed by 
+      Steve Langer, which were derived from the Fortran version by Frank 
+      McMahon.  They are implemented in source files runC<variant>Loops.cxx 
+
+   Please see the contents of the loop source files to understand the 
+   differences among the variants.
+
+ o New loops may be added to the suite by inserting them into appropriate 
+   loop source files and modifying a few other files that control suite 
+   execution and parametrization.  Details are provided below.
+
+ o Various parameters can be adjusted to control how loops are defined and run.
+ 
+   -- Each loop may be run with different loop lengths (currently up to three
+      lengths for each loop) and will be sampled some number of times to 
+      generate execution timing data.  Loop length and sampling parameters 
+      may be modified to evaluate different platform performance 
+      characteristics.  Details are provided below.
+
+ o Various run time statistics can be generated for analysis.  Currently,
+   these include: min run time, max run time, average run time, 
+   standard deviation across run times, and average execution time relative 
+   to a reference loop variant.  Here, run time is the time required to 
+   execute the loop for one "sampling" pass through the suite.  See below.
+
+
+--------------------------------------------------------------------------------
+Loop kernels and variants:
+
+ o Each loop in the suite is defined by its traditional C/C++ for-loop 
+   "kernel".  Then, each loop appears in multiple variants that use different
+   programming and execution constructs.
+
+ o Loops that employ traditional C/C++ for-loop syntax are referred to as
+   "Raw" variants.  The "Raw" variant of each loop represents the version
+   obtained from its original source, plus minor modifications necessary
+   to plug into the loop suite framework.  For example, the loops in the
+   runCRawLoops.cxx file are essentially verbatim from the "Livermore Loops
+   Coded in C" suite mentioned above.  Typically, the "Raw" loops serve as
+   the reference implementation for runtime comparisons.
+
+ o Other variants use loop traversal C++ template methods and represent the 
+   loop body as a lambda function or functor class.  One of the main goals 
+   of the suite is to assess how SIMD vectorization, OpenMP multithreading, 
+   etc. work with these different loop implementation choices.
+
+   Note that only a subset of the loops in the suite appear in the OpenMP
+   variants since many of the loops do not benefit from thread parallelism
+   due to OpenMP overheads.  OpenMP loops are implemented in source files
+   named runOMP<variant>Loops.cxx; in particular, they are not broken out
+   into separate source files based on the subsets described above.
+
+ o Although all loop bodies contain only C-syntax, the loop framework 
+   uses C++ classes and templates. So a C++ compiler is required to compile 
+   the code.  All C++ compilers should be able to compile the framework
+   code and "Raw" loop variants.  
+
+ o Not all compilers implement the OpenMP standard. Thus, those loop variants
+   may not be compiled and run depending on the compiler being used.
+
+ o The intent of the C++ lambda and functor loop variants is to evaluate 
+   compilers in the context of C++ abstraction layers using template methods. 
+   Not all compilers support standard C++ lambda expressions at this time.  
+   Thus, the lambda variants of the loops may not be compiled and run 
+   depending on the compiler being used.
+
+
+******************** Test Suite Note ***********************
+*                                                          *
+*     Below is the original build instructions, the        *
+*     test suite replaces this build system with the       *
+*     llvm test-suite CMake system.  The control of        *
+*     loop suite and timing has been altered to use        *
+*     the google benchmark library included in the         *
+*     MicroBenchmarks directory of the llvm test-suite.    *
+*                                                          *
+************************************************************
+--------------------------------------------------------------------------------
+Compiling and running the loop suite:
+
+ The loop suite is typically compiled by typing 'make' and then executed as
+
+    ./lcals.exe <optional output directory>
+
+ o The executable generated by the Makefile accepts an optional argument
+   which is the name of a directory for placing output files that contain 
+   detailed timing, checksum, and FOM (when specified) results.  Some of 
+   these files provide a summary of loop suite performance.  Others
+   contain subsets of this information in comma-delimited text files that may
+   be imported into Microsoft Excel to generate spreadsheets and plots.  
+   When no output directory is given, a summary of the results is printed 
+   to standard output.  
+
+ o LCALS is highly parametrized to explore many compilation and execution
+   options. Exercising the full range of options can be achieved by making
+   straightforward modifications in a few files, as described below:
+
+   -- Makefile: This file contains a simple build system for the code. 
+                It has a variety of configurations for current LLNL 
+                computing systems. Building for other platforms or changing
+                any compiler options can be done by modifying this file.
+
+   -- LCALS_rules.mk: This file contains "-D" compilation options that 
+                control some aspects of LCALS parametrization. The effect of
+                these options is described in the comments in this file.
+                It is also helpful to see how they are used in the 
+                LCALSParams.hxx file.
+
+   -- main.cxx: The main program determines many of the LCALS execution
+                options, such as which loops are run (kernels and variants).
+
+   -- LCALSSuite.cxx: The routine defineLoopSuiteRunInfo() in this file 
+                defines loop lengths and sampling parameters for each loop
+                in the suite.  It also defines loop weights used in Figure
+                of Merit (FOM) calculations. 
+
+   -- LCALSSuite.hxx: This file contains '#define' preprocessor directives
+                that can be used to turn on/off compilation of individual
+                loop kernels and loop variants in the suite. This can be 
+                helpful for generating assembly code in small doses.
+
+ o Details on many of these items are given in the next section.
+                
+
+--------------------------------------------------------------------------------
+Controlling loop suite execution and timing output:
+
+ o The execution of the loop suite follows the pattern described here:
+
+   Iterate over specified number of passes through the loop suite {
+
+      Iterate over specified loop variants to run {
+
+         Iterate over loop lengths to run (e.g., long, medium, short) {
+
+            Iterate over each loop specified to run {
+
+               TIMER_START()
+               Iterate over specified number of samples (for loop and length) {
+
+                  Execute loop variant and length.
+ 
+               }
+               TIMER_STOP()
+
+            }  // end iteration over loops to run
+
+         } // end iteration over loop lengths
+
+      } // end iteration over loop variants
+
+   } // end iteration over suite passes
+
+ o The loop suite is parametrized so that its execution may be controlled 
+   by editing various items in a small number of source and header files
+   as described below:
+
+   -- Set number of passes through the suite by setting the variable
+      'num_suite_passes' in main.cxx.
+
+   -- Set loop variants to run by adding the corresponding enumeration
+      constants to the vector 'run_variants' in main.cxx.  To prevent a 
+      variant from running, simply comment out the line which adds the 
+      corresponding enum value to the vector.
+
+      NOTE: The first entry added to this array indicates the reference variant
+            for relative execution time statistics.
+
+      NOTE: An additional argument may be given to the executable to run
+            loops outside of the standard LCALS benchmark.  This requires
+            that "BUILD_MISC" is defined in the Makefile.
+
+   -- Set which loop lengths to run by setting the appropriate entry in
+      the array 'run_loop_length' in main.cxx (true/false for each length).
+
+   -- Set which loop kernels will run by setting entries in the array
+      'run_loop' in main.cxx (true/false for each loop).
+
+   -- The lengths and number of samples per pass for each loop are set 
+      in the routine defineLoopSuiteRunInfo() in LCALSSuite.cxx.
+
+      NOTE: The "samples per pass" values for each loop were determined 
+      manually to give approximately 1 second of execution time for its
+      serial raw variant on an Intel ES-2670 node. To reduce or increase the 
+      total suite execution time, or change the loop lengths used, change 
+      the 'sample_frac' and/or 'loop_length_factor' variables in 
+      main.cxx.  All default loop lengths will be multiplied by the 
+      loop_length_factor value.  The sample count for each loop will be 
+      multiplied by sample_frac/loop_length_factor.
+
+   -- The "LoopKernelID" and "LoopLength" enumeration types in the file 
+      LCALSSuite.hxx are used to identify loops and loop lengths
+      in the suite.  Macros are also provided in that file to conditionally
+      compile each loop in the suite.
+
+      The way in which the loops are compiled can influence execution times.
+      For example, some compilers perform optimizations for loops compiled
+      individually that they do not perform when the same loop is compiled as
+      part of a larger suite.
+
+ o All loop forms use the same data arrays, which are pre-allocated based 
+   on the loop lengths.  To help with SIMD vectorization and ensure correctness,
+   data arrays are allocated to be aligned with SIMD vector unit boundaries.
+   This can be changed by setting the 'LCALS_DATA_ALIGN' constant in the
+   file LCALSParams.hxx.
+
+ o To minimize the effects of execution of each loop on the others, 
+   data caches are flushed before each loop is run. 
+
+   -- Data cache size is set for some LLNL platforms based on hostname.
+      If unknown, a warning message will appear when loop suite is run.
+      Please edit main.cxx to set the largest data cache size for other 
+      platforms.
+
+ o A simple checksum mechanism is provided to verify that different variants 
+   of each loop, and implementation changes made to individual loops, generate
+   the same numerical results.  "-D" compiler options are provided in the
+   LCALS_rules.mk file to control this behavior.  Note that certain levels 
+   and types of compiler optimization will cause slight differences in 
+   checksums due to changes in operation order, for example.  Thus, the 
+   checksums may only be a qualitative indicator of correct execution.
+
+   -- Note that the routines loopInit() and loopFinalize() in LCALSSuite.cxx
+      initialize data and compute result checksums for each loop.  These
+      must remain consistent with the data used in each loop for correctness.
+
+
+ o There are two mechanisms available to generate execution timing data for
+   loops in the suite.  The choice is made by defining/undefining the 
+   associated "-D" option in the LCALS_rules.mk file.  See that file for 
+   more information.
+
+
+--------------------------------------------------------------------------------
+Figures of Merit:
+
+ o The program output includes a Figure of Merit (FOM) value for each loop 
+   variant and loop length that is executed.  The intent of the FOM is to 
+   complement execution timing data with another measure of performance and 
+   compiler optimization.  Using the FOM values and total loop suite execution 
+   time information in the Figure of Merit report, one can compare different 
+   compilers' abilities to optimize on a given platform, performance of 
+   different optimization levels for a given compiler, or potential performance 
+   of different architectures, etc. 
+
+ o In the FOM calculation, execution time for each loop is weighted by a 
+   factor defined in the loop setup routines.  The loops are partitioned into 
+   six classes depending on their structure; e.g., data-parallel, order-
+   dependent, etc.  The weight for each loop class indicates its relative 
+   importance based on code constructs we want the suite to emphasize 
+   and how easy we think it should be for a compiler to optimize.  Each loop 
+   in the suite is given a weight, w_i (i is the loop id), based on which 
+   class it exists in.  Loop classes and weights are defined in the file 
+   LCALSSuite.cxx.
+
+ o The FOM is calculated as follows.
+
+   - Relative FOM (FOM_rel).  The aim of the FOM_rel value is to measure 
+     a compiler's ability to optimize different loop constructs.
+
+     -- When the code is executed, a reference loop execution time, t_ref, is
+        computed using a loop that any compiler should be able to optimize 
+        well and which should run faster than any loop in the suite.  
+        To help ensure this, two simple loops are run, an element-wise vector
+        product and a vector dot product. Then, t_ref is the minimum execution 
+        time between the two.
+
+     -- After the suite is run, FOM_rel is calculated as:
+
+        FOM_rel = W * t_ref / Sum_i [ w_i * t_i ]
+
+        The denominator is a weighted sum of execution times for the loops
+        that were run; t_i is the run time for loop i.  W = Sum_i ( w_i ) is
+        the sum of loop weights.
+
+     -- Note that FOM_rel is a dimensionless quantity that satisfies 
+        0 <= FOM_rel <= 1, and FOM_rel increases as loop execution times 
+        decrease.  In the ideal case, where each loop executes as fast as the 
+        reference loop (which should be impossible), t_i = t_ref for each i.
+        So FOM_rel = 1.
Index: MicroBenchmarks/LCALS/README-LCALS_license.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/README-LCALS_license.txt
@@ -0,0 +1,170 @@
+*******************************************************************************
+LCALS: Livermore Compiler Analysis Loop Suite, version 1.0
+ by Rich Hornung, Center for Applied Scientific Computing,
+                  Lawrence Livermore National Laboratory
+
+Unclassified/Unlimited Distribution
+LLNL-CODE-638939  
+OCEC-13-189
+
+** NOTE: This code was originally released under the name LLoops21.
+         The content is essentially unchanged under the new name.
+
+*******************************************************************************
+
+This code was developed and is maintained by Lawrence Livermore
+National Laboratory (LLNL). It is intended to be shared widely with the 
+HPC community (including other laboratories, universities, and industrial 
+partners) as part of ASC and DOE exascale co-design efforts.
+
+o The software is unrestricted in its distribution.
+
+o LLNL retains copyright (see Copyright statement below)
+
+o If the code and/or results generated from it are used in a publication,
+  please cite LCALS as follows:
+
+  @misc{LCALScode,
+     author = {Richard D. Hornung},
+     title = {{LCALS}, version 1.0},
+     howpublished = {\texttt{https://codesign.llnl.gov/LCALS.php}},
+     note = {{LLNL}-{CODE}-638939},
+     year = {2013}
+  }
+
+o Please direct improvements, additions, comments, suggestions, etc. to 
+  proxyapp-info@llnl.gov or hornung1@llnl.gov
+
+o  This README-LCALS_license.txt file must be included in any redistribution of
+   the software (either partial or in its entiretly) as well as any of its 
+   derivatives.
+
+*******************************************************************************
+*******************************************************************************
+
+This work was produced at Lawrence Livermore National Laboratory (LLNL) under
+contract no. DE-AC52-07NA27344 (Contract 44) between the U.S. Department of
+Energy (DOE) and Lawrence Livermore National Security, LLC (LLNS) for the
+operation of LLNL. Copyright is reserved to Lawrence Livermore National
+Security, LLC for purposes of controlled dissemination, commercialization
+through formal licensing, or other disposition under terms of Contract 44; DOE
+policies, regulations and orders; and U.S. statutes. The rights of the Federal
+Government are reserved under Contract 44.
+
+*******************************************************************************
+*******************************************************************************
+
+                                   DISCLAIMER
+
+This work was prepared as an account of work sponsored by an agency of the
+United States Government. Neither the United States Government nor Lawrence
+Livermore National Security, LLC nor any of their employees, makes any warranty,
+express or implied, or assumes any liability or responsibility for the accuracy,
+completeness, or usefulness of any information, apparatus, product, or process
+disclosed, or represents that its use would not infringe privately-owned rights.
+Reference herein to any specific commercial products, process, or service by
+trade name, trademark, manufacturer or otherwise does not necessarily constitute
+or imply its endorsement, recommendation, or favoring by the United States
+Government or Lawrence Livermore National Security, LLC. The views and opinions
+of authors expressed herein do not necessarily state or reflect those of the
+United States Government or Lawrence Livermore National Security, LLC, and shall
+not be used for advertising or product endorsement purposes.
+
+*******************************************************************************
+*******************************************************************************
+
+                           NOTIFICATION OF COMMERCIAL USE
+
+Commercialization of this product is prohibited without notifying the 
+Department of Energy (DOE) or Lawrence Livermore National Laboratory (LLNL).
+
+
+
+*******************************************************************************
+*******************************************************************************
+
+//
+// The following is the original copyright statement from Steve Langer's
+// Livermore Loops coded in C.
+//
+// NOTE: Fonzi's Law (mentioned below) is actually called
+//       Flon's Law (just Google it).
+//
+
+/* 
+ *********************************************************************** 
+ * 
+ * Livermore Loops coded in C        Latest File Modification  27 Jul 90
+ *
+ * NOTE NOTE NOTE: Modified for use in the pure ANSI C version
+ * of the LFK test program by Steven H. Langer.
+ * Changes include calling sequence from Fortran to C and
+ * minor changes in COMMON block arguments.
+ * Split into separate header and source code files for convenience
+ * in converting the main program to C.
+ * Feb. 14, 1995.
+ *
+ * Copyright (c) 1995.  The Regents of the University of California.
+ *                  All rights reserved.
+ *
+ *
+ *     SUBROUTINE KERNEL( TK)  replaces the Fortran routine in LFK Test program.
+ ************************************************************************
+ *                                                                      *
+ *          KERNEL     executes 24 samples of "C" numerical computation *
+ *                                                                      *
+ *                TK(1) - total cpu time to execute only the 24 kernels.*
+ *                TK(2) - total Flops executed by the 24 Kernels        *
+ *                                                                      *
+ *   Link this C module with the rest of LFK Test compiled with Fortran *
+ *   using a version of the LFK Test dated April 1990 or later.         *
+ ************************************************************************
+ *                                                                      *
+ *     L. L. N. L.   " C "   K E R N E L S  T E S T:   M F L O P S      *
+ *                                                                      *
+ *     These kernels measure   " C "   numerical computation            *
+ *     rates for  a  spectrum  of  cpu-limited computational            *
+ *     structures or benchmarks.   Mathematical  through-put            *
+ *     is measured  in  units  of millions of floating-point            *
+ *     operations executed per second, called Megaflops/sec.            *
+ *                                                                      *
+ *     Fonzi's Law: There is not now and there never will be a language *
+ *                  in which it is the least bit difficult to write     *
+ *                  bad programs.                                       *
+ *                                                                      *
+ *Originally from  Greg Astfalk, AT&T, P.O.Box 900, Princeton, NJ. 08540*
+ *by way of Frank McMahon, LLNL, PO Box 808, Livermore, CA, 94550. 1986 *
+ *                                                                      *
+ *    Changes made to correct many array subscripting problems,         *
+ *      make more readable (added #define's), include the original      *
+ *      FORTRAN versions of the runs as comments, and make more         *
+ *      portable by Kelly O'Hair (LLNL) and Chuck Rasbold (LLNL)        *
+ *           and by Mark Seager  (LLNL).                                *
+ *                                                                      *
+ *      please send copy of sdtout to:   MCMAHON3@LLNL.GOV              *
+ *                                 or:   mcmahon@lll-crg.llnl.gov       *
+ *                                                                      *
+ ************************************************************************
+ *                                                                      *
+ *                               REFERENCE                              *
+ *                                                                      *
+ *              F.H.McMahon,   The Livermore Fortran Kernels:           *
+ *              A Computer Test Of The Numerical Performance Range,     *
+ *              Lawrence Livermore National Laboratory,                 *
+ *              Livermore, California, UCRL-53745, December 1986.       *
+ *                                                                      *
+ *       from:  National Technical Information Service                  *
+ *              U.S. Department of Commerce                             *
+ *              5285 Port Royal Road                                    *
+ *              Springfield, VA.  22161                                 *
+ *                                                                      *
+ *                                                                      *
+ *                   (C) Copyright 1986 the Regents of the              *
+ *               University of California. All Rights Reserved.         *
+ *                                                                      *
+ *              This work was produced under the sponsorship of         *
+ *               the U.S. Department of Energy. The Government          *
+ *                      retains certain rights therein.                 *
+ *                                                                      *
+ ************************************************************************
+ */
Index: MicroBenchmarks/LCALS/README-LCALS_llvm-test-suite.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/README-LCALS_llvm-test-suite.txt
@@ -0,0 +1,21 @@
+##################### llvm test suite notes ########################
+
+The following changes were made to the source to add LCALS to the
+LLVM test suite using the Google Benchmark library.
+
+Reporting and the suite's built-in run control were disabled with macros
+so that the benchmark library controls execution and timing.
+Loop data initialization and cache flushing are retained, but
+the checksum information cannot be used in the test suite because it
+can show slight differences due to compiler optimizations.
+The "Raw" and "ForeachLambda" versions of the loops have been
+rewritten for use with the Google Benchmark library, while
+the files containing the other loop versions have not been included at
+this time.
+
+See the original README-LCALS_license.txt for copyright information.
+
+See the original README-LCALS_instructions.txt for information about
+the suite.
+
+####################################################################
Index: MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeLists.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(COPY lit.local.cfg DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) # copy the lit config into this directory's build tree
+# Compile options for the LCALS sources; the LCALS_* macros are the switches described in LCALSParams.hxx.
+list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG)
+llvm_test_run()
+llvm_test_executable(lcalsALambda ../main.cxx LambdaSubsetAbenchmarks.cxx  ../LCALSStats.cxx ../LCALSSuite.cxx  ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx) # subset A lambda kernels plus the shared suite sources
+target_link_libraries(lcalsALambda benchmark) # link against the benchmark library target
Index: MicroBenchmarks/LCALS/SubsetALambdaLoops/LambdaSubsetAbenchmarks.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetALambdaLoops/LambdaSubsetAbenchmarks.cxx
@@ -0,0 +1,467 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS "A" subset forall lambda loops using
+// the google benchmark library.
+//
+
+#include <benchmark/benchmark.h>
+#include "../LCALSSuite.hxx"
+#include "../SubsetDataA.hxx"
+#include "../LCALSTraversalMethods.hxx"
+
+static void BM_PRESSURE_CALC_LAMBDA(benchmark::State& state) {
+   // Google Benchmark driver for the PRESSURE_CALC kernel (forall + lambda form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PRESSURE_CALC);
+   // Working arrays: aliases of loop_data's 1D Real arrays.
+   Real_ptr compression = loop_data.array_1D_Real[0];
+   Real_ptr bvc = loop_data.array_1D_Real[1];
+   Real_ptr p_new = loop_data.array_1D_Real[2];
+   Real_ptr e_old = loop_data.array_1D_Real[3];
+   Real_ptr vnewc = loop_data.array_1D_Real[4];
+   // Scalar coefficient and cut-off values, also taken from loop_data.
+   const Real_type cls = loop_data.scalar_Real[0];
+   const Real_type p_cut = loop_data.scalar_Real[1];
+   const Real_type pmin = loop_data.scalar_Real[2];
+   const Real_type eosvmax = loop_data.scalar_Real[3];
+   // Each pass through this loop is one measured benchmark iteration.
+   for( auto _ : state) {
+      // Pass 1: bvc[i] = cls * (compression[i] + 1); forall is handed the bounds 0 and state.range(0).
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         bvc[i] = cls * (compression[i] + 1.0);
+      } );
+      // Pass 2: p_new[i] = bvc[i] * e_old[i], followed by three cut-offs applied in order.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         p_new[i] = bvc[i] * e_old[i] ;
+         // Magnitudes below p_cut are flushed to zero.
+         if ( fabs(p_new[i]) <  p_cut )  p_new[i] = 0.0 ;
+         // The result is also zeroed wherever vnewc has reached eosvmax.
+         if ( vnewc[i] >= eosvmax )  p_new[i] = 0.0 ;
+         // Lower clamp at pmin; it runs last, so it also applies to the zeros set above.
+         if ( p_new[i]  <  pmin )  p_new[i] = pmin ;
+      } );
+
+   }
+}
+// Registered for the size arguments 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_PRESSURE_CALC_LAMBDA)->Arg(171)->Arg(5001)->
+                                    Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_ENERGY_CALC_LAMBDA(benchmark::State& state) {
+   // Google Benchmark driver for the ENERGY_CALC kernel (forall + lambda form, six passes).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(ENERGY_CALC);
+   // Working arrays: aliases of loop_data's 1D Real arrays.
+   Real_ptr e_new = loop_data.array_1D_Real[0];
+   Real_ptr e_old = loop_data.array_1D_Real[1];
+   Real_ptr delvc = loop_data.array_1D_Real[2];
+   Real_ptr p_new = loop_data.array_1D_Real[3];
+   Real_ptr p_old = loop_data.array_1D_Real[4];
+   Real_ptr q_new = loop_data.array_1D_Real[5];
+   Real_ptr q_old = loop_data.array_1D_Real[6];
+   Real_ptr work = loop_data.array_1D_Real[7];
+   Real_ptr compHalfStep = loop_data.array_1D_Real[8];
+   Real_ptr pHalfStep = loop_data.array_1D_Real[9];
+   Real_ptr bvc = loop_data.array_1D_Real[10];
+   Real_ptr pbvc = loop_data.array_1D_Real[11];
+   Real_ptr ql_old = loop_data.array_1D_Real[12];
+   Real_ptr qq_old = loop_data.array_1D_Real[13];
+   Real_ptr vnewc = loop_data.array_1D_Real[14];
+   // Scalar divisor and cut-off values, also taken from loop_data.
+   const Real_type rho0 = loop_data.scalar_Real[0];
+   const Real_type e_cut = loop_data.scalar_Real[1];
+   const Real_type emin = loop_data.scalar_Real[2];
+   const Real_type q_cut = loop_data.scalar_Real[3];
+   // Each pass through this loop is one measured benchmark iteration.
+   for( auto _ : state) {
+      // Pass 1: first estimate of e_new from e_old, delvc, the old p/q values and work.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+          e_new[i] = e_old[i] - 0.5 * delvc[i] *
+          (p_old[i] + q_old[i]) + 0.5 * work[i];
+      } );
+      // Pass 2: q_new is zero where delvc > 0, otherwise built from the half-step quantities.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         if ( delvc[i] > 0.0 ) {
+            q_new[i] = 0.0 ;
+         }
+         else {
+            Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ;
+            Real_type ssc = ( pbvc[i] * e_new[i]
+               + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
+            // ssc at or below 0.1111111e-36 is replaced by a fixed value instead of taking sqrt.
+            if ( ssc <= 0.1111111e-36 ) {
+               ssc = 0.3333333e-18 ;
+            } else {
+               ssc = sqrt(ssc) ;
+            }
+
+            q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
+         }
+      } );
+      // Pass 3: update e_new using the old p/q values and the half-step pressure plus q_new.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         e_new[i] = e_new[i] + 0.5 * delvc[i]
+            * ( 3.0*(p_old[i] + q_old[i])
+               - 4.0*(pHalfStep[i] + q_new[i])) ;
+      } );
+      // Pass 4: add the second half of work, flush |e_new| < e_cut to zero, clamp below at emin.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         e_new[i] += 0.5 * work[i];
+
+         if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0  ; }
+
+         if ( e_new[i]  < emin ) { e_new[i] = emin ; }
+      } );
+      // Pass 5: compute a local q_tilde from vnewc/p_new, correct e_new with it, re-apply cut-offs.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         Real_type q_tilde ;
+
+         if (delvc[i] > 0.0) {
+            q_tilde = 0. ;
+         }
+         else {
+            Real_type ssc = ( pbvc[i] * e_new[i]
+               + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+            if ( ssc <= 0.1111111e-36 ) {
+               ssc = 0.3333333e-18 ;
+            } else {
+               ssc = sqrt(ssc) ;
+            }
+
+            q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
+         }
+
+         e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i])
+                                - 8.0*(pHalfStep[i] + q_new[i])
+                                + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ;
+
+         if ( fabs(e_new[i]) < e_cut ) {
+            e_new[i] = 0.0  ;
+         }
+         if ( e_new[i]  < emin ) {
+            e_new[i] = emin ;
+         }
+      } );
+      // Pass 6: where delvc <= 0, recompute q_new from the final e_new and flush |q_new| < q_cut.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         if ( delvc[i] <= 0.0 ) {
+            Real_type ssc = ( pbvc[i] * e_new[i]
+               + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+            if ( ssc <= 0.1111111e-36 ) {
+               ssc = 0.3333333e-18 ;
+            } else {
+               ssc = sqrt(ssc) ;
+            }
+
+            q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
+
+            if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ;
+         }
+      } );
+
+   }
+}
+// Registered for the size arguments 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_ENERGY_CALC_LAMBDA)->Arg(171)->Arg(5001)->
+                                  Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_VOL3D_CALC_LAMBDA(benchmark::State& state) {
+   // Google Benchmark driver for the VOL3D_CALC kernel (forall + lambda form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(VOL3D_CALC);
+   // Coordinate inputs x/y/z and the vol output: aliases of loop_data's 1D Real arrays.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+   Real_ptr vol = loop_data.array_1D_Real[3];
+   // Index domain built from the benchmark size argument, with ndims = 3.
+   ADomain domain(state.range(0), /* ndims = */ 3);
+
+   UnalignedReal_ptr x0,x1,x2,x3,x4,x5,x6,x7 ;
+   UnalignedReal_ptr y0,y1,y2,y3,y4,y5,y6,y7 ;
+   UnalignedReal_ptr z0,z1,z2,z3,z4,z5,z6,z7 ;
+   // NDPTRSET is a suite macro defined outside this file; presumably it points the eight numbered views at offsets of the base array - TODO confirm.
+   NDPTRSET(x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+   NDPTRSET(y,y0,y1,y2,y3,y4,y5,y6,y7) ;
+   NDPTRSET(z,z0,z1,z2,z3,z4,z5,z6,z7) ;
+
+   const Real_type vnormq = 0.083333333333333333; /* vnormq = 1/12 */
+   // Each pass through this loop is one measured benchmark iteration.
+   for (auto _ : state) {
+      // forall is handed the bounds domain.fpz and domain.lpz + 1.
+      forall<exec_policy>(domain.fpz, domain.lpz + 1,
+      [&] (Index_type i) {
+         // Coordinate differences between pairs of the numbered views, per axis.
+         Real_type x71 = x7[i] - x1[i] ;
+         Real_type x72 = x7[i] - x2[i] ;
+         Real_type x74 = x7[i] - x4[i] ;
+         Real_type x30 = x3[i] - x0[i] ;
+         Real_type x50 = x5[i] - x0[i] ;
+         Real_type x60 = x6[i] - x0[i] ;
+
+         Real_type y71 = y7[i] - y1[i] ;
+         Real_type y72 = y7[i] - y2[i] ;
+         Real_type y74 = y7[i] - y4[i] ;
+         Real_type y30 = y3[i] - y0[i] ;
+         Real_type y50 = y5[i] - y0[i] ;
+         Real_type y60 = y6[i] - y0[i] ;
+
+         Real_type z71 = z7[i] - z1[i] ;
+         Real_type z72 = z7[i] - z2[i] ;
+         Real_type z74 = z7[i] - z4[i] ;
+         Real_type z30 = z3[i] - z0[i] ;
+         Real_type z50 = z5[i] - z0[i] ;
+         Real_type z60 = z6[i] - z0[i] ;
+         // Term 1 of 3: (xps, yps, zps) dotted with the cross-product components (cyz, czx, cxy).
+         Real_type xps = x71 + x60 ;
+         Real_type yps = y71 + y60 ;
+         Real_type zps = z71 + z60 ;
+
+         Real_type cyz = y72 * z30 - z72 * y30 ;
+         Real_type czx = z72 * x30 - x72 * z30 ;
+         Real_type cxy = x72 * y30 - y72 * x30 ;
+         vol[i] = xps * cyz + yps * czx + zps * cxy ;
+         // Term 2 of 3, accumulated into vol[i].
+         xps = x72 + x50 ;
+         yps = y72 + y50 ;
+         zps = z72 + z50 ;
+
+         cyz = y74 * z60 - z74 * y60 ;
+         czx = z74 * x60 - x74 * z60 ;
+         cxy = x74 * y60 - y74 * x60 ;
+         vol[i] += xps * cyz + yps * czx + zps * cxy ;
+         // Term 3 of 3, accumulated into vol[i].
+         xps = x74 + x30 ;
+         yps = y74 + y30 ;
+         zps = z74 + z30 ;
+
+         cyz = y71 * z50 - z71 * y50 ;
+         czx = z71 * x50 - x71 * z50 ;
+         cxy = x71 * y50 - y71 * x50 ;
+         vol[i] += xps * cyz + yps * czx + zps * cxy ;
+         // Scale the accumulated sum by 1/12.
+         vol[i] *= vnormq ;
+      } );
+
+   }
+}
+// Registered for the SHORT, MEDIUM and LONG size arguments; times are reported in microseconds.
+BENCHMARK(BM_VOL3D_CALC_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)->
+                                 Arg(LONG)->Unit(benchmark::kMicrosecond);
+
+static void BM_DEL_DOT_VEC_2D_LAMBDA(benchmark::State& state) {
+   // Google Benchmark driver for the DEL_DOT_VEC_2D kernel (forall + lambda form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(DEL_DOT_VEC_2D);
+   // Inputs x, y, xdot, ydot and the div output: aliases of loop_data's 1D Real arrays.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr xdot = loop_data.array_1D_Real[2];
+   Real_ptr ydot = loop_data.array_1D_Real[3];
+   Real_ptr div = loop_data.array_1D_Real[4];
+   // Index domain built from the benchmark size argument, with ndims = 2.
+   ADomain domain(state.range(0), /* ndims = */ 2);
+
+   UnalignedReal_ptr x1,x2,x3,x4 ;
+   UnalignedReal_ptr y1,y2,y3,y4 ;
+   UnalignedReal_ptr fx1,fx2,fx3,fx4 ;
+   UnalignedReal_ptr fy1,fy2,fy3,fy4 ;
+   // NDSET2D is a suite macro defined outside this file; presumably it points the four numbered views at offsets of the base array - TODO confirm.
+   NDSET2D(x,x1,x2,x3,x4) ;
+   NDSET2D(y,y1,y2,y3,y4) ;
+   NDSET2D(xdot,fx1,fx2,fx3,fx4) ;
+   NDSET2D(ydot,fy1,fy2,fy3,fy4) ;
+
+   const Real_type ptiny = 1.0e-20;
+   const Real_type half  = 0.5;
+   // Each pass through this loop is one measured benchmark iteration.
+   for ( auto _ : state ) {
+      // forall is handed the bounds 0 and domain.n_real_zones; ii indexes domain.real_zones.
+      forall<exec_policy>(0, domain.n_real_zones,
+      [&] (Index_type ii) {
+         // Indirection: the array index i is looked up from the real_zones list.
+         Index_type i  = domain.real_zones[ii] ;
+
+         Real_type xi  = half * ( x1[i]  + x2[i]  - x3[i]  - x4[i]  ) ;
+         Real_type xj  = half * ( x2[i]  + x3[i]  - x4[i]  - x1[i]  ) ;
+
+         Real_type yi  = half * ( y1[i]  + y2[i]  - y3[i]  - y4[i]  ) ;
+         Real_type yj  = half * ( y2[i]  + y3[i]  - y4[i]  - y1[i]  ) ;
+
+         Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ;
+         Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ;
+
+         Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ;
+         Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ;
+         // ptiny is added to the denominator before taking the reciprocal.
+         Real_type rarea  = 1.0 / ( xi * yj - xj * yi + ptiny ) ;
+
+         Real_type dfxdx  = rarea * ( fxi * yj - fxj * yi ) ;
+
+         Real_type dfydy  = rarea * ( fyj * xi - fyi * xj ) ;
+
+         Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) /
+                            ( y1[i]  + y2[i]  + y3[i]  + y4[i]  ) ;
+         // The result is the sum of the two derivative terms and the affine term.
+         div[i] = dfxdx + dfydy + affine ;
+      } );
+
+   }
+}
+// Registered for the SHORT, MEDIUM and LONG size arguments; times are reported in microseconds.
+BENCHMARK(BM_DEL_DOT_VEC_2D_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)->
+                                     Arg(LONG)->Unit(benchmark::kMicrosecond);
+
+static void BM_COUPLE_LAMBDA(benchmark::State& state) {
+   // Google Benchmark driver for the COUPLE kernel (forall + lambda over k, plain loops over j and i).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(COUPLE);
+   // Complex-valued work arrays: aliases of loop_data's 1D Complex arrays.
+   Complex_ptr t0 = loop_data.array_1D_Complex[0];
+   Complex_ptr t1 = loop_data.array_1D_Complex[1];
+   Complex_ptr t2 = loop_data.array_1D_Complex[2];
+   Complex_ptr denac = loop_data.array_1D_Complex[3];
+   Complex_ptr denlw = loop_data.array_1D_Complex[4];
+
+   // Index domain built from the benchmark size argument, with ndims = 3.
+   ADomain domain(state.range(0), /* ndims = */ 3);
+   // Loop extents copied out of the domain.
+   Index_type imin = domain.imin;
+   Index_type imax = domain.imax;
+   Index_type jmin = domain.jmin;
+   Index_type jmax = domain.jmax;
+   Index_type kmin = domain.kmin;
+   Index_type kmax = domain.kmax;
+   // Fixed constants and the factors derived from them.
+   const Real_type clight=3.e+10;
+   const Real_type csound=3.09e+7;
+   const Real_type omega0= 0.9;
+   const Real_type omegar= 0.9;
+   const Real_type dt= 0.208;
+   const Real_type c10 = 0.25 * (clight / csound);
+   const Real_type fratio = sqrt(omegar / omega0);
+   const Real_type r_fratio = 1.0/fratio;
+   const Real_type c20 = 0.25 * (clight / csound) * r_fratio;
+   const Complex_type ireal(0.0, 1.0);
+   // Each pass through this loop is one measured benchmark iteration.
+   for ( auto _ : state ) {
+      // forall is handed the bounds kmin and kmax and supplies the k index.
+      forall<exec_policy>(kmin, kmax,
+      [&] (Index_type k) {
+
+         for (Index_type j = jmin; j < jmax; j++) {
+            // Row offsets: it0 is computed with (jmax+1, imax+1), idenac with (jmax+2, imax+2).
+            Index_type it0=    ((k)*(jmax+1) + (j))*(imax+1) ;
+            Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ;
+
+            for (Index_type i = imin; i < imax; i++) {
+               // denac is addressed through idenac; denlw and t0/t1/t2 through it0.
+               Complex_type c1 = c10 * denac[idenac+i];
+               Complex_type c2 = c20 * denlw[it0+i];
+
+               /* promote to doubles to avoid possible divide by zero
+                  errors later on. */
+               Real_type c1re = real(c1);  Real_type c1im = imag(c1);
+               Real_type c2re = real(c2);  Real_type c2im = imag(c2);
+
+               /* compute lamda = sqrt(|c1|^2 + |c2|^2) using doubles
+                  to avoid underflow. */
+               Real_type zlam = c1re*c1re + c1im*c1im +
+                                c2re*c2re + c2im*c2im + 1.0e-34;
+               zlam = sqrt(zlam);
+               Real_type snlamt = sin(zlam * dt * 0.5);
+               Real_type cslamt = cos(zlam * dt * 0.5);
+               // Snapshot the current values; a2t is pre-scaled by fratio.
+               Complex_type a0t = t0[it0+i];
+               Complex_type a1t = t1[it0+i];
+               Complex_type a2t = t2[it0+i] * fratio;
+               // Normalize c1 and c2 by zlam before the updates below.
+               Real_type r_zlam= 1.0/zlam;
+               c1 *= r_zlam;
+               c2 *= r_zlam;
+               Real_type zac1 = zabs2(c1);
+               Real_type zac2 = zabs2(c2);
+
+               /* compute new A0 */
+               Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ;
+               t0[it0+i] = a0t * cslamt -  ireal * z3;
+
+               /* compute new A1  */
+               Real_type r = zac1 * cslamt + zac2;
+               Complex_type z5 = c2 * a2t;
+               Complex_type z4 = conj(c1) * z5 * (cslamt-1);
+               z3 = conj(c1) * a0t * snlamt;
+               t1[it0+i] = a1t * r + z4 - ireal * z3;
+
+               /* compute new A2  */
+               r = zac1 + zac2 * cslamt;
+               z5 = c1 * a1t;
+               z4 = conj(c2) * z5 * (cslamt-1);
+               z3 = conj(c2) * a0t * snlamt;
+               t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio;
+
+            }  // i loop
+
+         }  // j loop
+
+      } ); // k loop
+
+   } // google benchmark loop
+}
+// Registered for the SHORT, MEDIUM and LONG size arguments; times are reported in microseconds.
+BENCHMARK(BM_COUPLE_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)->
+                             Arg(LONG)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIR_LAMBDA(benchmark::State& state) {
+   // Google Benchmark driver for the FIR kernel (forall + lambda form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIR);
+
+   Real_ptr out = loop_data.array_1D_Real[0];
+   Real_ptr in = loop_data.array_1D_Real[1];
+
+   const Index_type coefflen = 16;
+   Real_type coeff[coefflen] = { 3.0, -1.0, -1.0, -1.0,
+                                 -1.0, 3.0, -1.0, -1.0,
+                                 -1.0, -1.0, 3.0, -1.0,
+                                 -1.0, -1.0, -1.0, 3.0 };
+   // The upper bound is shortened by coefflen because each output reads in[i] .. in[i+coefflen-1].
+   const Index_type len_minus_coeff = state.range(0) - coefflen;
+
+   // Each pass through this loop is one measured benchmark iteration.
+   for ( auto _ : state ) {
+      // out[i] = sum over j of coeff[j] * in[i+j]; forall is handed the bounds 0 and len_minus_coeff.
+      forall<exec_policy>(0, len_minus_coeff,
+      [&] (Index_type i) {
+         Real_type sum = 0.0;
+         for (Index_type j = 0; j < coefflen; ++j ) {
+            sum += coeff[j]*in[i+j];
+         }
+         out[i] = sum;
+      } );
+
+   }
+}
+// Registered for the size arguments 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_FIR_LAMBDA)->Arg(171)->Arg(5001)->
+                          Arg(44217)->Unit(benchmark::kMicrosecond);
Index: MicroBenchmarks/LCALS/SubsetALambdaLoops/lit.local.cfg
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetALambdaLoops/lit.local.cfg
@@ -0,0 +1,7 @@
+test_modules = config.test_modules  # alias of the config's module list; it is edited in place below
+if 'run' in test_modules:
+    # Place the 'microbenchmark' module directly after 'run' in the module list
+    test_modules.insert(test_modules.index('run')+1, 'microbenchmark')
+    # Timeit results are not useful for microbenchmarks, so drop 'timeit' when present
+    if 'timeit' in test_modules:
+        test_modules.remove('timeit')
Index: MicroBenchmarks/LCALS/SubsetARawLoops/CMakeLists.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetARawLoops/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(COPY lit.local.cfg DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) # copy the lit config into this directory's build tree
+# Compile options for the LCALS sources; the LCALS_* macros are the switches described in LCALSParams.hxx.
+list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG)
+llvm_test_run()
+llvm_test_executable(lcalsARaw ../main.cxx RawSubsetAbenchmarks.cxx ../LCALSStats.cxx ../LCALSSuite.cxx ../runReferenceLoops.cxx) # subset A raw kernels plus the shared suite sources
+target_link_libraries(lcalsARaw benchmark) # link against the benchmark library target
Index: MicroBenchmarks/LCALS/SubsetARawLoops/RawSubsetAbenchmarks.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetARawLoops/RawSubsetAbenchmarks.cxx
@@ -0,0 +1,455 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS "A" subset raw loops using the google
+// benchmark library
+//
+
+#include <benchmark/benchmark.h>
+#include "../LCALSSuite.hxx"
+#include "../SubsetDataA.hxx"
+
+static void BM_PRESSURE_CALC_RAW(benchmark::State& state) {
+   // Google Benchmark driver for the PRESSURE_CALC kernel (plain for-loop form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PRESSURE_CALC);
+   // Working arrays: aliases of loop_data's 1D Real arrays.
+   Real_ptr compression = loop_data.array_1D_Real[0];
+   Real_ptr bvc = loop_data.array_1D_Real[1];
+   Real_ptr p_new = loop_data.array_1D_Real[2];
+   Real_ptr e_old = loop_data.array_1D_Real[3];
+   Real_ptr vnewc = loop_data.array_1D_Real[4];
+   // Scalar coefficient and cut-off values, also taken from loop_data.
+   const Real_type cls = loop_data.scalar_Real[0];
+   const Real_type p_cut = loop_data.scalar_Real[1];
+   const Real_type pmin = loop_data.scalar_Real[2];
+   const Real_type eosvmax = loop_data.scalar_Real[3];
+   const Index_type len = state.range(0);  // loop bound read once, in the loop counter's own type
+   for( auto _ : state) {
+      // Pass 1: bvc[i] = cls * (compression[i] + 1).
+      for (Index_type i=0 ; i<len ; i++ ) {
+         bvc[i] = cls * (compression[i] + 1.0);
+      }
+      // Pass 2: p_new[i] = bvc[i] * e_old[i], followed by three cut-offs applied in order.
+      for (Index_type i=0 ; i<len ; i++ ) {
+         p_new[i] = bvc[i] * e_old[i] ;
+         // Magnitudes below p_cut are flushed to zero.
+         if ( fabs(p_new[i]) <  p_cut ) p_new[i] = 0.0 ;
+         // The result is also zeroed wherever vnewc has reached eosvmax.
+         if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ;
+         // Lower clamp at pmin; it runs last, so it also applies to the zeros set above.
+         if ( p_new[i]  <  pmin ) p_new[i]   = pmin ;
+      }
+
+   }
+}
+// Registered for the size arguments 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_PRESSURE_CALC_RAW)->Arg(171)->Arg(5001)->
+                                 Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_ENERGY_CALC_RAW(benchmark::State& state) {
+   // Google Benchmark driver for the ENERGY_CALC kernel (plain for-loop form, six passes).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(ENERGY_CALC);
+   // Working arrays: aliases of loop_data's 1D Real arrays.
+   Real_ptr e_new = loop_data.array_1D_Real[0];
+   Real_ptr e_old = loop_data.array_1D_Real[1];
+   Real_ptr delvc = loop_data.array_1D_Real[2];
+   Real_ptr p_new = loop_data.array_1D_Real[3];
+   Real_ptr p_old = loop_data.array_1D_Real[4];
+   Real_ptr q_new = loop_data.array_1D_Real[5];
+   Real_ptr q_old = loop_data.array_1D_Real[6];
+   Real_ptr work = loop_data.array_1D_Real[7];
+   Real_ptr compHalfStep = loop_data.array_1D_Real[8];
+   Real_ptr pHalfStep = loop_data.array_1D_Real[9];
+   Real_ptr bvc = loop_data.array_1D_Real[10];
+   Real_ptr pbvc = loop_data.array_1D_Real[11];
+   Real_ptr ql_old = loop_data.array_1D_Real[12];
+   Real_ptr qq_old = loop_data.array_1D_Real[13];
+   Real_ptr vnewc = loop_data.array_1D_Real[14];
+   // Scalar divisor and cut-off values, also taken from loop_data.
+   const Real_type rho0 = loop_data.scalar_Real[0];
+   const Real_type e_cut = loop_data.scalar_Real[1];
+   const Real_type emin = loop_data.scalar_Real[2];
+   const Real_type q_cut = loop_data.scalar_Real[3];
+   const Index_type len = state.range(0);  // loop bound read once, in the loop counter's own type
+   for( auto _ : state) {
+      // Pass 1: first estimate of e_new from e_old, delvc, the old p/q values and work.
+      for (Index_type i=0 ; i< len ; i++ ) {
+         e_new[i] = e_old[i] - 0.5 * delvc[i] *
+                    (p_old[i] + q_old[i]) + 0.5 * work[i];
+      }
+      // Pass 2: q_new is zero where delvc > 0, otherwise built from the half-step quantities.
+      for (Index_type i=0 ; i< len ; i++ ) {
+         if ( delvc[i] > 0.0 ) {
+            q_new[i] = 0.0 ;
+         }
+         else {
+            Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ;
+            Real_type ssc = ( pbvc[i] * e_new[i]
+               + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
+            // ssc at or below 0.1111111e-36 is replaced by a fixed value instead of taking sqrt.
+            if ( ssc <= 0.1111111e-36 ) {
+               ssc = 0.3333333e-18 ;
+            } else {
+               ssc = sqrt(ssc) ;
+            }
+
+            q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
+         }
+      }
+      for (Index_type i=0 ; i< len ; i++ ) {  // Pass 3: update e_new with the half-step pressure and q_new
+         e_new[i] = e_new[i] + 0.5 * delvc[i]
+            * ( 3.0*(p_old[i] + q_old[i])
+               - 4.0*(pHalfStep[i] + q_new[i])) ;
+      }
+      // Pass 4: add the second half of work, flush |e_new| < e_cut to zero, clamp below at emin.
+      for (Index_type i=0 ; i< len ; i++ ) {
+         e_new[i] += 0.5 * work[i];
+
+         if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0  ; }
+
+         if ( e_new[i]  < emin ) { e_new[i] = emin ; }
+      }
+      // Pass 5: compute a local q_tilde from vnewc/p_new, correct e_new with it, re-apply cut-offs.
+      for (Index_type i=0 ; i< len ; i++ ) {
+         Real_type q_tilde ;
+
+         if (delvc[i] > 0.0) {
+            q_tilde = 0. ;
+         }
+         else {
+            Real_type ssc = ( pbvc[i] * e_new[i]
+               + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+            if ( ssc <= 0.1111111e-36 ) {
+               ssc = 0.3333333e-18 ;
+            } else {
+               ssc = sqrt(ssc) ;
+            }
+
+            q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
+         }
+
+         e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i])
+                                - 8.0*(pHalfStep[i] + q_new[i])
+                                + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ;
+
+         if ( fabs(e_new[i]) < e_cut ) {
+            e_new[i] = 0.0  ;
+         }
+         if ( e_new[i]  < emin ) {
+            e_new[i] = emin ;
+         }
+      }
+      // Pass 6: where delvc <= 0, recompute q_new from the final e_new and flush |q_new| < q_cut.
+      for (Index_type i=0 ; i< len ; i++ ) {
+         if ( delvc[i] <= 0.0 ) {
+            Real_type ssc = ( pbvc[i] * e_new[i]
+               + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
+
+            if ( ssc <= 0.1111111e-36 ) {
+               ssc = 0.3333333e-18 ;
+            } else {
+               ssc = sqrt(ssc) ;
+            }
+
+            q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
+
+            if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ;
+         }
+      }
+
+   }
+}
+// Registered for the size arguments 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_ENERGY_CALC_RAW)->Arg(171)->Arg(5001)->
+                               Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_VOL3D_CALC_RAW(benchmark::State& state) {
+   // Google Benchmark driver for the VOL3D_CALC kernel (plain for-loop form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(VOL3D_CALC);
+   // Coordinate inputs x/y/z and the vol output: aliases of loop_data's 1D Real arrays.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+   Real_ptr vol = loop_data.array_1D_Real[3];
+   // Index domain built from the benchmark size argument, with ndims = 3.
+   ADomain domain(state.range(0), /* ndims = */ 3);
+
+   UnalignedReal_ptr x0,x1,x2,x3,x4,x5,x6,x7 ;
+   UnalignedReal_ptr y0,y1,y2,y3,y4,y5,y6,y7 ;
+   UnalignedReal_ptr z0,z1,z2,z3,z4,z5,z6,z7 ;
+   // NDPTRSET is a suite macro defined outside this file; presumably it points the eight numbered views at offsets of the base array - TODO confirm.
+   NDPTRSET(x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+   NDPTRSET(y,y0,y1,y2,y3,y4,y5,y6,y7) ;
+   NDPTRSET(z,z0,z1,z2,z3,z4,z5,z6,z7) ;
+
+   const Real_type vnormq = 0.083333333333333333; /* vnormq = 1/12 */
+   // Each pass through this loop is one measured benchmark iteration.
+   for (auto _ : state) {
+      // Visits every index from domain.fpz through domain.lpz inclusive.
+      for (Index_type i = domain.fpz ; i <= domain.lpz ; i++ ) {
+         // Coordinate differences between pairs of the numbered views, per axis.
+         Real_type x71 = x7[i] - x1[i] ;
+         Real_type x72 = x7[i] - x2[i] ;
+         Real_type x74 = x7[i] - x4[i] ;
+         Real_type x30 = x3[i] - x0[i] ;
+         Real_type x50 = x5[i] - x0[i] ;
+         Real_type x60 = x6[i] - x0[i] ;
+
+         Real_type y71 = y7[i] - y1[i] ;
+         Real_type y72 = y7[i] - y2[i] ;
+         Real_type y74 = y7[i] - y4[i] ;
+         Real_type y30 = y3[i] - y0[i] ;
+         Real_type y50 = y5[i] - y0[i] ;
+         Real_type y60 = y6[i] - y0[i] ;
+
+         Real_type z71 = z7[i] - z1[i] ;
+         Real_type z72 = z7[i] - z2[i] ;
+         Real_type z74 = z7[i] - z4[i] ;
+         Real_type z30 = z3[i] - z0[i] ;
+         Real_type z50 = z5[i] - z0[i] ;
+         Real_type z60 = z6[i] - z0[i] ;
+         // Term 1 of 3: (xps, yps, zps) dotted with the cross-product components (cyz, czx, cxy).
+         Real_type xps = x71 + x60 ;
+         Real_type yps = y71 + y60 ;
+         Real_type zps = z71 + z60 ;
+
+         Real_type cyz = y72 * z30 - z72 * y30 ;
+         Real_type czx = z72 * x30 - x72 * z30 ;
+         Real_type cxy = x72 * y30 - y72 * x30 ;
+         vol[i] = xps * cyz + yps * czx + zps * cxy ;
+         // Term 2 of 3, accumulated into vol[i].
+         xps = x72 + x50 ;
+         yps = y72 + y50 ;
+         zps = z72 + z50 ;
+
+         cyz = y74 * z60 - z74 * y60 ;
+         czx = z74 * x60 - x74 * z60 ;
+         cxy = x74 * y60 - y74 * x60 ;
+         vol[i] += xps * cyz + yps * czx + zps * cxy ;
+         // Term 3 of 3, accumulated into vol[i].
+         xps = x74 + x30 ;
+         yps = y74 + y30 ;
+         zps = z74 + z30 ;
+
+         cyz = y71 * z50 - z71 * y50 ;
+         czx = z71 * x50 - x71 * z50 ;
+         cxy = x71 * y50 - y71 * x50 ;
+         vol[i] += xps * cyz + yps * czx + zps * cxy ;
+         // Scale the accumulated sum by 1/12.
+         vol[i] *= vnormq ;
+
+      }
+
+   }
+}
+// Registered for the SHORT, MEDIUM and LONG size arguments; times are reported in microseconds.
+BENCHMARK(BM_VOL3D_CALC_RAW)->Arg(SHORT)->Arg(MEDIUM)->
+                              Arg(LONG)->Unit(benchmark::kMicrosecond);
+
+static void BM_DEL_DOT_VEC_2D_RAW(benchmark::State& state) {
+   // Google Benchmark driver for the DEL_DOT_VEC_2D kernel (plain for-loop form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(DEL_DOT_VEC_2D);
+   // Inputs x, y, xdot, ydot and the div output: aliases of loop_data's 1D Real arrays.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr xdot = loop_data.array_1D_Real[2];
+   Real_ptr ydot = loop_data.array_1D_Real[3];
+   Real_ptr div = loop_data.array_1D_Real[4];
+   // Index domain built from the benchmark size argument, with ndims = 2.
+   ADomain domain(state.range(0), /* ndims = */ 2);
+
+   UnalignedReal_ptr x1,x2,x3,x4 ;
+   UnalignedReal_ptr y1,y2,y3,y4 ;
+   UnalignedReal_ptr fx1,fx2,fx3,fx4 ;
+   UnalignedReal_ptr fy1,fy2,fy3,fy4 ;
+   // NDSET2D is a suite macro defined outside this file; presumably it points the four numbered views at offsets of the base array - TODO confirm.
+   NDSET2D(x,x1,x2,x3,x4) ;
+   NDSET2D(y,y1,y2,y3,y4) ;
+   NDSET2D(xdot,fx1,fx2,fx3,fx4) ;
+   NDSET2D(ydot,fy1,fy2,fy3,fy4) ;
+
+   const Real_type ptiny = 1.0e-20;
+   const Real_type half  = 0.5;
+   // Each pass through this loop is one measured benchmark iteration.
+   for ( auto _ : state ) {
+      // ii walks the first domain.n_real_zones entries of domain.real_zones.
+      for (Index_type ii = 0 ; ii < domain.n_real_zones ; ii++ ) {
+         // Indirection: the array index i is looked up from the real_zones list.
+         Index_type i  = domain.real_zones[ii] ;
+
+         Real_type xi  = half * ( x1[i]  + x2[i]  - x3[i]  - x4[i]  ) ;
+         Real_type xj  = half * ( x2[i]  + x3[i]  - x4[i]  - x1[i]  ) ;
+
+         Real_type yi  = half * ( y1[i]  + y2[i]  - y3[i]  - y4[i]  ) ;
+         Real_type yj  = half * ( y2[i]  + y3[i]  - y4[i]  - y1[i]  ) ;
+
+         Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ;
+         Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ;
+
+         Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ;
+         Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ;
+         // ptiny is added to the denominator before taking the reciprocal.
+         Real_type rarea  = 1.0 / ( xi * yj - xj * yi + ptiny ) ;
+
+         Real_type dfxdx  = rarea * ( fxi * yj - fxj * yi ) ;
+
+         Real_type dfydy  = rarea * ( fyj * xi - fyi * xj ) ;
+
+         Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) /
+                            ( y1[i]  + y2[i]  + y3[i]  + y4[i]  ) ;
+         // The result is the sum of the two derivative terms and the affine term.
+         div[i] = dfxdx + dfydy + affine ;
+      }
+
+   }
+}
+// Registered for the SHORT, MEDIUM and LONG size arguments; times are reported in microseconds.
+BENCHMARK(BM_DEL_DOT_VEC_2D_RAW)->Arg(SHORT)->Arg(MEDIUM)->
+                                  Arg(LONG)->Unit(benchmark::kMicrosecond);
+
+static void BM_COUPLE_RAW(benchmark::State& state) {
+   // Google Benchmark driver for the COUPLE kernel (plain triple-nested for-loop form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(COUPLE);
+   // Complex-valued work arrays: aliases of loop_data's 1D Complex arrays.
+   Complex_ptr t0 = loop_data.array_1D_Complex[0];
+   Complex_ptr t1 = loop_data.array_1D_Complex[1];
+   Complex_ptr t2 = loop_data.array_1D_Complex[2];
+   Complex_ptr denac = loop_data.array_1D_Complex[3];
+   Complex_ptr denlw = loop_data.array_1D_Complex[4];
+
+   // Index domain built from the benchmark size argument, with ndims = 3.
+   ADomain domain(state.range(0), /* ndims = */ 3);
+   // Loop extents copied out of the domain.
+   Index_type imin = domain.imin;
+   Index_type imax = domain.imax;
+   Index_type jmin = domain.jmin;
+   Index_type jmax = domain.jmax;
+   Index_type kmin = domain.kmin;
+   Index_type kmax = domain.kmax;
+   // Fixed constants and the factors derived from them.
+   const Real_type clight=3.e+10;
+   const Real_type csound=3.09e+7;
+   const Real_type omega0= 0.9;
+   const Real_type omegar= 0.9;
+   const Real_type dt= 0.208;
+   const Real_type c10 = 0.25 * (clight / csound);
+   const Real_type fratio = sqrt(omegar / omega0);
+   const Real_type r_fratio = 1.0/fratio;
+   const Real_type c20 = 0.25 * (clight / csound) * r_fratio;
+   const Complex_type ireal(0.0, 1.0);
+   // Each pass through this loop is one measured benchmark iteration.
+   for ( auto _ : state ) {
+
+      for (Index_type k = kmin; k < kmax; k++) {
+
+         for (Index_type j = jmin; j < jmax; j++) {
+            // Row offsets: it0 is computed with (jmax+1, imax+1), idenac with (jmax+2, imax+2).
+            Index_type it0=    ((k)*(jmax+1) + (j))*(imax+1) ;
+            Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ;
+
+            for (Index_type i = imin; i < imax; i++) {
+               // denac is addressed through idenac; denlw and t0/t1/t2 through it0.
+               Complex_type c1 = c10 * denac[idenac+i];
+               Complex_type c2 = c20 * denlw[it0+i];
+
+               /* promote to doubles to avoid possible divide by zero
+                  errors later on. */
+               Real_type c1re = real(c1);  Real_type c1im = imag(c1);
+               Real_type c2re = real(c2);  Real_type c2im = imag(c2);
+
+               /* compute lamda = sqrt(|c1|^2 + |c2|^2) using doubles
+                  to avoid underflow. */
+               Real_type zlam = c1re*c1re + c1im*c1im +
+                                c2re*c2re + c2im*c2im + 1.0e-34;
+               zlam = sqrt(zlam);
+               Real_type snlamt = sin(zlam * dt * 0.5);
+               Real_type cslamt = cos(zlam * dt * 0.5);
+               // Snapshot the current values; a2t is pre-scaled by fratio.
+               Complex_type a0t = t0[it0+i];
+               Complex_type a1t = t1[it0+i];
+               Complex_type a2t = t2[it0+i] * fratio;
+               // Normalize c1 and c2 by zlam before the updates below.
+               Real_type r_zlam= 1.0/zlam;
+               c1 *= r_zlam;
+               c2 *= r_zlam;
+               Real_type zac1 = zabs2(c1);
+               Real_type zac2 = zabs2(c2);
+
+               /* compute new A0 */
+               Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ;
+               t0[it0+i] = a0t * cslamt -  ireal * z3;
+
+               /* compute new A1  */
+               Real_type r = zac1 * cslamt + zac2;
+               Complex_type z5 = c2 * a2t;
+               Complex_type z4 = conj(c1) * z5 * (cslamt-1);
+               z3 = conj(c1) * a0t * snlamt;
+               t1[it0+i] = a1t * r + z4 - ireal * z3;
+
+               /* compute new A2  */
+               r = zac1 + zac2 * cslamt;
+               z5 = c1 * a1t;
+               z4 = conj(c2) * z5 * (cslamt-1);
+               z3 = conj(c2) * a0t * snlamt;
+               t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio;
+
+            }  // i loop
+
+         }  // j loop
+
+      }  // k loop
+
+   } // benchmark loop
+}
+// Registered for the SHORT, MEDIUM and LONG size arguments; times are reported in microseconds.
+BENCHMARK(BM_COUPLE_RAW)->Arg(SHORT)->Arg(MEDIUM)->
+                          Arg(LONG)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIR_RAW(benchmark::State& state) {
+   // Google Benchmark driver for the FIR kernel (plain for-loop form).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIR);
+
+   Real_ptr out = loop_data.array_1D_Real[0];
+   Real_ptr in = loop_data.array_1D_Real[1];
+
+   const Index_type coefflen = 16;
+   Real_type coeff[coefflen] = { 3.0, -1.0, -1.0, -1.0,
+                                -1.0, 3.0, -1.0, -1.0,
+                                -1.0, -1.0, 3.0, -1.0,
+                                -1.0, -1.0, -1.0, 3.0 };
+   // The upper bound is shortened by coefflen because each output reads in[i] .. in[i+coefflen-1].
+   const Index_type len_minus_coeff = state.range(0) - coefflen;
+
+   // Each pass through this loop is one measured benchmark iteration.
+   for ( auto _ : state ) {
+      // out[i] = sum over j of coeff[j] * in[i+j], for i in [0, len_minus_coeff).
+      for (Index_type i = 0 ; i < len_minus_coeff ; i++ ) {
+         Real_type sum = 0.0;
+
+         for (Index_type j = 0; j < coefflen; ++j ) {
+            sum += coeff[j]*in[i+j];
+         }
+         out[i] = sum;
+      }
+
+   }
+}
+// Registered for the size arguments 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_FIR_RAW)->Arg(171)->Arg(5001)->
+                       Arg(44217)->Unit(benchmark::kMicrosecond);
Index: MicroBenchmarks/LCALS/SubsetARawLoops/lit.local.cfg
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetARawLoops/lit.local.cfg
@@ -0,0 +1,7 @@
+test_modules = config.test_modules  # edited in place below (assumes a plain list attribute)
+if 'run' in test_modules:
+    # Insert the 'microbenchmark' module immediately after 'run'
+    test_modules.insert(test_modules.index('run')+1, 'microbenchmark')
+    # Timeit results are not useful for microbenchmarks, so drop that module
+    if 'timeit' in test_modules:
+        test_modules.remove('timeit')
Index: MicroBenchmarks/LCALS/SubsetBLambdaLoops/CMakeLists.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetBLambdaLoops/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(COPY lit.local.cfg DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+# Build lcalsBLambda from this subset's benchmark file plus the shared LCALS sources one directory up.
+list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG)
+llvm_test_run()
+llvm_test_executable(lcalsBLambda ../main.cxx LambdaSubsetBbenchmarks.cxx  ../LCALSStats.cxx ../LCALSSuite.cxx  ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx)
+target_link_libraries(lcalsBLambda benchmark)
+
Index: MicroBenchmarks/LCALS/SubsetBLambdaLoops/LambdaSubsetBbenchmarks.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetBLambdaLoops/LambdaSubsetBbenchmarks.cxx
@@ -0,0 +1,137 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS "B" subset forall lambda loops using 
+// the google benchmark library.
+//
+
+#include <benchmark/benchmark.h>
+#include "../LCALSSuite.hxx"
+#include "../SubsetDataB.hxx"
+#include "../LCALSTraversalMethods.hxx"
+
+static void BM_INIT3_LAMBDA(benchmark::State& state) {
+   // INIT3 kernel, lambda form: one computed value is stored to three arrays.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(INIT3);
+
+   Real_ptr out1 = loop_data.array_1D_Real[0];
+   Real_ptr out2 = loop_data.array_1D_Real[1];
+   Real_ptr out3 = loop_data.array_1D_Real[2];
+   Real_ptr in1 = loop_data.array_1D_Real[3];
+   Real_ptr in2 = loop_data.array_1D_Real[4];
+
+   for( auto _ : state) {
+      // One benchmark iteration: forall is given the bounds 0 and state.range(0).
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         out1[i] = out2[i] = out3[i] = - in1[i] - in2[i];
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_INIT3_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_MULADDSUB_LAMBDA(benchmark::State& state) {
+   // MULADDSUB kernel, lambda form: product, sum and difference of two inputs.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(MULADDSUB);
+
+   Real_ptr out1 = loop_data.array_1D_Real[0];
+   Real_ptr out2 = loop_data.array_1D_Real[1];
+   Real_ptr out3 = loop_data.array_1D_Real[2];
+   Real_ptr in1 = loop_data.array_1D_Real[3];
+   Real_ptr in2 = loop_data.array_1D_Real[4];
+
+   for ( auto _ : state) {
+      // One benchmark iteration: forall is given the bounds 0 and state.range(0).
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         out1[i] = in1[i] * in2[i] ;
+         out2[i] = in1[i] + in2[i] ;
+         out3[i] = in1[i] - in2[i] ;
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_MULADDSUB_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_IF_QUAD_LAMBDA(benchmark::State& state) {
+   // IF_QUAD kernel, lambda form: per-element quadratic formula with a branch.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(IF_QUAD);
+
+   Real_ptr a = loop_data.array_1D_Real[0];
+   Real_ptr b = loop_data.array_1D_Real[1];
+   Real_ptr c = loop_data.array_1D_Real[2];
+   Real_ptr x1 = loop_data.array_1D_Real[3];
+   Real_ptr x2 = loop_data.array_1D_Real[4];
+
+   for ( auto _ : state ) { 
+      // s is b^2 - 4ac; when it is negative both results are set to 0.0.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         Real_type s = b[i]*b[i] - 4.0*a[i]*c[i];
+         if ( s >= 0 ) {
+            s = sqrt(s);
+            x2[i] = (-b[i]+s)/(2.0*a[i]);
+            x1[i] = (-b[i]-s)/(2.0*a[i]);
+         } else {
+            x2[i] = 0.0;
+            x1[i] = 0.0;
+         }
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_IF_QUAD_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+
+static void BM_TRAP_INT_LAMBDA(benchmark::State& state) {
+   // TRAP_INT kernel, lambda form: sums trap_int_func at points x0 + i*h.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(TRAP_INT);
+
+   Real_type xn = loop_data.scalar_Real[0];
+   Real_type x0 = loop_data.scalar_Real[1];
+   Real_type xp = loop_data.scalar_Real[2];
+   Real_type y = loop_data.scalar_Real[3];
+   Real_type yp = loop_data.scalar_Real[4];
+
+   Index_type nx = loop_data.array_1D_Indx[0][0] + 1;
+
+   const Real_type h = (xn - x0) / nx;
+   Real_type sumx = 0.5*( trap_int_func(x0, y, xp, yp) +
+                          trap_int_func(xn, y, xp, yp) );
+
+   Real_type val = 0;
+   // NOTE: sumx is not reset inside the loop below; it keeps growing across iterations.
+   for (auto _ : state) {
+
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         Real_type x = x0 + i*h;
+         sumx += trap_int_func(x, y, xp, yp);
+      } );
+      benchmark::DoNotOptimize(val = sumx * h);
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_TRAP_INT_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
Index: MicroBenchmarks/LCALS/SubsetBLambdaLoops/lit.local.cfg
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetBLambdaLoops/lit.local.cfg
@@ -0,0 +1,7 @@
+test_modules = config.test_modules  # edited in place below (assumes a plain list attribute)
+if 'run' in test_modules:
+    # Insert the 'microbenchmark' module immediately after 'run'
+    test_modules.insert(test_modules.index('run')+1, 'microbenchmark')
+    # Timeit results are not useful for microbenchmarks, so drop that module
+    if 'timeit' in test_modules:
+        test_modules.remove('timeit')
Index: MicroBenchmarks/LCALS/SubsetBRawLoops/CMakeLists.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetBRawLoops/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(COPY lit.local.cfg DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+# Build lcalsBRaw from this subset's benchmark file plus the shared LCALS sources one directory up.
+list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG)
+llvm_test_run()
+llvm_test_executable(lcalsBRaw ../main.cxx RawSubsetBbenchmarks.cxx  ../LCALSStats.cxx ../LCALSSuite.cxx  ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx)
+target_link_libraries(lcalsBRaw benchmark)
+
Index: MicroBenchmarks/LCALS/SubsetBRawLoops/RawSubsetBbenchmarks.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetBRawLoops/RawSubsetBbenchmarks.cxx
@@ -0,0 +1,132 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS "B" subset raw loops using the google
+// benchmark library
+//
+
+#include <benchmark/benchmark.h>
+#include "../LCALSSuite.hxx"
+#include "../SubsetDataB.hxx"
+
+static void BM_INIT3_RAW(benchmark::State& state) {
+   // INIT3 kernel, raw-loop form: one computed value is stored to three arrays.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(INIT3);
+
+   Real_ptr out1 = loop_data.array_1D_Real[0];
+   Real_ptr out2 = loop_data.array_1D_Real[1];
+   Real_ptr out3 = loop_data.array_1D_Real[2];
+   Real_ptr in1 = loop_data.array_1D_Real[3];
+   Real_ptr in2 = loop_data.array_1D_Real[4];
+
+   for( auto _ : state) {
+      // One benchmark iteration: a single sweep over [0, state.range(0)).
+      for (Index_type i=0 ; i< state.range(0) ; i++ ) {
+         out1[i] = out2[i] = out3[i] = - in1[i] - in2[i];
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_INIT3_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_MULADDSUB_RAW(benchmark::State& state) {
+   // MULADDSUB kernel, raw-loop form: product, sum and difference of two inputs.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(MULADDSUB);
+
+   Real_ptr out1 = loop_data.array_1D_Real[0];
+   Real_ptr out2 = loop_data.array_1D_Real[1];
+   Real_ptr out3 = loop_data.array_1D_Real[2];
+   Real_ptr in1 = loop_data.array_1D_Real[3];
+   Real_ptr in2 = loop_data.array_1D_Real[4];
+
+   for ( auto _ : state) {
+      // One benchmark iteration: a single sweep over [0, state.range(0)).
+      for (Index_type i=0 ; i< state.range(0) ; i++ ) {
+         out1[i] = in1[i] * in2[i] ;
+         out2[i] = in1[i] + in2[i] ;
+         out3[i] = in1[i] - in2[i] ;
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_MULADDSUB_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_IF_QUAD_RAW(benchmark::State& state) {
+   // IF_QUAD kernel, raw-loop form: per-element quadratic formula with a branch.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(IF_QUAD);
+
+   Real_ptr a = loop_data.array_1D_Real[0];
+   Real_ptr b = loop_data.array_1D_Real[1];
+   Real_ptr c = loop_data.array_1D_Real[2];
+   Real_ptr x1 = loop_data.array_1D_Real[3];
+   Real_ptr x2 = loop_data.array_1D_Real[4];
+
+   for ( auto _ : state ) { 
+      // s is b^2 - 4ac; when it is negative both results are set to 0.0.
+      for (Index_type i=0 ; i< state.range(0); i++ ) {
+         Real_type s = b[i]*b[i] - 4.0*a[i]*c[i];
+         if ( s >= 0 ) {
+            s = sqrt(s);
+            x2[i] = (-b[i]+s)/(2.0*a[i]);
+            x1[i] = (-b[i]-s)/(2.0*a[i]);
+         } else {
+            x2[i] = 0.0;
+            x1[i] = 0.0;
+         }
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_IF_QUAD_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+
+static void BM_TRAP_INT_RAW(benchmark::State& state) {
+   // TRAP_INT kernel, raw-loop form: sums trap_int_func at points x0 + i*h.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(TRAP_INT);
+
+   Real_type xn = loop_data.scalar_Real[0];
+   Real_type x0 = loop_data.scalar_Real[1];
+   Real_type xp = loop_data.scalar_Real[2];
+   Real_type y = loop_data.scalar_Real[3];
+   Real_type yp = loop_data.scalar_Real[4];
+
+   Index_type nx = loop_data.array_1D_Indx[0][0] + 1;
+
+   const Real_type h = (xn - x0) / nx;
+   Real_type sumx = 0.5*( trap_int_func(x0, y, xp, yp) +
+                          trap_int_func(xn, y, xp, yp) );
+
+   Real_type val = 0;
+   // NOTE: sumx is not reset inside the loop below; it keeps growing across iterations.
+   for (auto _ : state) {
+
+      for (Index_type i=0 ; i< state.range(0); i++ ) {
+         Real_type x = x0 + i*h;
+         sumx += trap_int_func(x, y, xp, yp);
+      }
+      benchmark::DoNotOptimize(val = sumx * h);
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_TRAP_INT_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
Index: MicroBenchmarks/LCALS/SubsetBRawLoops/lit.local.cfg
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetBRawLoops/lit.local.cfg
@@ -0,0 +1,7 @@
+test_modules = config.test_modules  # edited in place below (assumes a plain list attribute)
+if 'run' in test_modules:
+    # Insert the 'microbenchmark' module immediately after 'run'
+    test_modules.insert(test_modules.index('run')+1, 'microbenchmark')
+    # Timeit results are not useful for microbenchmarks, so drop that module
+    if 'timeit' in test_modules:
+        test_modules.remove('timeit')
Index: MicroBenchmarks/LCALS/SubsetCLambdaLoops/CMakeLists.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetCLambdaLoops/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(COPY lit.local.cfg DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+# Build lcalsCLambda from this subset's benchmark file plus the shared LCALS sources one directory up.
+list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG)
+# Disabled alternative, kept for reference: llvm_test_run(--benchmark_repetitions=5)
+llvm_test_run()
+llvm_test_executable(lcalsCLambda ../main.cxx LambdaSubsetCbenchmarks.cxx  ../LCALSStats.cxx ../LCALSSuite.cxx  ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx)
+target_link_libraries(lcalsCLambda benchmark)
Index: MicroBenchmarks/LCALS/SubsetCLambdaLoops/LambdaSubsetCbenchmarks.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetCLambdaLoops/LambdaSubsetCbenchmarks.cxx
@@ -0,0 +1,718 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS "C" subset forall lambda loops using 
+// the google benchmark library.
+//
+
+#include <benchmark/benchmark.h>
+#include "../LCALSSuite.hxx"
+#include "../LCALSTraversalMethods.hxx"
+
+static void BM_HYDRO_1D_LAMBDA(benchmark::State& state) {
+   // HYDRO_1D kernel, lambda form: x[k] = q + y[k]*(r*z[k+10] + t*z[k+11]).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(HYDRO_1D);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+
+   const Real_type q = loop_data.scalar_Real[0];
+   const Real_type r = loop_data.scalar_Real[1];
+   const Real_type t = loop_data.scalar_Real[2];
+
+   for (auto _ : state) {
+      // z is read up to index k+11, i.e. 11 elements past the last k.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k)  {
+         x[k] = q + y[k]*( r*z[k+10] + t*z[k+11] );
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_HYDRO_1D_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_ICCG_LAMBDA(benchmark::State& state) {
+   // ICCG kernel, lambda form: repeated stride-2 passes over halving segments.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(ICCG);
+
+   Real_ptr x = loop_data.array_1D_Nx4_Real[0];
+   Real_ptr v = loop_data.array_1D_Nx4_Real[1];
+
+   Index_type ii, ipnt, ipntp, i;
+
+  for (auto _ : state) {
+      // ii is the current segment length; it is halved on every do-while pass.
+      ii = state.range(0);
+      ipntp = 0;
+      do {
+         ipnt = ipntp;
+         ipntp += ii;
+         ii /= 2;
+         i = ipntp ;
+         forall<exec_policy>(ipnt+1, ipntp, 2,
+         [&] (Index_type k) {
+            i++;
+            x[i] = x[k] - v[k  ]*x[k-1] - v[k+1]*x[k+1];
+         } );
+      } while ( ii>0 );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_ICCG_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_INNER_PROD_LAMBDA(benchmark::State& state) {
+   // INNER_PROD kernel, lambda form: q accumulates z[k]*x[k].
+   LoopData& loop_data = getLoopData();
+
+   loopInit(INNER_PROD);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr z = loop_data.array_1D_Real[1];
+
+   Real_type q = 0.0;
+   // q is captured by reference and reset at the start of every iteration.
+
+   for (auto _ : state) {
+
+      q = 0.0;
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         benchmark::DoNotOptimize(q += z[k]*x[k]);
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_INNER_PROD_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_BAND_LIN_EQ_LAMBDA(benchmark::State& state) {
+   // BAND_LIN_EQ kernel, lambda form: stride-5 reduction into x[k-1].
+   LoopData& loop_data = getLoopData();
+
+   loopInit(BAND_LIN_EQ);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+
+   Index_type lw;
+   Real_type temp;
+
+   for (auto _ : state) {
+      // Outer bounds are the fixed constant 1001, not state.range(0): k = 6, 503, 1000.
+      Index_type m = ( 1001-7 )/2;
+      for ( Index_type k=6 ; k<1001 ; k=k+m ) {
+         lw = k - 6;
+         temp = x[k-1];
+         forall<exec_policy>(4, state.range(0), 5,
+         [&] (Index_type j) {
+            temp -= x[lw]*y[j];
+            lw++;
+         } );
+         x[k-1] = y[4]*temp;
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_BAND_LIN_EQ_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_TRIDIAG_ELIM_LAMBDA(benchmark::State& state) {
+   // TRIDIAG_ELIM kernel, lambda form: x[i] = z[i]*(y[i] - x[i-1]).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(TRIDIAG_ELIM);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+
+   for (auto _ : state) {
+      // Starts at 1 because each element reads x[i-1].
+      forall<exec_policy>(1, state.range(0),
+      [&] (Index_type i) {
+         x[i] = z[i]*( y[i] - x[i-1] );
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_TRIDIAG_ELIM_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_EOS_LAMBDA(benchmark::State& state) {
+   // EOS kernel, lambda form: nested polynomial in q, r, t over u, y and z.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(EOS);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+   Real_ptr u = loop_data.array_1D_Real[3];
+
+   const Real_type q = loop_data.scalar_Real[0];
+   const Real_type r = loop_data.scalar_Real[1];
+   const Real_type t = loop_data.scalar_Real[2];
+
+   for (auto _ : state) {
+      // u is read at offsets k .. k+6, i.e. up to 6 elements past the last k.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         x[k] = u[k] + r*( z[k] + r*y[k] ) +
+               t*( u[k+3] + r*( u[k+2] + r*u[k+1] ) +
+                  t*( u[k+6] + q*( u[k+5] + q*u[k+4] ) ) );
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_EOS_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_ADI_LAMBDA(benchmark::State& state) {
+   // ADI kernel, lambda form: updates plane nl2 of u1/u2/u3 from plane nl1.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(ADI);
+
+   Real_ptr du1 = loop_data.array_1D_Real[0];
+   Real_ptr du2 = loop_data.array_1D_Real[1];
+   Real_ptr du3 = loop_data.array_1D_Real[2];
+
+   Real_ptr** u1 = loop_data.array_3D_2xNx4_Real[0];
+   Real_ptr** u2 = loop_data.array_3D_2xNx4_Real[1];
+   Real_ptr** u3 = loop_data.array_3D_2xNx4_Real[2];
+
+   const Real_type sig = loop_data.scalar_Real[0];
+   const Real_type a11 = loop_data.scalar_Real[1];
+   const Real_type a12 = loop_data.scalar_Real[2];
+   const Real_type a13 = loop_data.scalar_Real[3];
+   const Real_type a21 = loop_data.scalar_Real[4];
+   const Real_type a22 = loop_data.scalar_Real[5];
+   const Real_type a23 = loop_data.scalar_Real[6];
+   const Real_type a31 = loop_data.scalar_Real[7];
+   const Real_type a32 = loop_data.scalar_Real[8];
+   const Real_type a33 = loop_data.scalar_Real[9];
+
+   Index_type nl1 = 0;
+   Index_type nl2 = 1;
+   Index_type kx;
+
+   for (auto _ : state) {
+      // kx takes the values 1 and 2; the lambda reads columns kx-1 and kx+1.
+      for ( kx=1 ; kx<3 ; kx++ ) {
+         forall<exec_policy>(1, state.range(0),
+         [&] (Index_type ky) {
+            du1[ky] = u1[nl1][ky+1][kx] - u1[nl1][ky-1][kx];
+            du2[ky] = u2[nl1][ky+1][kx] - u2[nl1][ky-1][kx];
+            du3[ky] = u3[nl1][ky+1][kx] - u3[nl1][ky-1][kx];
+            u1[nl2][ky][kx]=
+               u1[nl1][ky][kx]+a11*du1[ky]+a12*du2[ky]+a13*du3[ky] + sig*
+               (u1[nl1][ky][kx+1]-2.0*u1[nl1][ky][kx]+u1[nl1][ky][kx-1]);
+            u2[nl2][ky][kx]=
+               u2[nl1][ky][kx]+a21*du1[ky]+a22*du2[ky]+a23*du3[ky] + sig*
+               (u2[nl1][ky][kx+1]-2.0*u2[nl1][ky][kx]+u2[nl1][ky][kx-1]);
+            u3[nl2][ky][kx]=
+               u3[nl1][ky][kx]+a31*du1[ky]+a32*du2[ky]+a33*du3[ky] + sig*
+               (u3[nl1][ky][kx+1]-2.0*u3[nl1][ky][kx]+u3[nl1][ky][kx-1]);
+         } );
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_ADI_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_INT_PREDICT_LAMBDA(benchmark::State& state) {
+   // INT_PREDICT kernel, lambda form: px[i][0] from a weighted sum of row i.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(INT_PREDICT);
+
+   Real_ptr* px = loop_data.array_2D_Nx25_Real[0];
+
+   const Real_type dm22 = loop_data.scalar_Real[0];
+   const Real_type dm23 = loop_data.scalar_Real[1];
+   const Real_type dm24 = loop_data.scalar_Real[2];
+   const Real_type dm25 = loop_data.scalar_Real[3];
+   const Real_type dm26 = loop_data.scalar_Real[4];
+   const Real_type dm27 = loop_data.scalar_Real[5];
+   const Real_type dm28 = loop_data.scalar_Real[6];
+   const Real_type c0 = loop_data.scalar_Real[7];
+
+   for (auto _ : state) {
+      // Only column 0 of each row is written; columns 2 and 4..12 are read.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         px[i][0] = dm28*px[i][12] + dm27*px[i][11] + dm26*px[i][10] +
+              dm25*px[i][ 9] + dm24*px[i][ 8] + dm23*px[i][ 7] +
+              dm22*px[i][ 6] + c0*( px[i][ 4] + px[i][ 5]) + px[i][ 2];
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_INT_PREDICT_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_DIFF_PREDICT_LAMBDA(benchmark::State& state) {
+   // DIFF_PREDICT kernel, lambda form: chained differences along row i of px.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(DIFF_PREDICT);
+
+   Real_ptr* px = loop_data.array_2D_Nx25_Real[0];
+   Real_ptr* cx = loop_data.array_2D_Nx25_Real[1];
+
+   for (auto _ : state) {
+      // ar/br/cr rotate: each column 4..12 is overwritten after its old value is used.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type i) {
+         Real_type ar, br, cr;
+         ar        =      cx[i][ 4];
+         br        = ar - px[i][ 4];
+         px[i][ 4] = ar;
+         cr        = br - px[i][ 5];
+         px[i][ 5] = br;
+         ar        = cr - px[i][ 6];
+         px[i][ 6] = cr;
+         br        = ar - px[i][ 7];
+         px[i][ 7] = ar;
+         cr        = br - px[i][ 8];
+         px[i][ 8] = br;
+         ar        = cr - px[i][ 9];
+         px[i][ 9] = cr;
+         br        = ar - px[i][10];
+         px[i][10] = ar;
+         cr        = br - px[i][11];
+         px[i][11] = br;
+         px[i][13] = cr - px[i][12];
+         px[i][12] = cr;
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_DIFF_PREDICT_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIRST_SUM_LAMBDA(benchmark::State& state) {
+   // FIRST_SUM kernel, lambda form: x[k] = x[k-1] + y[k], seeded with x[0] = y[0].
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIRST_SUM);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+
+   for (auto _ :state) {
+      // Each element depends on the one written just before it.
+      x[0] = y[0];
+      forall<exec_policy>(1, state.range(0),
+      [&] (Index_type k) {
+         x[k] = x[k-1] + y[k];
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_FIRST_SUM_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIRST_DIFF_LAMBDA(benchmark::State& state) {
+   // FIRST_DIFF kernel, lambda form: x[k] = y[k+1] - y[k].
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIRST_DIFF);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+
+   for (auto _ : state) {
+      // y is read one element past the last k.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         x[k] = y[k+1] - y[k];
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_FIRST_DIFF_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_PIC_2D_LAMBDA(benchmark::State& state) {
+   // PIC_2D kernel, lambda form: per-row update of p with indexed gather/scatter.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PIC_2D);
+
+   Real_ptr* p = loop_data.array_2D_Nx25_Real[0];
+   Real_ptr* b = loop_data.array_2D_Nx25_Real[1];
+   Real_ptr* c = loop_data.array_2D_Nx25_Real[2];
+
+   Real_ptr y = loop_data.array_1D_Real[0];
+   Real_ptr z = loop_data.array_1D_Real[1];
+
+   Index_type* e = loop_data.array_1D_Indx[0];
+   Index_type* f = loop_data.array_1D_Indx[1];
+
+   Real_ptr* h = loop_data.array_2D_64x64_Real[0];
+
+   for (auto _ : state) {
+      // In "x & 64-1" the subtraction binds tighter than &, so the mask is 63.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type ip) {
+         Index_type i1, j1, i2, j2;
+         i1 = (Index_type) p[ip][0];
+         j1 = (Index_type) p[ip][1];
+         i1 &= 64-1;
+         j1 &= 64-1;
+         p[ip][2] += b[j1][i1];
+         p[ip][3] += c[j1][i1];
+         p[ip][0] += p[ip][2];
+         p[ip][1] += p[ip][3];
+         i2 = (Index_type) p[ip][0];
+         j2 = (Index_type) p[ip][1];
+         i2 = ( i2 & 64-1 ) ;
+         j2 = ( j2 & 64-1 ) ;
+         p[ip][0] += y[i2+32];
+         p[ip][1] += z[j2+32];
+         i2 += e[i2+32];
+         j2 += f[j2+32];
+         h[j2][i2] += 1.0;
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_PIC_2D_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_PIC_1D_LAMBDA(benchmark::State& state) {
+   // PIC_1D kernel, lambda form: three consecutive forall passes per iteration.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PIC_1D);
+
+   Real_ptr vx = loop_data.array_1D_Real[0];
+   Real_ptr xx = loop_data.array_1D_Real[1];
+   Real_ptr xi = loop_data.array_1D_Real[2];
+   Real_ptr ex = loop_data.array_1D_Real[3];
+   Real_ptr ex1 = loop_data.array_1D_Real[4];
+   Real_ptr dex = loop_data.array_1D_Real[5];
+   Real_ptr dex1 = loop_data.array_1D_Real[6];
+   Real_ptr rh = loop_data.array_1D_Real[7];
+   Real_ptr rx = loop_data.array_1D_Real[8];
+
+   const Real_type flx = loop_data.scalar_Real[0];
+
+   Index_type* ix = loop_data.array_1D_Indx[2];
+   Index_type* ir = loop_data.array_1D_Indx[3];
+   Index_type* grd = loop_data.array_1D_Indx[4];
+
+
+   for (auto _ : state) {
+      // Pass 1: zero vx/xx and gather ex/dex at index grd[k] - 1.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         vx[k] = 0.0;
+         xx[k] = 0.0;
+         ix[k] = (Index_type) grd[k];
+         xi[k] = (Real_type) ix[k];
+         ex1[k] = ex[ ix[k] - 1 ];
+         dex1[k] = dex[ ix[k] - 1 ];
+      } );
+      // Pass 2: update vx/xx; ir[k] is masked with 2047 and shifted to 1..2048.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         vx[k] = vx[k] + ex1[k] + ( xx[k] - xi[k] )*dex1[k];
+         xx[k] = xx[k] + vx[k]  + flx;
+         ir[k] = (Index_type) xx[k];
+         rx[k] = xx[k] - ir[k];
+         ir[k] = ( ir[k] & (2048-1) ) + 1;
+         xx[k] = rx[k] + ir[k];
+      } );
+      // Pass 3: scatter-add into rh at ir[k]-1 and ir[k].
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         rh[ ir[k]-1 ] += 1.0 - rx[k];
+         rh[ ir[k]   ] += rx[k];
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_PIC_1D_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_HYDRO_2D_LAMBDA(benchmark::State& state) {
+   // HYDRO_2D kernel, lambda form: three sweeps over rows k = 1..5.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(HYDRO_2D);
+
+   Real_ptr* za = loop_data.array_2D_7xN_Real[0];
+   Real_ptr* zb = loop_data.array_2D_7xN_Real[1]; 
+   Real_ptr* zm = loop_data.array_2D_7xN_Real[2];
+   Real_ptr* zp = loop_data.array_2D_7xN_Real[3];
+   Real_ptr* zq = loop_data.array_2D_7xN_Real[4];
+   Real_ptr* zr = loop_data.array_2D_7xN_Real[5];
+   Real_ptr* zu = loop_data.array_2D_7xN_Real[6];
+   Real_ptr* zv = loop_data.array_2D_7xN_Real[7];
+   Real_ptr* zz = loop_data.array_2D_7xN_Real[8];
+
+   Real_ptr* zrout = loop_data.array_2D_7xN_Real[9];
+   Real_ptr* zzout = loop_data.array_2D_7xN_Real[10];
+
+   const Real_type t = 0.0037;
+   const Real_type s = 0.0041;
+
+   Index_type kn = 6;
+   Index_type jn = state.range(0);
+   Index_type k;
+
+   for (auto _ : state) {
+      // Sweep 1: compute za and zb.
+      for ( k=1 ; k<kn ; k++ ) {
+         forall<exec_policy>(1, jn,
+         [&] (Index_type j) {
+            za[k][j] = ( zp[k+1][j-1] +zq[k+1][j-1] -zp[k][j-1] -zq[k][j-1] )*
+                       ( zr[k][j] +zr[k][j-1] ) / ( zm[k][j-1] +zm[k+1][j-1]);
+            zb[k][j] = ( zp[k][j-1] +zq[k][j-1] -zp[k][j] -zq[k][j] ) *
+                       ( zr[k][j] +zr[k-1][j] ) / ( zm[k][j] +zm[k][j-1]);
+         } );
+      }
+      // Sweep 2: accumulate into zu and zv using za/zb from sweep 1.
+      for ( k=1 ; k<kn ; k++ ) {
+         forall<exec_policy>(1, jn,
+         [&] (Index_type j) {
+            zu[k][j] += s*( za[k][j]   *( zz[k][j] - zz[k][j+1] ) -
+                            za[k][j-1] *( zz[k][j] - zz[k][j-1] ) -
+                            zb[k][j]   *( zz[k][j] - zz[k-1][j] ) +
+                            zb[k+1][j] *( zz[k][j] - zz[k+1][j] ) );
+            zv[k][j] += s*( za[k][j]   *( zr[k][j] - zr[k][j+1] ) -
+                            za[k][j-1] *( zr[k][j] - zr[k][j-1] ) -
+                            zb[k][j]   *( zr[k][j] - zr[k-1][j] ) +
+                            zb[k+1][j] *( zr[k][j] - zr[k+1][j] ) );
+         } );
+      }
+      // Sweep 3: write zrout/zzout; zr and zz themselves are left untouched.
+      for ( k=1 ; k<kn ; k++ ) {
+         forall<exec_policy>(1, jn,
+         [&] (Index_type j) {
+            zrout[k][j] = zr[k][j] + t*zu[k][j];
+            zzout[k][j] = zz[k][j] + t*zv[k][j];
+         } );
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_HYDRO_2D_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_GEN_LIN_RECUR_LAMBDA(benchmark::State& state) {
+   // GEN_LIN_RECUR kernel, lambda form: recurrence carried through stb5.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(GEN_LIN_RECUR);
+
+   Real_ptr b5 = loop_data.array_1D_Real[0];
+   Real_ptr sa = loop_data.array_1D_Real[1];
+   Real_ptr sb = loop_data.array_1D_Real[2];
+
+   Real_type stb5 = loop_data.scalar_Real[0];
+
+   Index_type kb5i = 0;
+
+   for (auto _ : state) {
+      // stb5 is captured by reference, so every element depends on the previous one.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         b5[k+kb5i] = sa[k] + stb5*sb[k];
+         stb5 = b5[k+kb5i] - stb5;
+      } );
+      // Second pass maps i in [1, N] to k = N - i, i.e. the same indices mirrored.
+      forall<exec_policy>(1, state.range(0) + 1,
+      [&] (Index_type i) {
+         Index_type k = state.range(0) - i ;
+         b5[k+kb5i] = sa[k] + stb5*sb[k];
+         stb5 = b5[k+kb5i] - stb5;
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_GEN_LIN_RECUR_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_DISC_ORD_LAMBDA(benchmark::State& state) {
+   // DISC_ORD kernel, lambda form: computes x[k] and the next xx[k+1].
+   LoopData& loop_data = getLoopData();
+
+   loopInit(DISC_ORD);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+   Real_ptr u = loop_data.array_1D_Real[3];
+   Real_ptr v = loop_data.array_1D_Real[4];
+   Real_ptr w = loop_data.array_1D_Real[5];
+   Real_ptr g = loop_data.array_1D_Real[6];
+   Real_ptr xx = loop_data.array_1D_Real[7];
+   Real_ptr vx = loop_data.array_1D_Real[9];  // slot 8 is not used by this function
+   const Real_type s = loop_data.scalar_Real[0];
+   const Real_type t = loop_data.scalar_Real[1];
+   const Real_type dk = loop_data.scalar_Real[2];
+
+   for (auto _ : state) {
+      // dn defaults to 0.2; when di is non-zero it is z[k]/di limited by t, then by s.
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         Real_type di = y[k] - g[k] / ( xx[k] + dk );
+         Real_type dn = 0.2;
+         if ( di ) {
+            dn = z[k]/di ;
+            if ( t < dn ) dn = t;
+            if ( s > dn ) dn = s;
+         }
+         x[k] = ( ( w[k] + v[k]*dn )* xx[k] + u[k] ) / ( vx[k] + v[k]*dn );
+         xx[k+1] = ( x[k] - xx[k] )* dn + xx[k];
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_DISC_ORD_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_MAT_X_MAT_LAMBDA(benchmark::State& state) {
+   // MAT_X_MAT kernel, lambda form: px[j][i] += vy[k][i] * cx[j][k].
+   LoopData& loop_data = getLoopData();
+
+   loopInit(MAT_X_MAT);
+
+   Real_ptr* px = loop_data.array_2D_Nx25_Real[0];
+   Real_ptr* cx = loop_data.array_2D_Nx25_Real[1];
+   Real_ptr* vy = loop_data.array_2D_64x64_Real[0];
+
+   Index_type k, i;
+
+   for (auto _ : state) {
+      // k and i each run over the fixed range 0..24; forall is innermost, over j.
+      for ( k=0 ; k<25 ; k++ ) {
+         for ( i=0 ; i<25 ; i++ ) {
+            forall<exec_policy>(0, state.range(0),
+            [&] (Index_type j) {
+               px[j][i] += vy[k][i] * cx[j][k];
+            } );
+         }
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_MAT_X_MAT_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_PLANCKIAN_LAMBDA(benchmark::State& state) {
+   // PLANCKIAN kernel, lambda form: y = u/v, then w = x/(exp(y) - 1).
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PLANCKIAN);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr u = loop_data.array_1D_Real[2];
+   Real_ptr v = loop_data.array_1D_Real[3];
+   Real_ptr w = loop_data.array_1D_Real[4];
+
+   Real_type expmax = 20.0;
+   u[state.range(0)-1] = 0.99*expmax*v[state.range(0)-1];
+   // The store above makes the last ratio u/v approximately 0.99*expmax.
+   for (auto _ : state) {
+
+      forall<exec_policy>(0, state.range(0),
+      [&] (Index_type k) {
+         y[k] = u[k] / v[k];
+         w[k] = x[k] / ( exp( y[k] ) -1.0 );
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_PLANCKIAN_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_IMP_HYDRO_2D_LAMBDA(benchmark::State& state) {
+   // IMP_HYDRO_2D kernel, lambda form: in-place relaxation of za, rows j = 1..5.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(IMP_HYDRO_2D);
+
+   Real_ptr* za = loop_data.array_2D_7xN_Real[0];
+   Real_ptr* zb = loop_data.array_2D_7xN_Real[1];
+   Real_ptr* zr = loop_data.array_2D_7xN_Real[2];
+   Real_ptr* zu = loop_data.array_2D_7xN_Real[3];
+   Real_ptr* zv = loop_data.array_2D_7xN_Real[4];
+   Real_ptr* zz = loop_data.array_2D_7xN_Real[5];
+
+   Index_type j;
+
+   for (auto _ : state) {
+      // za is both read (four neighbours) and written, so updates feed later elements.
+      for ( j=1 ; j<6 ; j++ ) {
+         forall<exec_policy>(1, state.range(0),
+         [&] (Index_type k) {
+            Real_type qa = za[j+1][k]*zr[j][k] + za[j-1][k]*zb[j][k] +
+                 za[j][k+1]*zu[j][k] + za[j][k-1]*zv[j][k] + zz[j][k];
+            za[j][k] += 0.175*( qa - za[j][k] );
+         } );
+      }
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_IMP_HYDRO_2D_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIND_FIRST_MIN_LAMBDA(benchmark::State& state) {
+   // FIND_FIRST_MIN kernel, lambda form: m tracks the index of the smallest x.
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIND_FIRST_MIN);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+
+   Index_type m = 0;
+   // m is captured by reference and reset at the start of every iteration.
+
+   for (auto _ : state) {
+
+      m = 0;
+      forall<exec_policy>(1, state.range(0),
+      [&] (Index_type k) {
+         if ( x[k] < x[m] ) benchmark::DoNotOptimize(m = k);
+      } );
+
+   }
+}
+// Run with lengths 171, 5001 and 44217; times are reported in microseconds.
+BENCHMARK(BM_FIND_FIRST_MIN_LAMBDA)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
Index: MicroBenchmarks/LCALS/SubsetCLambdaLoops/lit.local.cfg
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetCLambdaLoops/lit.local.cfg
@@ -0,0 +1,7 @@
+test_modules = config.test_modules
+if 'run' in test_modules:
+    # Schedule the 'microbenchmark' module right after 'run'. 'timeit' output
+    # is not useful for microbenchmarks, so that module is dropped if present.
+    test_modules.insert(1 + test_modules.index('run'), 'microbenchmark')
+    if 'timeit' in test_modules:
+        test_modules.remove('timeit')
Index: MicroBenchmarks/LCALS/SubsetCRawLoops/CMakeLists.txt
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetCRawLoops/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(COPY lit.local.cfg DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+# Build the lcalsCRaw benchmark from the shared LCALS sources and link benchmark.
+list(APPEND CPPFLAGS -std=c++11 -DLCALS_USE_DOUBLE -DLCALS_USE_RESTRICT_PTR -DLCALS_VERIFY_CHECKSUM -DLCALS_USE_CLOCK -DLCALS_COMPILER_CLANG)
+llvm_test_run()
+llvm_test_executable(lcalsCRaw ../main.cxx RawSubsetCbenchmarks.cxx  ../LCALSStats.cxx ../LCALSSuite.cxx  ../LCALSTraversalMethods.cxx ../runReferenceLoops.cxx)
+target_link_libraries(lcalsCRaw benchmark)
Index: MicroBenchmarks/LCALS/SubsetCRawLoops/RawSubsetCbenchmarks.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetCRawLoops/RawSubsetCbenchmarks.cxx
@@ -0,0 +1,1017 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file containing LCALS "C" subset raw loops using the google
+// benchmark library.
+//
+
+#include <benchmark/benchmark.h>
+#include "../LCALSSuite.hxx"
+
+static void BM_HYDRO_1D_RAW(benchmark::State& state) {
+   // Raw-loop timing of the hydro fragment over state.range(0) elements.
+   /*
+    *******************************************************************
+    *   Kernel 1 -- hydro fragment
+    *******************************************************************
+    *       DO 1 L = 1,Loop
+    *       DO 1 k = 1,n
+    *  1       X(k)= Q + Y(k)*(R*ZX(k+10) + T*ZX(k+11))
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(HYDRO_1D);
+   // x is written; y and z are only read (z at offsets k+10 and k+11).
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+
+   const Real_type q = loop_data.scalar_Real[0];
+   const Real_type r = loop_data.scalar_Real[1];
+   const Real_type t = loop_data.scalar_Real[2];
+
+   for (auto _ : state) {
+
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         x[k] = q + y[k] * (r * z[k + 10] + t * z[k + 11]);
+      }
+
+   }
+}
+
+BENCHMARK(BM_HYDRO_1D_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+
+static void BM_ICCG_RAW(benchmark::State& state) {
+   // Raw-loop timing of the ICCG excerpt: repeated passes over halving spans.
+   /*
+    *******************************************************************
+    *   Kernel 2 -- ICCG excerpt (Incomplete Cholesky Conj. Gradient)
+    *******************************************************************
+    *    DO 200  L= 1,Loop
+    *        II= n
+    *     IPNTP= 0
+    *222   IPNT= IPNTP
+    *     IPNTP= IPNTP+II
+    *        II= II/2
+    *         i= IPNTP+1
+    CDIR$ IVDEP
+    *    DO 2 k= IPNT+2,IPNTP,2
+    *         i= i+1
+    *  2   X(i)= X(k) - V(k)*X(k-1) - V(k+1)*X(k+1)
+    *        IF( II.GT.1) GO TO 222
+    *200 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(ICCG);
+
+   Real_ptr x = loop_data.array_1D_Nx4_Real[0];
+   Real_ptr v = loop_data.array_1D_Nx4_Real[1];
+
+   // Each do-while pass advances the window [ipnt, ipntp) by ii and halves ii.
+
+   for (auto _ : state) {
+
+      Index_type ii = state.range(0);
+      Index_type ipntp = 0;
+      do {
+         const Index_type ipnt = ipntp;
+         ipntp += ii;
+         ii /= 2;
+         Index_type i = ipntp;
+         for (Index_type k = ipnt + 1; k < ipntp; k += 2) {
+            ++i;
+            x[i] = x[k] - v[k]*x[k-1] - v[k+1]*x[k+1];
+         }
+      } while (ii > 0);
+
+   }
+}
+
+BENCHMARK(BM_ICCG_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_INNER_PROD_RAW(benchmark::State& state) {
+   // Raw-loop timing of the inner product q = sum over k of z[k]*x[k].
+   /*
+    *******************************************************************
+    *   Kernel 3 -- inner product
+    *******************************************************************
+    *    DO 3 L= 1,Loop
+    *         Q= 0.0
+    *    DO 3 k= 1,n
+    *  3      Q= Q + Z(k)*X(k)
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(INNER_PROD);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr z = loop_data.array_1D_Real[1];
+
+   // Accumulator for the inner product; reset at the start of each timed pass.
+   Real_type q = 0.0;
+
+   for (auto _ : state) {
+      // DoNotOptimize below keeps the accumulated sum from being discarded.
+      q = 0.0;
+      for (Index_type k=0 ; k< state.range(0); k++ ) {
+         benchmark::DoNotOptimize(q += z[k]*x[k]);
+      }
+
+   }
+}
+
+BENCHMARK(BM_INNER_PROD_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_BAND_LIN_EQ_RAW(benchmark::State& state) {
+   // Raw-loop timing of the banded linear equations fragment.
+   /*
+    *******************************************************************
+    *   Kernel 4 -- banded linear equations
+    *******************************************************************
+    *            m= (1001-7)/2
+    *    DO 444  L= 1,Loop
+    *    DO 444  k= 7,1001,m
+    *           lw= k-6
+    *         temp= X(k-1)
+    CDIR$ IVDEP
+    *    DO   4  j= 5,n,5
+    *       temp  = temp   - XZ(lw)*Y(j)
+    *  4        lw= lw+1
+    *       X(k-1)= Y(5)*temp
+    *444 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(BAND_LIN_EQ);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+
+   // For every start index k, temp begins at x[k-1] and has x[lw]*y[j]
+   // subtracted while j advances by 5 and lw by 1; x[k-1] = y[4]*temp follows.
+
+   for (auto _ : state) {
+
+      const Index_type m = (1001 - 7) / 2;
+      for (Index_type k = 6; k < 1001; k += m) {
+         Index_type lw = k - 6;
+         Real_type temp = x[k - 1];
+         for (Index_type j = 4; j < state.range(0); j += 5) {
+            temp -= x[lw] * y[j];
+            ++lw;
+         }
+         x[k - 1] = y[4] * temp;
+      }
+
+   }
+}
+
+BENCHMARK(BM_BAND_LIN_EQ_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_TRIDIAG_ELIM_RAW(benchmark::State& state) {
+   // Raw-loop timing of a first-order recurrence: x[i] depends on x[i-1].
+   /*
+    *******************************************************************
+    *   Kernel 5 -- tri-diagonal elimination, below diagonal
+    *******************************************************************
+    *    DO 5 L = 1,Loop
+    *    DO 5 i = 2,n
+    *  5    X(i)= Z(i)*(Y(i) - X(i-1))
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(TRIDIAG_ELIM);
+   // x is updated in place; y and z are only read.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+
+   for (auto _ : state) {
+
+      for (Index_type i = 1; i < state.range(0); ++i) {
+         x[i] = z[i] * (y[i] - x[i - 1]);
+      }
+
+   }
+}
+
+BENCHMARK(BM_TRIDIAG_ELIM_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_EOS_RAW(benchmark::State& state) {
+   // Raw-loop timing of the equation-of-state fragment (nested sums in r, t, q).
+   /*
+    *******************************************************************
+    *   Kernel 7 -- equation of state fragment
+    *******************************************************************
+    *    DO 7 L= 1,Loop
+    *    DO 7 k= 1,n
+    *      X(k)=     U(k  ) + R*( Z(k  ) + R*Y(k  )) +
+    *   .        T*( U(k+3) + R*( U(k+2) + R*U(k+1)) +
+    *   .        T*( U(k+6) + Q*( U(k+5) + Q*U(k+4))))
+    *  7 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(EOS);
+   // x is written; y, z and u are only read (u at offsets k .. k+6).
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+   Real_ptr u = loop_data.array_1D_Real[3];
+
+   const Real_type q = loop_data.scalar_Real[0];
+   const Real_type r = loop_data.scalar_Real[1];
+   const Real_type t = loop_data.scalar_Real[2];
+
+   for (auto _ : state) {
+
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         x[k] = u[k] + r*(z[k] + r*y[k]) +
+                t*(u[k+3] + r*(u[k+2] + r*u[k+1]) +
+                   t*(u[k+6] + q*(u[k+5] + q*u[k+4])));
+      }
+
+   }
+}
+
+BENCHMARK(BM_EOS_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_ADI_RAW(benchmark::State& state) {
+   // Raw-loop timing of the ADI integration fragment on three 3-D arrays.
+   /*
+    *******************************************************************
+    *   Kernel 8 -- ADI integration
+    *******************************************************************
+    *    DO  8      L = 1,Loop
+    *             nl1 = 1
+    *             nl2 = 2
+    *    DO  8     kx = 2,3
+    CDIR$ IVDEP
+    *    DO  8     ky = 2,n
+    *          DU1(ky)=U1(kx,ky+1,nl1)  -  U1(kx,ky-1,nl1)
+    *          DU2(ky)=U2(kx,ky+1,nl1)  -  U2(kx,ky-1,nl1)
+    *          DU3(ky)=U3(kx,ky+1,nl1)  -  U3(kx,ky-1,nl1)
+    *    U1(kx,ky,nl2)=U1(kx,ky,nl1) +A11*DU1(ky) +A12*DU2(ky) +A13*DU3(ky)
+    *   .       + SIG*(U1(kx+1,ky,nl1) -2.*U1(kx,ky,nl1) +U1(kx-1,ky,nl1))
+    *    U2(kx,ky,nl2)=U2(kx,ky,nl1) +A21*DU1(ky) +A22*DU2(ky) +A23*DU3(ky)
+    *   .       + SIG*(U2(kx+1,ky,nl1) -2.*U2(kx,ky,nl1) +U2(kx-1,ky,nl1))
+    *    U3(kx,ky,nl2)=U3(kx,ky,nl1) +A31*DU1(ky) +A32*DU2(ky) +A33*DU3(ky)
+    *   .       + SIG*(U3(kx+1,ky,nl1) -2.*U3(kx,ky,nl1) +U3(kx-1,ky,nl1))
+    *  8 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(ADI);
+   // du1..du3 receive the ky+1 / ky-1 differences of u1..u3 for the current kx.
+   Real_ptr du1 = loop_data.array_1D_Real[0];
+   Real_ptr du2 = loop_data.array_1D_Real[1];
+   Real_ptr du3 = loop_data.array_1D_Real[2];
+
+   Real_ptr** u1 = loop_data.array_3D_2xNx4_Real[0];
+   Real_ptr** u2 = loop_data.array_3D_2xNx4_Real[1];
+   Real_ptr** u3 = loop_data.array_3D_2xNx4_Real[2];
+
+   const Real_type sig = loop_data.scalar_Real[0];
+   const Real_type a11 = loop_data.scalar_Real[1];
+   const Real_type a12 = loop_data.scalar_Real[2];
+   const Real_type a13 = loop_data.scalar_Real[3];
+   const Real_type a21 = loop_data.scalar_Real[4];
+   const Real_type a22 = loop_data.scalar_Real[5];
+   const Real_type a23 = loop_data.scalar_Real[6];
+   const Real_type a31 = loop_data.scalar_Real[7];
+   const Real_type a32 = loop_data.scalar_Real[8];
+   const Real_type a33 = loop_data.scalar_Real[9];
+   // nl1 selects the plane that is read, nl2 the plane that is written.
+   Index_type nl1 = 0;
+   Index_type nl2 = 1;
+   Index_type kx;
+
+   for (auto _ : state) {
+      // kx takes the values 1 and 2; ky runs from 1 to state.range(0)-1.
+      for ( kx=1 ; kx<3 ; kx++ ) {
+         for (Index_type ky=1 ; ky< state.range(0) ; ky++ ) {
+            du1[ky] = u1[nl1][ky+1][kx] - u1[nl1][ky-1][kx];
+            du2[ky] = u2[nl1][ky+1][kx] - u2[nl1][ky-1][kx];
+            du3[ky] = u3[nl1][ky+1][kx] - u3[nl1][ky-1][kx];
+            u1[nl2][ky][kx]=
+               u1[nl1][ky][kx]+a11*du1[ky]+a12*du2[ky]+a13*du3[ky] + sig*
+               (u1[nl1][ky][kx+1]-2.0*u1[nl1][ky][kx]+u1[nl1][ky][kx-1]);
+            u2[nl2][ky][kx]=
+               u2[nl1][ky][kx]+a21*du1[ky]+a22*du2[ky]+a23*du3[ky] + sig*
+               (u2[nl1][ky][kx+1]-2.0*u2[nl1][ky][kx]+u2[nl1][ky][kx-1]);
+            u3[nl2][ky][kx]=
+               u3[nl1][ky][kx]+a31*du1[ky]+a32*du2[ky]+a33*du3[ky] + sig*
+               (u3[nl1][ky][kx+1]-2.0*u3[nl1][ky][kx]+u3[nl1][ky][kx-1]);
+         }
+      }
+
+   }
+}
+
+BENCHMARK(BM_ADI_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_INT_PREDICT_RAW(benchmark::State& state) {
+   // Raw-loop timing of the predictor integration over rows of px.
+   /*
+    *******************************************************************
+    *   Kernel 9 -- integrate predictors
+    *******************************************************************
+    *    DO 9  L = 1,Loop
+    *    DO 9  i = 1,n
+    *    PX( 1,i)= DM28*PX(13,i) + DM27*PX(12,i) + DM26*PX(11,i) +
+    *   .          DM25*PX(10,i) + DM24*PX( 9,i) + DM23*PX( 8,i) +
+    *   .          DM22*PX( 7,i) +  C0*(PX( 5,i) +      PX( 6,i))+ PX( 3,i)
+    *  9 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(INT_PREDICT);
+   // Column 0 of every row is rebuilt from columns 2 and 4..12 of that row.
+   Real_ptr* px = loop_data.array_2D_Nx25_Real[0];
+
+   const Real_type dm22 = loop_data.scalar_Real[0];
+   const Real_type dm23 = loop_data.scalar_Real[1];
+   const Real_type dm24 = loop_data.scalar_Real[2];
+   const Real_type dm25 = loop_data.scalar_Real[3];
+   const Real_type dm26 = loop_data.scalar_Real[4];
+   const Real_type dm27 = loop_data.scalar_Real[5];
+   const Real_type dm28 = loop_data.scalar_Real[6];
+   const Real_type c0   = loop_data.scalar_Real[7];
+
+   for (auto _ : state) {
+
+      for (Index_type i = 0; i < state.range(0); ++i) {
+         px[i][0] = dm28*px[i][12] + dm27*px[i][11] + dm26*px[i][10] +
+                    dm25*px[i][9]  + dm24*px[i][8]  + dm23*px[i][7]  +
+                    dm22*px[i][6]  + c0*(px[i][4] + px[i][5]) + px[i][2];
+      }
+
+   }
+}
+
+BENCHMARK(BM_INT_PREDICT_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_DIFF_PREDICT_RAW(benchmark::State& state) {
+   // Raw-loop timing of the difference predictors over rows of px and cx.
+   /*
+    *******************************************************************
+    *   Kernel 10 -- difference predictors
+    *******************************************************************
+    *    DO 10  L= 1,Loop
+    *    DO 10  i= 1,n
+    *    AR      =      CX(5,i)
+    *    BR      = AR - PX(5,i)
+    *    PX(5,i) = AR
+    *    CR      = BR - PX(6,i)
+    *    PX(6,i) = BR
+    *    AR      = CR - PX(7,i)
+    *    PX(7,i) = CR
+    *    BR      = AR - PX(8,i)
+    *    PX(8,i) = AR
+    *    CR      = BR - PX(9,i)
+    *    PX(9,i) = BR
+    *    AR      = CR - PX(10,i)
+    *    PX(10,i)= CR
+    *    BR      = AR - PX(11,i)
+    *    PX(11,i)= AR
+    *    CR      = BR - PX(12,i)
+    *    PX(12,i)= BR
+    *    PX(14,i)= CR - PX(13,i)
+    *    PX(13,i)= CR
+    * 10 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(DIFF_PREDICT);
+   // px is updated in place (columns 4..13); cx is only read (column 4).
+   Real_ptr* px = loop_data.array_2D_Nx25_Real[0];
+   Real_ptr* cx = loop_data.array_2D_Nx25_Real[1];
+
+   for (auto _ : state) {
+
+      for (Index_type i = 0; i < state.range(0); ++i) {
+         // ar, br and cr take turns holding the newest difference.
+         Real_type ar = cx[i][4];
+         Real_type br = ar - px[i][4];
+         px[i][4]  = ar;
+         Real_type cr = br - px[i][5];
+         px[i][5]  = br;
+         ar        = cr - px[i][6];
+         px[i][6]  = cr;
+         br        = ar - px[i][7];
+         px[i][7]  = ar;
+         cr        = br - px[i][8];
+         px[i][8]  = br;
+         ar        = cr - px[i][9];
+         px[i][9]  = cr;
+         br        = ar - px[i][10];
+         px[i][10] = ar;
+         cr        = br - px[i][11];
+         px[i][11] = br;
+         px[i][13] = cr - px[i][12];
+         px[i][12] = cr;
+      }
+
+   }
+}
+
+BENCHMARK(BM_DIFF_PREDICT_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIRST_SUM_RAW(benchmark::State& state) {
+   // Raw-loop timing of a running (prefix) sum of y stored into x.
+   /*
+    *******************************************************************
+    *   Kernel 11 -- first sum
+    *******************************************************************
+    *    DO 11 L = 1,Loop
+    *        X(1)= Y(1)
+    *    DO 11 k = 2,n
+    * 11     X(k)= X(k-1) + Y(k)
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIRST_SUM);
+   // x is written, y is only read.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+
+   for (auto _ : state) {
+
+      x[0] = y[0];
+      for (Index_type k = 1; k < state.range(0); ++k) {
+         x[k] = x[k - 1] + y[k];
+      }
+
+   }
+}
+
+BENCHMARK(BM_FIRST_SUM_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIRST_DIFF_RAW(benchmark::State& state) {
+   // Raw-loop timing of the first difference x[k] = y[k+1] - y[k].
+   /*
+    *******************************************************************
+    *   Kernel 12 -- first difference
+    *******************************************************************
+    *    DO 12 L = 1,Loop
+    *    DO 12 k = 1,n
+    * 12     X(k)= Y(k+1) - Y(k)
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIRST_DIFF);
+   // x is written, y is only read (one element past the last k is touched).
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+
+   for (auto _ : state) {
+
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         x[k] = y[k + 1] - y[k];
+      }
+
+   }
+}
+
+BENCHMARK(BM_FIRST_DIFF_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_PIC_2D_RAW(benchmark::State& state) {
+   // Raw-loop timing of the 2-D particle-in-cell fragment.
+   /*
+    *******************************************************************
+    *   Kernel 13 -- 2-D PIC (Particle In Cell)
+    *******************************************************************
+    *    DO  13     L= 1,Loop
+    *    DO  13    ip= 1,n
+    *              i1= P(1,ip)
+    *              j1= P(2,ip)
+    *              i1=        1 + MOD2N(i1,64)
+    *              j1=        1 + MOD2N(j1,64)
+    *         P(3,ip)= P(3,ip)  + B(i1,j1)
+    *         P(4,ip)= P(4,ip)  + C(i1,j1)
+    *         P(1,ip)= P(1,ip)  + P(3,ip)
+    *         P(2,ip)= P(2,ip)  + P(4,ip)
+    *              i2= P(1,ip)
+    *              j2= P(2,ip)
+    *              i2=            MOD2N(i2,64)
+    *              j2=            MOD2N(j2,64)
+    *         P(1,ip)= P(1,ip)  + Y(i2+32)
+    *         P(2,ip)= P(2,ip)  + Z(j2+32)
+    *              i2= i2       + E(i2+32)
+    *              j2= j2       + F(j2+32)
+    *        H(i2,j2)= H(i2,j2) + 1.0
+    * 13 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PIC_2D);
+   // p holds the per-particle values; b and c are looked up by (j1, i1).
+   Real_ptr* p = loop_data.array_2D_Nx25_Real[0];
+   Real_ptr* b = loop_data.array_2D_Nx25_Real[1];
+   Real_ptr* c = loop_data.array_2D_Nx25_Real[2];
+
+   Real_ptr y = loop_data.array_1D_Real[0];
+   Real_ptr z = loop_data.array_1D_Real[1];
+
+   Index_type* e = loop_data.array_1D_Indx[0];
+   Index_type* f = loop_data.array_1D_Indx[1];
+   // h is incremented at the final (j2, i2) position of every particle.
+   Real_ptr* h = loop_data.array_2D_64x64_Real[0];
+
+   for (auto _ : state) {
+
+      for (Index_type ip = 0; ip < state.range(0); ++ip) {
+         // Truncate the first two entries of p[ip] to integers, masked by 64-1.
+         Index_type i1 = static_cast<Index_type>(p[ip][0]);
+         Index_type j1 = static_cast<Index_type>(p[ip][1]);
+         i1 &= (64 - 1);
+         j1 &= (64 - 1);
+         p[ip][2] += b[j1][i1];
+         p[ip][3] += c[j1][i1];
+         p[ip][0] += p[ip][2];
+         p[ip][1] += p[ip][3];
+         Index_type i2 = static_cast<Index_type>(p[ip][0]);
+         Index_type j2 = static_cast<Index_type>(p[ip][1]);
+         i2 &= (64 - 1);
+         j2 &= (64 - 1);
+         p[ip][0] += y[i2 + 32];
+         p[ip][1] += z[j2 + 32];
+         i2 += e[i2 + 32];
+         j2 += f[j2 + 32];
+         h[j2][i2] += 1.0;
+      }
+
+   }
+}
+
+BENCHMARK(BM_PIC_2D_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_PIC_1D_RAW(benchmark::State& state) {
+   // Raw-loop timing of the 1-D particle-in-cell fragment.
+   /*
+    *******************************************************************
+    *   Kernel 14 -- 1-D PIC (Particle In Cell)
+    *******************************************************************
+    *    DO   14   L= 1,Loop
+    *    DO   141  k= 1,n
+    *          VX(k)= 0.0
+    *          XX(k)= 0.0
+    *          IX(k)= INT(  GRD(k))
+    *          XI(k)= REAL( IX(k))
+    *         EX1(k)= EX   ( IX(k))
+    *        DEX1(k)= DEX  ( IX(k))
+    *41  CONTINUE
+    *    DO   142  k= 1,n
+    *          VX(k)= VX(k) + EX1(k) + (XX(k) - XI(k))*DEX1(k)
+    *          XX(k)= XX(k) + VX(k)  + FLX
+    *          IR(k)= XX(k)
+    *          RX(k)= XX(k) - IR(k)
+    *          IR(k)= MOD2N(  IR(k),2048) + 1
+    *          XX(k)= RX(k) + IR(k)
+    *42  CONTINUE
+    *    DO  14    k= 1,n
+    *    RH(IR(k)  )= RH(IR(k)  ) + 1.0 - RX(k)
+    *    RH(IR(k)+1)= RH(IR(k)+1) + RX(k)
+    *14  CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PIC_1D);
+   // Real-valued work arrays used by the three loops below.
+   Real_ptr vx = loop_data.array_1D_Real[0];
+   Real_ptr xx = loop_data.array_1D_Real[1];
+   Real_ptr xi = loop_data.array_1D_Real[2];
+   Real_ptr ex = loop_data.array_1D_Real[3];
+   Real_ptr ex1 = loop_data.array_1D_Real[4];
+   Real_ptr dex = loop_data.array_1D_Real[5];
+   Real_ptr dex1 = loop_data.array_1D_Real[6];
+   Real_ptr rh = loop_data.array_1D_Real[7];
+   Real_ptr rx = loop_data.array_1D_Real[8];
+
+   const Real_type flx = loop_data.scalar_Real[0];
+
+   Index_type* ix = loop_data.array_1D_Indx[2];
+   Index_type* ir = loop_data.array_1D_Indx[3];
+   Index_type* grd = loop_data.array_1D_Indx[4];
+
+   // Each timed iteration runs three consecutive loops over k.
+   for (auto _ : state) {
+      // First loop: reset vx/xx and look up ex/dex at index ix[k]-1.
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         vx[k] = 0.0;
+         xx[k] = 0.0;
+         ix[k] = static_cast<Index_type>(grd[k]);
+         xi[k] = static_cast<Real_type>(ix[k]);
+         ex1[k] = ex[ix[k] - 1];
+         dex1[k] = dex[ix[k] - 1];
+      }
+      // Second loop: advance vx/xx, split xx into ir (integer) and rx (rest).
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         vx[k] = vx[k] + ex1[k] + (xx[k] - xi[k]) * dex1[k];
+         xx[k] = xx[k] + vx[k] + flx;
+         ir[k] = static_cast<Index_type>(xx[k]);
+         rx[k] = xx[k] - ir[k];
+         ir[k] = (ir[k] & (2048 - 1)) + 1;
+         xx[k] = rx[k] + ir[k];
+      }
+      // Third loop: add (1 - rx) and rx into rh at positions ir-1 and ir.
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         rh[ir[k] - 1] += 1.0 - rx[k];
+         rh[ir[k]] += rx[k];
+      }
+
+   }
+}
+
+BENCHMARK(BM_PIC_1D_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_HYDRO_2D_RAW(benchmark::State& state) {
+   // Raw-loop timing of the 2-D explicit hydrodynamics fragment (three sweeps).
+   /*
+    *******************************************************************
+    *   Kernel 18 - 2-D explicit hydrodynamics fragment
+    *******************************************************************
+    *       DO 75  L= 1,Loop
+    *              T= 0.0037
+    *              S= 0.0041
+    *             KN= 6
+    *             JN= n
+    *       DO 70  k= 2,KN
+    *       DO 70  j= 2,JN
+    *        ZA(j,k)= (ZP(j-1,k+1)+ZQ(j-1,k+1)-ZP(j-1,k)-ZQ(j-1,k))
+    *   .            *(ZR(j,k)+ZR(j-1,k))/(ZM(j-1,k)+ZM(j-1,k+1))
+    *        ZB(j,k)= (ZP(j-1,k)+ZQ(j-1,k)-ZP(j,k)-ZQ(j,k))
+    *   .            *(ZR(j,k)+ZR(j,k-1))/(ZM(j,k)+ZM(j-1,k))
+    * 70    CONTINUE
+    *       DO 72  k= 2,KN
+    *       DO 72  j= 2,JN
+    *        ZU(j,k)= ZU(j,k)+S*(ZA(j,k)*(ZZ(j,k)-ZZ(j+1,k))
+    *   .                    -ZA(j-1,k) *(ZZ(j,k)-ZZ(j-1,k))
+    *   .                    -ZB(j,k)   *(ZZ(j,k)-ZZ(j,k-1))
+    *   .                    +ZB(j,k+1) *(ZZ(j,k)-ZZ(j,k+1)))
+    *        ZV(j,k)= ZV(j,k)+S*(ZA(j,k)*(ZR(j,k)-ZR(j+1,k))
+    *   .                    -ZA(j-1,k) *(ZR(j,k)-ZR(j-1,k))
+    *   .                    -ZB(j,k)   *(ZR(j,k)-ZR(j,k-1))
+    *   .                    +ZB(j,k+1) *(ZR(j,k)-ZR(j,k+1)))
+    * 72    CONTINUE
+    *       DO 75  k= 2,KN
+    *       DO 75  j= 2,JN
+    *        ZR(j,k)= ZR(j,k)+T*ZU(j,k)
+    *        ZZ(j,k)= ZZ(j,k)+T*ZV(j,k)
+    * 75    CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(HYDRO_2D);
+
+   Real_ptr* za = loop_data.array_2D_7xN_Real[0];
+   Real_ptr* zb = loop_data.array_2D_7xN_Real[1]; 
+   Real_ptr* zm = loop_data.array_2D_7xN_Real[2];
+   Real_ptr* zp = loop_data.array_2D_7xN_Real[3];
+   Real_ptr* zq = loop_data.array_2D_7xN_Real[4];
+   Real_ptr* zr = loop_data.array_2D_7xN_Real[5];
+   Real_ptr* zu = loop_data.array_2D_7xN_Real[6];
+   Real_ptr* zv = loop_data.array_2D_7xN_Real[7];
+   Real_ptr* zz = loop_data.array_2D_7xN_Real[8];
+   // The third sweep stores into separate arrays instead of updating zr/zz.
+   Real_ptr* zrout = loop_data.array_2D_7xN_Real[9];
+   Real_ptr* zzout = loop_data.array_2D_7xN_Real[10];
+
+   const Real_type t = 0.0037;
+   const Real_type s = 0.0041;
+
+   Index_type kn = 6;
+   Index_type jn = state.range(0);
+   Index_type k;
+
+   for (auto _ : state) {
+      // Sweep 1: compute za and zb from zp, zq, zr and zm.
+      for ( k=1 ; k<kn ; k++ ) {
+         for (Index_type j=1 ; j<jn ; j++ ) {
+            za[k][j] = ( zp[k+1][j-1] +zq[k+1][j-1] -zp[k][j-1] -zq[k][j-1] )*
+                       ( zr[k][j] +zr[k][j-1] ) / ( zm[k][j-1] +zm[k+1][j-1]);
+            zb[k][j] = ( zp[k][j-1] +zq[k][j-1] -zp[k][j] -zq[k][j] ) *
+                       ( zr[k][j] +zr[k-1][j] ) / ( zm[k][j] +zm[k][j-1]);
+         }
+      }
+      // Sweep 2: accumulate into zu and zv using za, zb and neighbours of zz/zr.
+      for ( k=1 ; k<kn ; k++ ) {
+         for (Index_type j=1 ; j<jn ; j++ ) {
+            zu[k][j] += s*( za[k][j]   *( zz[k][j] - zz[k][j+1] ) -
+                            za[k][j-1] *( zz[k][j] - zz[k][j-1] ) -
+                            zb[k][j]   *( zz[k][j] - zz[k-1][j] ) +
+                            zb[k+1][j] *( zz[k][j] - zz[k+1][j] ) );
+            zv[k][j] += s*( za[k][j]   *( zr[k][j] - zr[k][j+1] ) -
+                            za[k][j-1] *( zr[k][j] - zr[k][j-1] ) -
+                            zb[k][j]   *( zr[k][j] - zr[k-1][j] ) +
+                            zb[k+1][j] *( zr[k][j] - zr[k+1][j] ) );
+         }
+      }
+      // Sweep 3: write zr + t*zu and zz + t*zv to zrout and zzout.
+      for ( k=1 ; k<kn ; k++ ) {
+         for (Index_type j=1 ; j<jn ; j++ ) {
+            zrout[k][j] = zr[k][j] + t*zu[k][j];
+            zzout[k][j] = zz[k][j] + t*zv[k][j];
+         }
+      }
+
+   }
+}
+
+BENCHMARK(BM_HYDRO_2D_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_GEN_LIN_RECUR_RAW(benchmark::State& state) {
+   // Raw-loop timing of the linear recurrence: a forward then a backward sweep.
+   /*
+    *******************************************************************
+    *   Kernel 19 -- general linear recurrence equations
+    *******************************************************************
+    *               KB5I= 0
+    *           DO 194 L= 1,Loop
+    *           DO 191 k= 1,n
+    *         B5(k+KB5I)= SA(k) +STB5*SB(k)
+    *               STB5= B5(k+KB5I) -STB5
+    *191        CONTINUE
+    *192        DO 193 i= 1,n
+    *                  k= n-i+1
+    *         B5(k+KB5I)= SA(k) +STB5*SB(k)
+    *               STB5= B5(k+KB5I) -STB5
+    *193        CONTINUE
+    *194 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(GEN_LIN_RECUR);
+   // b5 is written; sa and sb are only read.
+   Real_ptr b5 = loop_data.array_1D_Real[0];
+   Real_ptr sa = loop_data.array_1D_Real[1];
+   Real_ptr sb = loop_data.array_1D_Real[2];
+   // stb5 is carried across both sweeps and across timed iterations.
+   Real_type stb5 = loop_data.scalar_Real[0];
+
+   const Index_type kb5i = 0;
+
+   for (auto _ : state) {
+      // Forward sweep over k = 0 .. state.range(0)-1.
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         b5[k + kb5i] = sa[k] + stb5 * sb[k];
+         stb5 = b5[k + kb5i] - stb5;
+      }
+      // Backward sweep: k runs from state.range(0)-1 down to 0.
+      for (Index_type i = 1; i <= state.range(0); ++i) {
+         const Index_type k = state.range(0) - i;
+         b5[k + kb5i] = sa[k] + stb5 * sb[k];
+         stb5 = b5[k + kb5i] - stb5;
+      }
+
+   }
+}
+
+BENCHMARK(BM_GEN_LIN_RECUR_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_DISC_ORD_RAW(benchmark::State& state) {
+   // Raw-loop timing of the discrete ordinates transport fragment.
+   /*
+    *******************************************************************
+    *   Kernel 20 -- Discrete ordinates transport, cond recurrence on xx
+    *******************************************************************
+    *    DO 20 L= 1,Loop
+    *    DO 20 k= 1,n
+    *         DI= Y(k)-G(k)/( XX(k)+DK)
+    *         DN= 0.2
+    *         IF( DI.NE.0.0) DN= MAX( S,MIN( Z(k)/DI, T))
+    *       X(k)= ((W(k)+V(k)*DN)* XX(k)+U(k))/(VX(k)+V(k)*DN)
+    *    XX(k+1)= (X(k)- XX(k))*DN+ XX(k)
+    * 20 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(DISC_ORD);
+   // x and xx are written; the remaining arrays are only read.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr z = loop_data.array_1D_Real[2];
+   Real_ptr u = loop_data.array_1D_Real[3];
+   Real_ptr v = loop_data.array_1D_Real[4];
+   Real_ptr w = loop_data.array_1D_Real[5];
+   Real_ptr g = loop_data.array_1D_Real[6];
+   Real_ptr xx = loop_data.array_1D_Real[7];
+   Real_ptr vx = loop_data.array_1D_Real[9];
+   const Real_type s  = loop_data.scalar_Real[0];
+   const Real_type t  = loop_data.scalar_Real[1];
+   const Real_type dk = loop_data.scalar_Real[2];
+   // xx[k+1] written in iteration k is read again in iteration k+1.
+   for (auto _ : state) {
+
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         const Real_type di = y[k] - g[k] / (xx[k] + dk);
+         Real_type dn = 0.2;
+         if (di != 0.0) {
+            dn = z[k] / di;
+            if (t < dn) dn = t;
+            if (s > dn) dn = s;
+         }
+         x[k] = ((w[k] + v[k]*dn) * xx[k] + u[k]) / (vx[k] + v[k]*dn);
+         xx[k+1] = (x[k] - xx[k]) * dn + xx[k];
+      }
+
+   }
+}
+
+BENCHMARK(BM_DISC_ORD_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_MAT_X_MAT_RAW(benchmark::State& state) {
+   // Raw-loop timing of the matrix*matrix product accumulation into px.
+   /*
+    *******************************************************************
+    *   Kernel 21 -- matrix*matrix product
+    *******************************************************************
+    *    DO 21 L= 1,Loop
+    *    DO 21 k= 1,25
+    *    DO 21 i= 1,25
+    *    DO 21 j= 1,n
+    *    PX(i,j)= PX(i,j) +VY(i,k) * CX(k,j)
+    * 21 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(MAT_X_MAT);
+   // px accumulates the products; cx and vy are only read.
+   Real_ptr* px = loop_data.array_2D_Nx25_Real[0];
+   Real_ptr* cx = loop_data.array_2D_Nx25_Real[1];
+   Real_ptr* vy = loop_data.array_2D_64x64_Real[0];
+
+   // Loop order is k, i, j; the j loop of state.range(0) trips is innermost.
+
+   for (auto _ : state) {
+
+      for (Index_type k = 0; k < 25; ++k) {
+         for (Index_type i = 0; i < 25; ++i) {
+            for (Index_type j = 0; j < state.range(0); ++j) {
+               px[j][i] += vy[k][i] * cx[j][k];
+            }
+         }
+      }
+
+   }
+}
+
+BENCHMARK(BM_MAT_X_MAT_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_PLANCKIAN_RAW(benchmark::State& state) {
+   // Raw-loop timing of the Planckian distribution fragment.
+   /*
+    *******************************************************************
+    *   Kernel 22 -- Planckian distribution
+    *******************************************************************
+    *     EXPMAX= 20.0
+    *       U(n)= 0.99*EXPMAX*V(n)
+    *    DO 22 L= 1,Loop
+    *    DO 22 k= 1,n
+    *                                          Y(k)= U(k)/V(k)
+    *       W(k)= X(k)/( EXP( Y(k)) -1.0)
+    * 22 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(PLANCKIAN);
+   // y and w are written inside the timed loop; x, u and v are read there.
+   Real_ptr x = loop_data.array_1D_Real[0];
+   Real_ptr y = loop_data.array_1D_Real[1];
+   Real_ptr u = loop_data.array_1D_Real[2];
+   Real_ptr v = loop_data.array_1D_Real[3];
+   Real_ptr w = loop_data.array_1D_Real[4];
+   // Seed the last element of u from v before timing starts.
+   const Real_type expmax = 20.0;
+   u[state.range(0) - 1] = 0.99 * expmax * v[state.range(0) - 1];
+
+   for (auto _ : state) {
+
+      for (Index_type k = 0; k < state.range(0); ++k) {
+         y[k] = u[k] / v[k];
+         w[k] = x[k] / (exp(y[k]) - 1.0);
+      }
+
+   }
+}
+
+BENCHMARK(BM_PLANCKIAN_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_IMP_HYDRO_2D_RAW(benchmark::State& state) {
+   // Raw-loop timing of the 2-D implicit hydrodynamics stencil on za.
+   /*
+    *******************************************************************
+    *   Kernel 23 -- 2-D implicit hydrodynamics fragment
+    *******************************************************************
+    *    DO 23  L= 1,Loop
+    *    DO 23  j= 2,6
+    *    DO 23  k= 2,n
+    *          QA= ZA(k,j+1)*ZR(k,j) +ZA(k,j-1)*ZB(k,j) +
+    *   .          ZA(k+1,j)*ZU(k,j) +ZA(k-1,j)*ZV(k,j) +ZZ(k,j)
+    * 23  ZA(k,j)= ZA(k,j) +.175*(QA -ZA(k,j))
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(IMP_HYDRO_2D);
+   // za is updated in place; the other five arrays are only read.
+   Real_ptr* za = loop_data.array_2D_7xN_Real[0];
+   Real_ptr* zb = loop_data.array_2D_7xN_Real[1];
+   Real_ptr* zr = loop_data.array_2D_7xN_Real[2];
+   Real_ptr* zu = loop_data.array_2D_7xN_Real[3];
+   Real_ptr* zv = loop_data.array_2D_7xN_Real[4];
+   Real_ptr* zz = loop_data.array_2D_7xN_Real[5];
+
+   // The row index j covers 1..5; the stencil also reads rows j-1 and j+1.
+
+   for (auto _ : state) {
+
+      for (Index_type j = 1; j < 6; ++j) {
+         for (Index_type k = 1; k < state.range(0); ++k) {
+            const Real_type qa = za[j+1][k]*zr[j][k] + za[j-1][k]*zb[j][k] +
+                 za[j][k+1]*zu[j][k] + za[j][k-1]*zv[j][k] + zz[j][k];
+            za[j][k] += 0.175 * (qa - za[j][k]);
+         }
+      }
+
+   }
+}
+
+BENCHMARK(BM_IMP_HYDRO_2D_RAW)
+   ->Arg(171)->Arg(5001)->Arg(44217)->Unit(benchmark::kMicrosecond);
+
+static void BM_FIND_FIRST_MIN_RAW(benchmark::State& state) {
+   // Raw-loop search of x[0 .. state.range(0)) for the index of its first minimum.
+   /*
+    *******************************************************************
+    *   Kernel 24 -- find location of first minimum in array
+    *******************************************************************
+    *     X( n/2)= -1.0E+10
+    *    DO 24  L= 1,Loop
+    *           m= 1
+    *    DO 24  k= 2,n
+    *          IF( X(k).LT.X(m))  m= k
+    * 24 CONTINUE
+    */
+
+   LoopData& loop_data = getLoopData();
+
+   loopInit(FIND_FIRST_MIN);
+
+   Real_ptr x = loop_data.array_1D_Real[0];
+
+   // m holds the index of the smallest element seen so far.
+   Index_type m = 0;
+
+   for (auto _ : state) {
+      // DoNotOptimize below marks each updated index as used by the benchmark.
+      m = 0;
+      for (Index_type  k=1 ; k< state.range(0) ; k++ ) {
+         if ( x[k] < x[m] ) benchmark::DoNotOptimize(m = k);
+      }
+
+   }
+}
+
+BENCHMARK(BM_FIND_FIRST_MIN_RAW)->Arg(171)->Arg(5001)->
+                         Arg(44217)->Unit(benchmark::kMicrosecond);
Index: MicroBenchmarks/LCALS/SubsetCRawLoops/lit.local.cfg
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetCRawLoops/lit.local.cfg
@@ -0,0 +1,7 @@
+test_modules = config.test_modules
+if 'run' in test_modules:
+    # Insert microbenchmark module behind 'run'
+    test_modules.insert(test_modules.index('run')+1, 'microbenchmark')
+    # Timeit results are not useful for microbenchmarks
+    if 'timeit' in test_modules:
+        test_modules.remove('timeit')
Index: MicroBenchmarks/LCALS/SubsetDataA.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetDataA.hxx
@@ -0,0 +1,168 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file defining macros, routines, structures used in Loop Subset A.
+//
+
+#ifndef SubsetDataA_HXX
+#define SubsetDataA_HXX
+
+// Some macros used in kernels to mimic real code usage.
+// NDPTRSET: v0..v7 = v plus each combination of 0/1, domain.jp, domain.kp;
+// a variable named 'domain' must be in scope where the macro is expanded.
+#define NDPTRSET(v,v0,v1,v2,v3,v4,v5,v6,v7)  \
+   v0 = v ;   \
+   v1 = v0 + 1 ;  \
+   v2 = v0 + domain.jp ; \
+   v3 = v1 + domain.jp ; \
+   v4 = v0 + domain.kp ; \
+   v5 = v1 + domain.kp ; \
+   v6 = v2 + domain.kp ; \
+   v7 = v3 + domain.kp ;
+// NDSET2D: v4 = v, v1 = v4 + 1, v2 = v1 + domain.jp, v3 = v4 + domain.jp.
+#define NDSET2D(v,v1,v2,v3,v4)  \
+   v4 = v ;   \
+   v1 = v4 + 1 ;  \
+   v2 = v1 + domain.jp ;  \
+   v3 = v4 + domain.jp ;
+// zabs2(z): real(z)^2 + imag(z)^2; the argument text is expanded four times.
+#define zabs2(z)    ( real(z)*real(z)+imag(z)*imag(z) )
+
+
+//
+// Domain structure to mimic structured mesh loops in real codes.
+//
+// ADomain: index extents, strides (jp, kp) and the list of real-zone
+// indices for a 2D or 3D domain.  It owns real_zones but still has the
+// implicit copy operations, so a copy would delete the array twice.
+struct ADomain
+{
+   // ilen picks the base extent (LONG, MEDIUM or SHORT); ndims is 2 or 3.
+   // An unrecognized ilen leaves the base extent at 0 and an unrecognized
+   // ndims leaves an empty domain, rather than using indeterminate values.
+   ADomain( int ilen, Index_type ndims )
+      : ndims(ndims), NPNL(2), NPNR(1),
+        kmin(0), kmax(0), kp(0), nnalls(0), real_zones(0), n_real_zones(0)
+   {
+      Index_type rzmax = 0;
+      switch ( ilen ) {
+         case LONG : {
+            if ( ndims == 2 ) {
+               rzmax = 156 * loop_length_factor;
+            } else if ( ndims == 3 ) {
+               rzmax = 28 * loop_length_factor;
+            }
+            break;
+         }
+         case MEDIUM : {
+            if ( ndims == 2 ) {
+               rzmax = 64 * loop_length_factor;
+            } else if ( ndims == 3 ) {
+               rzmax = 16 * loop_length_factor;
+            }
+            break;
+         }
+         case SHORT : {
+            if ( ndims == 2 ) {
+               rzmax = 8 * loop_length_factor;
+            } else if ( ndims == 3 ) {
+               rzmax = 4 * loop_length_factor;
+            }
+            break;
+         }
+         default : { }
+      }
+
+      imin = NPNL;
+      jmin = NPNL;
+      imax = rzmax + NPNR;
+      jmax = rzmax + NPNR;
+      jp = imax - imin + 1 + NPNL + NPNR;
+
+      // kmin, kmax and kp keep their zero initial values in 2D.
+      if ( ndims == 2 ) {
+         nnalls = jp * (jmax - jmin + 1 + NPNL + NPNR) ;
+      } else if ( ndims == 3 ) {
+         kmin = NPNL;
+         kmax = rzmax + NPNR;
+         kp = jp * (jmax - jmin + 1 + NPNL + NPNR);
+         nnalls = kp * (kmax - kmin + 1 + NPNL + NPNR) ;
+      }
+
+      fpn = 0;
+      lpn = nnalls - 1;
+      frn = fpn + NPNL * (kp + jp) + NPNL;
+      lrn = lpn - NPNR * (kp + jp) - NPNR;
+
+      fpz = frn - jp - kp - 1;
+      lpz = lrn;
+
+      // Every slot starts at -1; the loops below fill the first
+      // n_real_zones slots with the flattened indices of the real zones.
+      real_zones = new Index_type[nnalls];
+      for (Index_type i = 0; i < nnalls; ++i) real_zones[i] = -1;
+
+      if ( ndims == 2 ) {
+
+         for (Index_type j = jmin; j < jmax; j++) {
+            for (Index_type i = imin; i < imax; i++) {
+               real_zones[n_real_zones++] = i + j*jp ;
+            }
+         }
+
+      } else if ( ndims == 3 ) {
+
+         for (Index_type k = kmin; k < kmax; k++) {
+            for (Index_type j = jmin; j < jmax; j++) {
+               for (Index_type i = imin; i < imax; i++) {
+                  real_zones[n_real_zones++] = i + j*jp + kp*k ;
+               }
+            }
+         }
+
+      }
+
+   }
+
+   ~ADomain()
+   {
+      delete [] real_zones;   // deleting a null pointer is a no-op
+   }
+
+   // Multiplies the per-size base extents; declared here, defined elsewhere.
+   static double loop_length_factor;
+
+   Index_type ndims;
+   Index_type NPNL;
+   Index_type NPNR;
+
+   // Per-axis index bounds; the constructor's zone loops cover [min, max).
+   Index_type imin;
+   Index_type jmin;
+   Index_type kmin;
+   Index_type imax;
+   Index_type jmax;
+   Index_type kmax;
+
+   // Strides for j and k in a flattened index, and the length of real_zones.
+   Index_type jp;
+   Index_type kp;
+   Index_type nnalls;
+
+   Index_type fpn;
+   Index_type lpn;
+   Index_type frn;
+   Index_type lrn;
+
+   Index_type fpz;
+   Index_type lpz;
+
+   // Flattened real-zone indices (unused slots hold -1) and their count.
+   Index_type* real_zones;
+   Index_type  n_real_zones;
+};
+
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/SubsetDataB.hxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/SubsetDataB.hxx
@@ -0,0 +1,30 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Header file defining macros, routines, structures used in Loop Subset B.
+//
+
+#ifndef SubsetDataB_HXX
+#define SubsetDataB_HXX
+
+namespace {
+
+//
+// Helper for the TRAP_INT loop: returns the reciprocal of the Euclidean
+// distance between the points (x, y) and (xp, yp).
+//
+LCALS_INLINE
+Real_type trap_int_func(Real_type x, Real_type y,
+                        Real_type xp, Real_type yp)
+{
+   Real_type dist_sq = (x - xp)*(x - xp) + (y - yp)*(y - yp);
+   Real_type inv_dist = 1.0/sqrt(dist_sq);
+
+   return inv_dist;
+}
+
+}  // closing brace for unnamed namespace
+
+#endif  // closing endif for header file include guard
Index: MicroBenchmarks/LCALS/main.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/main.cxx
@@ -0,0 +1,407 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+// This code is under continuing development.  Go to http://codesign.llnl.gov
+// to acquire the latest released version.
+//
+
+//
+// Main program for LCALS suite.
+//
+
+#include <cstdlib>
+
+#include<string>
+#include<iostream>
+#include<algorithm>
+
+#include <unistd.h>
+
+#include "LCALSSuite.hxx"
+#include <benchmark/benchmark.h>
+
+// Sets up the LCALS suite run info and loop data, then runs the registered
+// Google Benchmark cases (plus the legacy suite driver under TESTSUITE).
+// Returns 0 normally, 1 on unrecognized benchmark args, -1 on bad usage.
+int main(int argc, char *argv[])
+{
+   //
+   // Define some variables used to define part of suite execution.
+   //
+   bool do_fom = true;
+   bool run_misc = false;
+   bool input_error = false;
+   std::string output_dirname;
+
+   //
+   // Process command line args and report correct usage if necessary.
+   //
+   // if (argc == 1) no args to check...
+#ifdef TESTSUITE
+   if (argc > 1) {
+      std::string arg = argv[1];
+      if ( arg == "-misc" ) {
+         run_misc = true;
+      } else {
+         output_dirname = argv[1];
+      }
+   }
+
+   if (argc > 2) {
+      if ( !run_misc ) {
+         input_error = true;
+      } else {
+         output_dirname = argv[2];
+      }
+   }
+
+   if ( argc > 3) {
+      input_error = true;
+   }
+
+   if ( !input_error ) {
+
+      if ( !output_dirname.empty() && !recursiveMkdir(output_dirname) ) {
+        std::cout << "Problem with given output directory name." << std::endl;
+        std::cout << "No file output will be generated." << std::endl;
+      }
+
+   } else {
+      std::cout << "ERROR RUNNING EXECUTABLE!\n\n";
+      std::cout << "CORRECT USAGE:\n";
+      std::cout << "\t" << argv[0]
+                << " -misc <output directory name>, both args optional\n\n"
+                << "\tIf '-misc' option is given, "
+                << "benchmark variants plus others may be run.\n"
+                << "\tActual loop variants to run are set below using the\n"
+                << "\tvector 'run_variants'. Note that the compiler switch\n"
+                << "\tin the Makefile may be required for full compilation.\n\n"
+                << "\tWhen no output directory is provided,\n"
+                << "\trun summary will be printed to standard output\n"
+                << "\tIf directory name is provided, execution summary and\n"
+                << "\ttext files suitable for importing into MS Excel will\n"
+                << "\tbe written there." << std::endl;
+      return -1;
+   }
+#endif
+
+   //
+   // Define some parameters specifying how suite of loops will execute.
+   //
+   // See README-LCALS_instructions.txt file for additional description of how
+   // to control compilation and execution of loop suite.
+   //
+   unsigned num_suite_passes = 1;
+#if defined(LCALS_VERIFY_CHECKSUM_ABBREVIATED)
+   //
+   // When verifying checksums, we only take one pass through the suite of loops
+   // as this is sufficient.
+   //
+   num_suite_passes = num_checksum_suite_passes;
+#endif
+
+   //
+   // Specify fraction of pre-defined loop sample counts to use.
+   // Smaller value reduces total run time. However, a value too
+   // small will result in inaccurate timings.
+   //
+   double sample_frac = 1.0;
+
+   //
+   // Specify multiplication factor used to deviate from pre-defined loop
+   // lengths to use.  For example, setting factor to 'a' will roughly
+   // multiply the length of "1D" loops by a and will multiply total number
+   // of iterations of "domain-based" loops by a^N, where N is the
+   // spatial dimension of the domain used by the loop.
+   //
+   double loop_length_factor = 1.0;
+
+   //
+   //  Specify which loops lengths to run by true/false
+   //  value in 'run_loop_length' array.
+   //
+   bool run_loop_length[NUM_LENGTHS];
+   run_loop_length[LONG] = true;
+   run_loop_length[MEDIUM] = true;
+   run_loop_length[SHORT] = true;
+
+
+   //
+   //  Specify loop kernels to run by true/false value in 'run_loop' array.
+   //
+   //  NOTE: If COMPILE_* macro constant associated with each kernel
+   //        is not defined, then those kernels will not be compiled
+   //        and thus will not be run.
+   //
+   bool run_loop[NUM_LOOP_KERNELS];
+   for (unsigned iloop = 0; iloop < NUM_LOOP_KERNELS; ++iloop) {
+      run_loop[iloop] = false;
+   }
+
+#if defined(LCALS_DO_OMP_ONLY)
+
+   // Loop Subset A: Loops extracted from LLNL app codes.
+   run_loop[PRESSURE_CALC ] = true;
+   run_loop[PRESSURE_CALC_ALT ] = true;
+   run_loop[ENERGY_CALC   ] = true;
+   run_loop[ENERGY_CALC_ALT   ] = true;
+   run_loop[VOL3D_CALC    ] = true;
+   run_loop[DEL_DOT_VEC_2D] = true;
+   run_loop[COUPLE        ] = true;
+   run_loop[FIR           ] = true;
+
+   // Loop Subset B: "Basic" Loops.
+   run_loop[INIT3         ] = true;
+   run_loop[MULADDSUB     ] = true;
+   run_loop[IF_QUAD       ] = true;
+   run_loop[TRAP_INT      ] = true;
+
+   // Loop Subset C: Loops from older Livermore Loops in "C" suite.
+   run_loop[PIC_2D        ] = true;
+
+#else  // else run all loop kernels
+
+   // Loop Subset A: Loops extracted from LLNL app codes.
+   run_loop[PRESSURE_CALC ] = true;
+   run_loop[ENERGY_CALC   ] = true;
+   run_loop[VOL3D_CALC    ] = true;
+   run_loop[DEL_DOT_VEC_2D] = true;
+   run_loop[COUPLE        ] = true;
+   run_loop[FIR           ] = true;
+
+   // Loop Subset B: "Basic" Loops.
+   run_loop[INIT3         ] = true;
+   run_loop[MULADDSUB     ] = true;
+   run_loop[IF_QUAD       ] = true;
+   run_loop[TRAP_INT      ] = true;
+
+   // Loop Subset C: Loops from older Livermore Loops in "C" suite.
+   run_loop[HYDRO_1D      ] = true;
+   run_loop[ICCG          ] = true;
+   run_loop[INNER_PROD    ] = true;
+   run_loop[BAND_LIN_EQ   ] = true;
+   run_loop[TRIDIAG_ELIM  ] = true;
+   run_loop[EOS           ] = true;
+   run_loop[ADI           ] = true;
+   run_loop[INT_PREDICT   ] = true;
+   run_loop[DIFF_PREDICT  ] = true;
+   run_loop[FIRST_SUM     ] = true;
+   run_loop[FIRST_DIFF    ] = true;
+   run_loop[PIC_2D        ] = true;
+   run_loop[PIC_1D        ] = true;
+   run_loop[HYDRO_2D      ] = true;
+   run_loop[GEN_LIN_RECUR ] = true;
+   run_loop[DISC_ORD      ] = true;
+   run_loop[MAT_X_MAT     ] = true;
+   run_loop[PLANCKIAN     ] = true;
+   run_loop[IMP_HYDRO_2D  ] = true;
+   run_loop[FIND_FIRST_MIN] = true;
+
+#endif
+
+
+   //
+   // Specify which loop variants are executed. To run different loop variants,
+   // change which enum values are pushed onto the run-variants vector here.
+   //
+   // IMPORTANT: The first variant added is used as the reference
+   //            variant for reporting relative execution timing data
+   //            and checksum comparisons.
+   //
+   std::vector<LoopVariantID> run_variants;
+   if ( !run_misc ) {
+      //
+      // These variants comprise the LCALS benchmark.
+      //
+
+#if defined(LCALS_DO_OMP_ONLY)
+
+      run_variants.push_back(RAW_OMP);
+      run_variants.push_back(FORALL_LAMBDA_OMP);
+
+#else  // run other variants in addition to OMP variants
+
+      run_variants.push_back(RAW);
+      run_variants.push_back(FORALL_LAMBDA);
+      run_variants.push_back(RAW_OMP);
+      run_variants.push_back(FORALL_LAMBDA_OMP);
+
+#endif
+
+   } else {
+      //
+      // These variants are used for miscellaneous studies.
+      //
+
+#if defined(LCALS_DO_OMP_ONLY)
+
+      run_variants.push_back(RAW_OMP);
+      run_variants.push_back(FORALL_LAMBDA_OMP);
+#if defined(LCALS_DO_MISC)
+      run_variants.push_back(FORALL_FUNCTOR_OMP);
+//    run_variants.push_back(FORALL_LAMBDA_OMP_TYPEFIX);
+#endif // if LCALS_DO_MISC
+
+
+#else  // run other variants in addition to OMP variants
+
+      //
+      // Benchmark variants.
+      //
+      run_variants.push_back(RAW);
+      run_variants.push_back(FORALL_LAMBDA);
+//    run_variants.push_back(RAW_OMP);
+//    run_variants.push_back(FORALL_LAMBDA_OMP);
+
+      //
+      // Other available loop variants.
+      //
+#if defined(LCALS_DO_MISC)
+//    run_variants.push_back(FORALL_HYBRID_LAMBDA);
+
+//    run_variants.push_back(FORALL_FUNCTOR);
+//    run_variants.push_back(FORALL_FUNCTOR_OMP);
+
+//    run_variants.push_back(RAW_FUNC);
+
+//    run_variants.push_back(FORALL_LAMBDA_TYPEFIX);
+//    run_variants.push_back(FORALL_LAMBDA_OMP_TYPEFIX);
+//    run_variants.push_back(FORALL_HYBRID_LAMBDA_TYPEFIX);
+#endif // if LCALS_DO_MISC
+
+#endif
+
+   }
+
+
+   //
+   // Obtain and report hostname.  host is zero-filled and its last byte is
+   // withheld: a truncated name need not come back null-terminated.
+   const int host_namelen = 64;
+   char host[host_namelen] = {0};
+   gethostname( host, host_namelen - 1 );
+   std::string host_name(host);
+
+#ifdef TESTSUITE
+   std::cout << "\n Running loop suite on " << host_name << std::endl;
+#endif
+   //
+   // Specify size in bytes of largest data cache level on machine so that
+   // caches can be properly flushed between execution of different loops.
+   //
+
+   CacheIndex_type cache_size = 0;
+   if ( host_name.find("rzalastor") != std::string::npos ) {
+      cache_size = 12000000;  // 12MB on rzalastor
+   } else if ( host_name.find("rzmerl") != std::string::npos ) {
+      cache_size = 20000000;  // 20MB on rzmerl
+   } else if ( host_name.find("dawn") != std::string::npos ) {
+      cache_size = 8000000;   // 8MB on dawn/rzdawndev
+   } else if ( host_name.find("rzuseq") != std::string::npos ||
+               host_name.find("vulcan") != std::string::npos ||
+               host_name.find("sequoia") != std::string::npos ) {
+      cache_size = 32000000;  // 32MB on BG/Q
+   }
+#ifdef TESTSUITE
+   else {
+      std::cout << "\n WARNING: unknown system cache size. "
+                << "Timing results may be suspect!!" << std::endl;
+   }
+#endif
+
+
+   //
+   // Allocate data for running loops and generating execution timings.
+   // Also, set structures that define how loops will be run.
+   //
+   allocateLoopSuiteRunInfo(host_name,
+                            NUM_LOOP_KERNELS,
+                            NUM_LENGTHS,
+                            num_suite_passes,
+                            run_loop_length,
+                            cache_size);
+
+
+   defineLoopSuiteRunInfo( run_variants, run_loop, sample_frac,
+                                                   loop_length_factor );
+
+   allocateLoopData();
+
+
+   if (do_fom) {
+      //
+      // Compute reference times for figure of merit (FOM) calculation.
+      //
+      computeReferenceLoopTimes();
+   }
+
+   /*************** TEST SUITE ****************
+    *                                         *
+    *  Using google benchmark as test Runner  *
+    *                                         *
+    *******************************************/
+
+
+   ::benchmark::Initialize(&argc, argv);
+   if(::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
+   ::benchmark::RunSpecifiedBenchmarks();
+
+
+#ifdef TESTSUITE
+   // Run loops, record timings, etc.
+   //
+   for (unsigned ipass = 0; ipass < num_suite_passes; ++ipass) {
+      std::cout << "\n run suite: pass = " << ipass << std::endl;
+
+      for (unsigned ivariant = 0; ivariant < run_variants.size(); ++ivariant) {
+
+         std::string loop_variant_name =
+            getVariantName(run_variants[ivariant]);
+
+         std::cout << "\t run loop variant ---> "
+                   << loop_variant_name << std::endl;
+
+         for (unsigned ilen = 0; ilen < NUM_LENGTHS; ++ilen) {
+
+            if (run_loop_length[ilen]) {
+
+               LoopLength rilen = static_cast<LoopLength>(ilen);
+
+               runLoopVariant(run_variants[ivariant], run_loop, rilen) ;
+
+            }  // if loop length is run
+
+         } // iterate over loop lengths
+
+      }  // iterate over loop variants
+
+   }  // iterate over loop suite passes
+#endif
+
+#ifdef TESTSUITE
+   //
+   //  Generate report(s).
+   //
+   std::cout << "\n generate reports...." << std::endl;
+
+   std::vector<std::string> run_variant_names = getVariantNames(run_variants);
+
+   generateTimingReport(run_variant_names, output_dirname);
+   generateChecksumReport(run_variant_names, output_dirname);
+   generateFOMReport(run_variant_names, output_dirname);
+#endif
+
+   //
+   //  Clean up.
+   //
+   freeLoopData();
+#ifdef TESTSUITE
+   std::cout << "\n freeLoopSuiteRunInfo..." << std::endl;
+#endif
+   freeLoopSuiteRunInfo();
+#ifdef TESTSUITE
+   std::cout << "\n DONE!!! " << std::endl;
+#endif
+   return 0 ;
+}
+
Index: MicroBenchmarks/LCALS/runReferenceLoops.cxx
===================================================================
--- /dev/null
+++ MicroBenchmarks/LCALS/runReferenceLoops.cxx
@@ -0,0 +1,172 @@
+//
+// See README-LCALS_license.txt for access and distribution restrictions
+//
+
+//
+// Source file with routines to generate reference loop times for
+// figure of merit (FOM) calculations.
+//
+
+#include "LCALSSuite.hxx"
+#include "LCALSStats.hxx"
+
+#include<string>
+#include<iostream>
+
+//
+// Prototypes for file scope routines containing reference loops
+//
+
+namespace {
+// Both routines are defined in the unnamed namespace at the end of this file.
+void runReferenceLoop0(LoopStat& lstat, unsigned ilen);
+void runReferenceLoop1(LoopStat& lstat, unsigned ilen);
+
+}  // closing brace for unnamed namespace
+
+
+//
+// Define reference loop information.
+//
+// Note: That this may need to be tweaked in the future.
+//
+void defineReferenceLoopRunInfo()
+{
+   // Replace the suite's reference stat with one built from NUM_LENGTHS.
+   LoopStat& ref = getLoopSuiteRunInfo().ref_loop_stat;
+   ref = LoopStat(NUM_LENGTHS);
+
+   // Loop length and samples per pass, grouped per length category.
+   ref.loop_length[LONG]        = 24336;
+   ref.loop_length[MEDIUM]      = 3844;
+   ref.loop_length[SHORT]       = 64;
+   ref.samples_per_pass[LONG]   = 30000;
+   ref.samples_per_pass[MEDIUM] = 300000;
+   ref.samples_per_pass[SHORT]  = 50000000;
+}
+
+
+//
+// Execute reference loops. The intent is to generate a time for
+// fast loops that any compiler should be able to optimize well.
+// We run two reference loops and take the min execution time.
+// This time is used as a reference against which to compare the
+// execution times of other loops for figure of merit computation.
+//
+// Note: That this may need to be tweaked in the future.
+//
+void computeReferenceLoopTimes()
+{
+#ifdef TESTSUITE
+   std::cout << "\n computeReferenceLoopTimes..." << std::endl;
+#endif
+   LoopSuiteRunInfo& run_info = getLoopSuiteRunInfo();
+   LoopStat& reference = run_info.ref_loop_stat;
+
+   // Run reference loop 0 at every length on a copy of the reference stat.
+   LoopStat  stat0(run_info.num_loop_lengths);
+   stat0 = reference;
+   for (unsigned len_id = 0; len_id < NUM_LENGTHS; ++len_id) {
+      runReferenceLoop0(stat0, len_id);
+   }
+   // Do the same with reference loop 1.
+   LoopStat  stat1(run_info.num_loop_lengths);
+   stat1 = reference;
+   for (unsigned len_id = 0; len_id < NUM_LENGTHS; ++len_id) {
+      runReferenceLoop1(stat1, len_id);
+   }
+   // For each length, record the smaller of the two element-0 run times.
+   for (unsigned len_id = 0; len_id < NUM_LENGTHS; ++len_id) {
+      reference.loop_run_time[len_id].push_back(
+         std::min(stat0.loop_run_time[len_id][0],
+                  stat1.loop_run_time[len_id][0]) );
+#if 0 // Just for checking...
+      std::cout << "\t len : " << len_id << " rloop0 time = "
+                << stat0.loop_run_time[len_id][0] << std::endl;
+      std::cout << "\t len : " << len_id << " rloop1 time = "
+                << stat1.loop_run_time[len_id][0] << std::endl;
+      std::cout << "\t ref len, time = " << len_id << " , "
+                << reference.loop_run_time[len_id][0] << std::endl;
+#endif
+   }
+}
+
+
+//
+// Definitions of file scope reference loop routines
+//
+
+namespace {
+
+//
+// Element-wise vector product, timed over samples_per_pass[ilen] passes.
+//
+void runReferenceLoop0(LoopStat& lstat, unsigned ilen)
+{
+   LoopData& loop_data = getLoopData();
+
+   Index_type len = lstat.loop_length[ilen];
+   int num_samples = lstat.samples_per_pass[ilen];
+   LoopTimer ltimer;
+
+   loopInit(REF_LOOP, lstat);
+
+   Real_ptr a = loop_data.array_1D_Real[0];
+   Real_ptr b = loop_data.array_1D_Real[1];
+   Real_ptr c = loop_data.array_1D_Real[2];
+
+   TIMER_START(ltimer);
+   for (SampIndex_type isamp = 0; isamp < num_samples; ++isamp) {
+
+      for (Index_type i=0 ; i<len ; i++ ) {
+         c[i] = a[i] * b[i];
+      }
+
+   }
+   TIMER_STOP(ltimer);
+
+   copyTimer(lstat, ilen, ltimer);
+}
+
+
+//
+// Vector dot product, timed over samples_per_pass[ilen] passes.
+//
+void runReferenceLoop1(LoopStat& lstat, unsigned ilen)
+{
+   LoopData& loop_data = getLoopData();
+
+   Index_type len = lstat.loop_length[ilen];
+   int num_samples = lstat.samples_per_pass[ilen];
+   LoopTimer ltimer;
+
+   loopInit(REF_LOOP, lstat);
+
+   Real_ptr a = loop_data.array_1D_Real[0];
+   Real_ptr b = loop_data.array_1D_Real[1];
+
+   Real_type val = 0.0;
+
+   TIMER_START(ltimer);
+   for (SampIndex_type isamp = 0; isamp < num_samples; ++isamp) {
+
+      Real_type q = 0.0;
+      for (Index_type i=0 ; i<len ; i++ ) {
+         // Accumulate into q so this sample's result feeds val below.
+         q += a[i] * b[i];
+      }
+
+      val = q*isamp;
+   }
+   TIMER_STOP(ltimer);
+
+   //
+   // RDH added this. Without it compiler may optimize out
+   // outer sampling loop because value of q was not used.
+   //
+   loop_data.scalar_Real[0] = (val + 0.00123) / (val - 0.00123);
+
+   copyTimer(lstat, ilen, ltimer);
+}
+
+}  // closing brace for unnamed namespace