Index: test-suite/trunk/LICENSE.TXT
===================================================================
--- test-suite/trunk/LICENSE.TXT
+++ test-suite/trunk/LICENSE.TXT
@@ -85,6 +85,7 @@
 smg2000:            llvm-test/MultiSource/Benchmarks/ASCI_Purple/SMG2000
 XSBench:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench
 HPCCG:              llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG
+PENNANT:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT
 Fhourstones:        llvm-test/MultiSource/Benchmarks/Fhourstones
 Fhourstones-3.1:    llvm-test/MultiSource/Benchmarks/Fhourstones-3.1
 McCat:              llvm-test/MultiSource/Benchmarks/McCat
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(HPCCG)
+add_subdirectory(PENNANT)
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile
@@ -1,6 +1,6 @@
 # MultiSource/DOE-ProxyApps-C++ Makefile: Build all subdirectories automatically
 
 LEVEL = ../../..
-PARALLEL_DIRS = HPCCG
+PARALLEL_DIRS = HPCCG PENNANT
 
 include $(LEVEL)/Makefile.programs
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeLists.txt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeLists.txt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(PROG PENNANT)
+set(RUN_OPTIONS ${CMAKE_CURRENT_SOURCE_DIR}/intermediate_leblanc.pnt)
+llvm_multisource()
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Driver.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Driver.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Driver.hh
@@ -0,0 +1,54 @@
+/*
+ * Driver.hh
+ *
+ *  Created on: Jan 23, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef DRIVER_HH_
+#define DRIVER_HH_
+
+#include <string>
+
+// forward declarations
+class InputFile;
+class Mesh;
+class Hydro;
+
+
+class Driver {
+public:
+
+    // children of this object
+    Mesh *mesh;
+    Hydro *hydro;
+
+    std::string probname;          // problem name
+    double time;                   // simulation time
+    int cycle;                     // simulation cycle number
+    double tstop;                  // simulation stop time
+    int cstop;                     // simulation stop cycle
+    double dtmax;                  // maximum timestep size
+    double dtinit;                 // initial timestep size
+    double dtfac;                  // factor limiting timestep growth
+    int dtreport;                  // frequency for timestep reports
+    double dt;                     // current timestep
+    double dtlast;                 // previous timestep
+    std::string msgdt;             // dt limiter message
+    std::string msgdtlast;         // previous dt limiter message
+
+    Driver(const InputFile* inp, const std::string& pname);
+    ~Driver();
+
+    void run();
+    void calcGlobalDt();
+
+};  // class Driver
+
+
+#endif /* DRIVER_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Driver.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Driver.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Driver.cc
@@ -0,0 +1,239 @@
+/*
+ * Driver.cc
+ *
+ *  Created on: Jan 23, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "Driver.hh"
+
+#include <cstdlib>
+#include <cstring>
+#include <sys/time.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+
+#include "Parallel.hh"
+#include "InputFile.hh"
+#include "Mesh.hh"
+#include "Hydro.hh"
+
+using namespace std;
+
+
+Driver::Driver(const InputFile* inp, const string& pname)
+        : probname(pname) {
+    using Parallel::numpe;
+    using Parallel::mype;
+
+    if (mype == 0) {
+        cout << "********************" << endl;
+        cout << "Running PENNANT v0.9" << endl;
+        cout << "********************" << endl;
+        cout << endl;
+
+#ifdef USE_MPI
+        cout << "Running on " << numpe << " MPI PE(s)" << endl;
+#endif
+#ifdef _OPENMP
+        cout << "Running on " << omp_get_max_threads() << " thread(s)"
+             << endl;
+#endif
+    }  // if mype == 0
+
+    cstop = inp->getInt("cstop", 999999);
+    tstop = inp->getDouble("tstop", 1.e99);
+    if (cstop == 999999 && tstop == 1.e99) {
+        if (mype == 0)
+            cerr << "Must specify either cstop or tstop" << endl;
+        exit(1);
+    }
+    dtmax = inp->getDouble("dtmax", 1.e99);
+    dtinit = inp->getDouble("dtinit", 1.e99);
+    dtfac = inp->getDouble("dtfac", 1.2);
+    dtreport = inp->getInt("dtreport", 10);
+
+    // initialize mesh, hydro
+    mesh = new Mesh(inp);
+    hydro = new Hydro(inp, mesh);
+
+}
+
+Driver::~Driver() {
+
+    delete hydro;
+    delete mesh;
+
+}
+
+void Driver::run() {
+    using Parallel::mype;
+
+    time = 0.0;
+    cycle = 0;
+
+    // do energy check
+    hydro->writeEnergyCheck();
+
+    double tbegin, tlast;
+    if (mype == 0) {
+        // get starting timestamp
+        struct timeval sbegin;
+        gettimeofday(&sbegin, NULL);
+        tbegin = sbegin.tv_sec + sbegin.tv_usec * 1.e-6;
+        tlast = tbegin;
+    }
+
+    // main event loop
+    while (cycle < cstop && time < tstop) {
+
+        cycle += 1;
+
+        // get timestep
+        calcGlobalDt();
+
+        // begin hydro cycle
+        hydro->doCycle(dt);
+
+        time += dt;
+
+        if (mype == 0 &&
+                (cycle == 1 || cycle % dtreport == 0)) {
+            struct timeval scurr;
+            gettimeofday(&scurr, NULL);
+            double tcurr = scurr.tv_sec + scurr.tv_usec * 1.e-6;
+            double tdiff = tcurr - tlast;
+
+            cout << scientific << setprecision(5);
+            #ifdef PRINT_TIME
+            cout << "End cycle " << setw(6) << cycle
+                 << ", time = " << setw(11) << time
+                 << ", dt = " << setw(11) << dt
+                 << ", wall = " << setw(11) << tdiff << endl;
+            #endif
+            cout << "dt limiter: " << msgdt << endl;
+
+            tlast = tcurr;
+        } // if mype...
+
+    } // while cycle...
+
+    if (mype == 0) {
+
+        // get stopping timestamp
+        struct timeval send;
+        gettimeofday(&send, NULL);
+        double tend = send.tv_sec + send.tv_usec * 1.e-6;
+        double runtime = tend - tbegin;
+
+        // write end message
+        cout << endl;
+        cout << "Run complete" << endl;
+        cout << scientific << setprecision(6);
+        cout << "cycle = " << setw(6) << cycle
+             << ",         cstop = " << setw(6) << cstop << endl;
+        cout << "time  = " << setw(14) << time
+             << ", tstop = " << setw(14) << tstop << endl;
+
+        cout << endl;
+        #ifdef PRINT_TIME
+        cout << "************************************" << endl;
+        cout << "hydro cycle run time= " << setw(14) << runtime << endl;
+        cout << "************************************" << endl;
+        #endif
+
+    } // if mype
+
+    // do energy check
+    hydro->writeEnergyCheck();
+
+    // do final mesh output
+    mesh->write(probname, cycle, time,
+            hydro->zr, hydro->ze, hydro->zp);
+
+}
+
+
+void Driver::calcGlobalDt() {
+
+    using Parallel::mype;
+
+    // Save timestep from last cycle
+    dtlast = dt;
+    msgdtlast = msgdt;
+
+    // Compute timestep for this cycle
+    dt = dtmax;
+    msgdt = "Global maximum (dtmax)";
+
+    if (cycle == 1) {
+        // compare to initial timestep
+        if (dtinit < dt) {
+            dt = dtinit;
+            msgdt = "Initial timestep";
+        }
+    } else {
+        // compare to factor * previous timestep
+        double dtrecover = dtfac * dtlast;
+        if (dtrecover < dt) {
+            dt = dtrecover;
+            if (msgdtlast.substr(0, 8) == "Recovery")
+                msgdt = msgdtlast;
+            else
+                msgdt = "Recovery: " + msgdtlast;
+        }
+    }
+
+    // compare to time-to-end
+    if ((tstop - time) < dt) {
+        dt = tstop - time;
+        msgdt = "Global (tstop - time)";
+    }
+
+    // compare to hydro dt
+    hydro->getDtHydro(dt, msgdt);
+
+#ifdef USE_MPI
+    int pedt;
+    Parallel::globalMinLoc(dt, pedt);
+
+    // if the global min isn't on this PE, get the right message
+    if (pedt > 0) {
+        const int tagmpi = 300;
+        if (mype == pedt) {
+            char cmsgdt[80];
+            strncpy(cmsgdt, msgdt.c_str(), 80);
+            MPI_Send(cmsgdt, 80, MPI_CHAR, 0, tagmpi,
+                    MPI_COMM_WORLD);
+        }
+        else if (mype == 0) {
+            char cmsgdt[80];
+            MPI_Status status;
+            MPI_Recv(cmsgdt, 80, MPI_CHAR, pedt, tagmpi,
+                    MPI_COMM_WORLD, &status);
+            cmsgdt[79] = '\0';
+            msgdt = string(cmsgdt);
+        }
+    }  // if pedt > 0
+
+    // if timestep was determined by hydro, report which PE
+    // caused it
+    if (mype == 0 && msgdt.substr(0, 5) == "Hydro") {
+        ostringstream oss;
+        oss << "PE " << pedt << ", " << msgdt;
+        msgdt = oss.str();
+    }
+#endif
+
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/ExportGold.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/ExportGold.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/ExportGold.hh
@@ -0,0 +1,69 @@
+/*
+ * ExportGold.hh
+ *
+ *  Created on: Mar 1, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef EXPORTGOLD_HH_
+#define EXPORTGOLD_HH_
+
+#include <string>
+#include <vector>
+
+// forward declarations
+class Mesh;
+
+
+class ExportGold {
+public:
+
+    Mesh* mesh;
+
+    std::vector<int> tris;         // zone index list for 3-sided zones
+    std::vector<int> quads;        // same, for 4-sided zones
+    std::vector<int> others;       // same, for n-sided zones, n > 4
+    std::vector<int> mapzs;        // map: zone -> first side
+
+    // parallel info, meaningful on PE 0 only:
+    std::vector<int> pentris;      // number of tris on each PE
+    std::vector<int> penquads;     // same, for quads
+    std::vector<int> penothers;    // same, for others
+    int gntris, gnquads, gnothers; // total number across all PEs
+                                   //     of tris/quads/others
+
+    ExportGold(Mesh* m);
+    ~ExportGold();
+
+    void write(
+            const std::string& basename,
+            const int cycle,
+            const double time,
+            const double* zr,
+            const double* ze,
+            const double* zp);
+
+    void writeCaseFile(
+            const std::string& basename);
+
+    void writeGeoFile(
+            const std::string& basename,
+            const int cycle,
+            const double time);
+
+    void writeVarFile(
+            const std::string& basename,
+            const std::string& varname,
+            const double* var);
+
+    void sortZones();
+};
+
+
+
+#endif /* EXPORTGOLD_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/ExportGold.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/ExportGold.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/ExportGold.cc
@@ -0,0 +1,376 @@
+/*
+ * ExportGold.cc
+ *
+ *  Created on: Mar 1, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "ExportGold.hh"
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <numeric>
+
+#include "Parallel.hh"
+#include "Vec2.hh"
+#include "Mesh.hh"
+
+using namespace std;
+
+
+ExportGold::ExportGold(Mesh* m) : mesh(m) {}
+
+ExportGold::~ExportGold() {}
+
+
+void ExportGold::write(
+        const string& basename,
+        const int cycle,
+        const double time,
+        const double* zr,
+        const double* ze,
+        const double* zp) {
+
+    writeCaseFile(basename);
+
+    sortZones();
+    writeGeoFile(basename, cycle, time);
+
+    writeVarFile(basename, "zr", zr);
+    writeVarFile(basename, "ze", ze);
+    writeVarFile(basename, "zp", zp);
+
+}
+
+
+void ExportGold::writeCaseFile(
+        const string& basename) {
+
+    if (Parallel::mype > 0) return;
+
+    // open file
+    const string filename = basename + ".case";
+    ofstream ofs(filename.c_str());
+    if (!ofs.good()) {
+        cerr << "Cannot open file " << filename << " for writing"
+             << endl;
+        exit(1);
+    }
+
+    // write case info
+    ofs << "#" << endl;
+    ofs << "# Created by PENNANT" << endl;
+    ofs << "#" << endl;
+
+    ofs << "FORMAT" << endl;
+    ofs << "type: ensight gold" << endl;
+
+    ofs << "GEOMETRY" << endl;
+    ofs << "model: " << basename << ".geo" << endl;
+
+    ofs << "VARIABLE" << endl;
+    ofs << "scalar per element: zr " << basename << ".zr" << endl;
+    ofs << "scalar per element: ze " << basename << ".ze" << endl;
+    ofs << "scalar per element: zp " << basename << ".zp" << endl;
+
+    ofs.close();
+
+}
+
+
+void ExportGold::writeGeoFile(
+        const string& basename,
+        const int cycle,
+        const double time) {
+    using Parallel::numpe;
+    using Parallel::mype;
+
+    // open file
+    ofstream ofs;
+    if (mype == 0) {
+        const string filename = basename + ".geo";
+        ofs.open(filename.c_str());
+        if (!ofs.good()) {
+            cerr << "Cannot open file " << filename << " for writing"
+                 << endl;
+            exit(1);
+        }
+    }
+
+    // write general header
+    if (mype == 0) {
+        ofs << scientific;
+        ofs << "cycle = " << setw(8) << cycle << endl;
+        ofs << setprecision(8);
+        ofs << "t = " << setw(15) << time << endl;
+        ofs << "node id assign" << endl;
+        ofs << "element id given" << endl;
+
+        // write header for the one "part" (entire mesh)
+        ofs << "part" << endl;
+        ofs << setw(10) << 1 << endl;
+        ofs << "universe" << endl;
+    } // if mype == 0
+
+    // gather node info to PE 0
+    const int nump = mesh->nump;
+    const double2* px = mesh->px;
+
+    int gnump = nump;
+    Parallel::globalSum(gnump);
+    vector<int> penump(mype == 0 ? numpe : 0);
+    Parallel::gather(nump, &penump[0]);
+    vector<int> peoffset(mype == 0 ? numpe + 1 : 1);
+    partial_sum(penump.begin(), penump.end(), &peoffset[1]);
+    int offset;
+    Parallel::scatter(&peoffset[0], offset);
+    vector<double2> gpx(mype == 0 ? gnump : 0);
+    Parallel::gatherv(&px[0], nump, &gpx[0], &penump[0]);
+
+    // write node info
+    if (mype == 0) {
+        ofs << "coordinates" << endl;
+        ofs << setw(10) << gnump << endl;
+        ofs << setprecision(5);
+        for (int p = 0; p < gnump; ++p)
+            ofs << setw(12) << gpx[p].x << endl;
+        for (int p = 0; p < gnump; ++p)
+            ofs << setw(12) << gpx[p].y << endl;
+        // Ensight expects z-coordinates, so write 0 for those
+        for (int p = 0; p < gnump; ++p)
+            ofs << setw(12) << 0. << endl;
+    } // if mype
+
+    const int* znump = mesh->znump;
+    const int* mapsp1 = mesh->mapsp1;
+
+    const int ntris = tris.size();
+    const int nquads = quads.size();
+    const int nothers = others.size();
+
+    if (mype == 0) {
+        pentris.resize(numpe);
+        penquads.resize(numpe);
+        penothers.resize(numpe);
+    }
+    Parallel::gather(ntris, &pentris[0]);
+    Parallel::gather(nquads, &penquads[0]);
+    Parallel::gather(nothers, &penothers[0]);
+
+    gntris = accumulate(pentris.begin(), pentris.end(), 0);
+    gnquads = accumulate(penquads.begin(), penquads.end(), 0);
+    gnothers = accumulate(penothers.begin(), penothers.end(), 0);
+
+    vector<int> pesizes(mype == 0 ? numpe : 0);
+
+    // gather triangle info to PE 0
+    vector<int> trip(3 * ntris);
+    vector<int> gtris(gntris), gtrip(3 * gntris);
+    Parallel::gatherv(&tris[0], ntris, &gtris[0], &pentris[0]);
+    if (mype == 0) {
+        for (int pe = 0; pe < numpe; ++pe)
+            pesizes[pe] = pentris[pe] * 3;
+    }
+    for (int t = 0; t < ntris; ++t) {
+        int z = tris[t];
+        int sbase = mapzs[z];
+        for (int i = 0; i < 3; ++i) {
+            trip[t * 3 + i] = mapsp1[sbase + i] + offset;
+        }
+    }
+    Parallel::gatherv(&trip[0], 3 * ntris, &gtrip[0], &pesizes[0]);
+
+    // write triangles
+    if (mype == 0 && gntris > 0) {
+        ofs << "tria3" << endl;
+        ofs << setw(10) << gntris << endl;
+        for (int t = 0; t < gntris; ++t)
+            ofs << setw(10) << gtris[t] + 1 << endl;
+        for (int t = 0; t < gntris; ++t) {
+            for (int i = 0; i < 3; ++i)
+                ofs << setw(10) << gtrip[t * 3 + i] + 1;
+            ofs << endl;
+        }
+    } // if mype == 0 ...
+
+    // gather quad info to PE 0
+    vector<int> quadp(4 * nquads);
+    vector<int> gquads(gnquads), gquadp(4 * gnquads);
+    Parallel::gatherv(&quads[0], nquads, &gquads[0], &penquads[0]);
+    if (mype == 0) {
+        for (int pe = 0; pe < numpe; ++pe)
+            pesizes[pe] = penquads[pe] * 4;
+    }
+    for (int q = 0; q < nquads; ++q) {
+        int z = quads[q];
+        int sbase = mapzs[z];
+        for (int i = 0; i < 4; ++i) {
+            quadp[q * 4 + i] = mapsp1[sbase + i] + offset;
+        }
+    }
+    Parallel::gatherv(&quadp[0], 4 * nquads, &gquadp[0], &pesizes[0]);
+
+    // write quads
+    if (mype == 0 && gnquads > 0) {
+        ofs << "quad4" << endl;
+        ofs << setw(10) << gnquads << endl;
+        for (int q = 0; q < gnquads; ++q)
+            ofs << setw(10) << gquads[q] + 1 << endl;
+        for (int q = 0; q < gnquads; ++q) {
+            for (int i = 0; i < 4; ++i)
+                ofs << setw(10) << gquadp[q * 4 + i] + 1;
+            ofs << endl;
+        }
+    } // if mype == 0 ...
+
+    // gather other info to PE 0
+    vector<int> othernump(nothers), otherp;
+    vector<int> gothers(gnothers), gothernump(gnothers);
+    Parallel::gatherv(&others[0], nothers, &gothers[0], &penothers[0]);
+    for (int n = 0; n < nothers; ++n) {
+        int z = others[n];
+        int sbase = mapzs[z];
+        othernump[n] = znump[z];
+        for (int i = 0; i < znump[z]; ++i) {
+            otherp.push_back(mapsp1[sbase + i] + offset);
+        }
+    }
+    Parallel::gatherv(&othernump[0], nothers, &gothernump[0], &penothers[0]);
+    int size = otherp.size();
+    Parallel::gather(size, &pesizes[0]);
+    int gsize = accumulate(pesizes.begin(), pesizes.end(), 0);
+    vector<int> gotherp(gsize);
+    Parallel::gatherv(&otherp[0], size, &gotherp[0], &pesizes[0]);
+
+    // write others
+    if (mype == 0 && gnothers > 0) {
+        ofs << "nsided" << endl;
+        ofs << setw(10) << gnothers << endl;
+        for (int n = 0; n < gnothers; ++n)
+            ofs << setw(10) << gothers[n] + 1 << endl;
+        for (int n = 0; n < gnothers; ++n)
+            ofs << setw(10) << gothernump[n] << endl;
+        int gp = 0;
+        for (int n = 0; n < gnothers; ++n) {
+            for (int i = 0; i < gothernump[n]; ++i)
+                ofs << setw(10) << gotherp[gp + i] + 1;
+            ofs << endl;
+            gp += gothernump[n];
+        }
+    } // if mype == 0 ...
+
+    if (mype == 0) ofs.close();
+
+}
+
+
+void ExportGold::writeVarFile(
+        const string& basename,
+        const string& varname,
+        const double* var) {
+    using Parallel::mype;
+
+    // open file
+    ofstream ofs;
+    if (mype == 0) {
+        const string filename = basename + "." + varname;
+        ofs.open(filename.c_str());
+        if (!ofs.good()) {
+            cerr << "Cannot open file " << filename << " for writing"
+                 << endl;
+            exit(1);
+        }
+    } // if mype == 0
+
+    // write header
+    if (mype == 0) {
+        ofs << scientific << setprecision(5);
+        ofs << varname << endl;
+        ofs << "part" << endl;
+        ofs << setw(10) << 1 << endl;
+    } // if mype == 0
+
+    int ntris = tris.size();
+    int nquads = quads.size();
+    int nothers = others.size();
+
+    // gather values on triangles to PE 0
+    vector<double> tvar(ntris), gtvar(gntris);
+    for (int t = 0; t < ntris; ++t) {
+        tvar[t] = var[tris[t]];
+    }
+    Parallel::gatherv(&tvar[0], ntris, &gtvar[0], &pentris[0]);
+
+    // write values on triangles
+    if (mype == 0 && gntris > 0) {
+        ofs << "tria3" << endl;
+        for (int t = 0; t < gntris; ++t) {
+            ofs << setw(12) << gtvar[t] << endl;
+        }
+    } // if mype == 0 ...
+
+    // gather values on quads to PE 0
+    vector<double> qvar(nquads), gqvar(gnquads);
+    for (int q = 0; q < nquads; ++q) {
+        qvar[q] = var[quads[q]];
+    }
+    Parallel::gatherv(&qvar[0], nquads, &gqvar[0], &penquads[0]);
+
+    // write values on quads
+    if (mype == 0 && gnquads > 0) {
+        ofs << "quad4" << endl;
+        for (int q = 0; q < gnquads; ++q) {
+            ofs << setw(12) << gqvar[q] << endl;
+        }
+    } // if mype == 0 ...
+
+    // gather values on others to PE 0
+    vector<double> ovar(nothers), govar(gnothers);
+    for (int n = 0; n < nothers; ++n) {
+        ovar[n] = var[others[n]];
+    }
+    Parallel::gatherv(&ovar[0], nothers, &govar[0], &penothers[0]);
+
+    // write values on others
+    if (mype == 0 && gnothers > 0) {
+        ofs << "nsided" << endl;
+        for (int n = 0; n < gnothers; ++n) {
+            ofs << setw(12) << govar[n] << endl;
+        }
+    } // if mype == 0 ...
+
+    if (mype == 0) ofs.close();
+
+}
+
+
+void ExportGold::sortZones() {
+
+    const int numz = mesh->numz;
+    const int* znump = mesh->znump;
+
+    mapzs.resize(numz);
+
+    // sort zones by size, create an inverse map
+    int scount = 0;
+    for (int z = 0; z < numz; ++z) {
+        int zsize = znump[z];
+        if (zsize == 3)
+            tris.push_back(z);
+        else if (zsize == 4)
+            quads.push_back(z);
+        else // zsize > 4
+            others.push_back(z);
+        mapzs[z] = scount;
+        scount += zsize;
+    } // for z
+
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/GenMesh.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/GenMesh.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/GenMesh.hh
@@ -0,0 +1,96 @@
+/*
+ * GenMesh.hh
+ *
+ *  Created on: Jun 4, 2013
+ *      Author: cferenba
+ *
+ * Copyright (c) 2013, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef GENMESH_HH_
+#define GENMESH_HH_
+
+#include <string>
+#include <vector>
+#include "Vec2.hh"
+
+// forward declarations
+class InputFile;
+
+
+class GenMesh {
+public:
+
+    std::string meshtype;       // generated mesh type
+    int gnzx, gnzy;             // global number of zones, in x and y
+                                // directions
+    double lenx, leny;          // length of mesh sides, in x and y
+                                // directions
+    int numpex, numpey;         // number of PEs to use, in x and y
+                                // directions
+    int mypex, mypey;           // my PE index, in x and y directions
+    int nzx, nzy;               // (local) number of zones, in x and y
+                                // directions
+    int zxoffset, zyoffset;     // offsets of local zone array into
+                                // global, in x and y directions
+
+    GenMesh(const InputFile* inp);
+    ~GenMesh();
+
+    void generate(
+            std::vector<double2>& pointpos,
+            std::vector<int>& zonestart,
+            std::vector<int>& zonesize,
+            std::vector<int>& zonepoints,
+            std::vector<int>& slavemstrpes,
+            std::vector<int>& slavemstrcounts,
+            std::vector<int>& slavepoints,
+            std::vector<int>& masterslvpes,
+            std::vector<int>& masterslvcounts,
+            std::vector<int>& masterpoints);
+
+    void generateRect(
+            std::vector<double2>& pointpos,
+            std::vector<int>& zonestart,
+            std::vector<int>& zonesize,
+            std::vector<int>& zonepoints,
+            std::vector<int>& slavemstrpes,
+            std::vector<int>& slavemstrcounts,
+            std::vector<int>& slavepoints,
+            std::vector<int>& masterslvpes,
+            std::vector<int>& masterslvcounts,
+            std::vector<int>& masterpoints);
+
+    void generatePie(
+            std::vector<double2>& pointpos,
+            std::vector<int>& zonestart,
+            std::vector<int>& zonesize,
+            std::vector<int>& zonepoints,
+            std::vector<int>& slavemstrpes,
+            std::vector<int>& slavemstrcounts,
+            std::vector<int>& slavepoints,
+            std::vector<int>& masterslvpes,
+            std::vector<int>& masterslvcounts,
+            std::vector<int>& masterpoints);
+
+    void generateHex(
+            std::vector<double2>& pointpos,
+            std::vector<int>& zonestart,
+            std::vector<int>& zonesize,
+            std::vector<int>& zonepoints,
+            std::vector<int>& slavemstrpes,
+            std::vector<int>& slavemstrcounts,
+            std::vector<int>& slavepoints,
+            std::vector<int>& masterslvpes,
+            std::vector<int>& masterslvcounts,
+            std::vector<int>& masterpoints);
+
+    void calcNumPE();
+
+}; // class GenMesh
+
+
+#endif /* GENMESH_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/GenMesh.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/GenMesh.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/GenMesh.cc
@@ -0,0 +1,649 @@
+/*
+ * GenMesh.cc
+ *
+ *  Created on: Jun 4, 2013
+ *      Author: cferenba
+ *
+ * Copyright (c) 2013, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "GenMesh.hh"
+
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+
+#include "Vec2.hh"
+#include "Parallel.hh"
+#include "InputFile.hh"
+
+using namespace std;
+
+
+GenMesh::GenMesh(const InputFile* inp) {
+
+    using Parallel::mype;
+
+    meshtype = inp->getString("meshtype", "");
+    if (meshtype.empty()) {
+        if (mype == 0)
+            cerr << "Error:  must specify meshtype" << endl;
+        exit(1);
+    }
+    if (meshtype != "pie" &&
+            meshtype != "rect" &&
+            meshtype != "hex") {
+        if (mype == 0)
+            cerr << "Error:  invalid meshtype " << meshtype << endl;
+        exit(1);
+    }
+    vector<double> params =
+            inp->getDoubleList("meshparams", vector<double>());
+    if (params.empty()) {
+        if (mype == 0)
+            cerr << "Error:  must specify meshparams" << endl;
+        exit(1);
+    }
+    if (params.size() > 4) {
+        if (mype == 0)
+            cerr << "Error:  meshparams must have <= 4 values" << endl;
+        exit(1);
+    }
+
+    gnzx = params[0];
+    gnzy = (params.size() >= 2 ? params[1] : gnzx);
+    if (meshtype != "pie")
+        lenx = (params.size() >= 3 ? params[2] : 1.0);
+    else
+        // convention:  x = theta, y = r
+        lenx = (params.size() >= 3 ? params[2] : 90.0)
+                * M_PI / 180.0;
+    leny = (params.size() >= 4 ? params[3] : 1.0);
+
+    if (gnzx <= 0 || gnzy <= 0 || lenx <= 0. || leny <= 0. ) {
+        if (mype == 0)
+            cerr << "Error:  meshparams values must be positive" << endl;
+        exit(1);
+    }
+    if (meshtype == "pie" && lenx >= 2. * M_PI) {
+        if (mype == 0)
+            cerr << "Error:  meshparams theta must be < 360" << endl;
+        exit(1);
+    }
+
+}
+
+
+GenMesh::~GenMesh() {}
+
+
+void GenMesh::generate(
+        std::vector<double2>& pointpos,
+        std::vector<int>& zonestart,
+        std::vector<int>& zonesize,
+        std::vector<int>& zonepoints,
+        std::vector<int>& slavemstrpes,
+        std::vector<int>& slavemstrcounts,
+        std::vector<int>& slavepoints,
+        std::vector<int>& masterslvpes,
+        std::vector<int>& masterslvcounts,
+        std::vector<int>& masterpoints){
+
+    // do calculations common to all mesh types
+    calcNumPE();
+    zxoffset = mypex * gnzx / numpex;
+    const int zxstop = (mypex + 1) * gnzx / numpex;
+    nzx = zxstop - zxoffset;
+    zyoffset = mypey * gnzy / numpey;
+    const int zystop = (mypey + 1) * gnzy / numpey;
+    nzy = zystop - zyoffset;
+
+    // mesh type-specific calculations
+    if (meshtype == "pie")
+        generatePie(pointpos, zonestart, zonesize, zonepoints,
+                slavemstrpes, slavemstrcounts, slavepoints,
+                masterslvpes, masterslvcounts, masterpoints);
+    else if (meshtype == "rect")
+        generateRect(pointpos, zonestart, zonesize, zonepoints,
+                slavemstrpes, slavemstrcounts, slavepoints,
+                masterslvpes, masterslvcounts, masterpoints);
+    else if (meshtype == "hex")
+        generateHex(pointpos, zonestart, zonesize, zonepoints,
+                slavemstrpes, slavemstrcounts, slavepoints,
+                masterslvpes, masterslvcounts, masterpoints);
+
+}
+
+
+void GenMesh::generateRect(
+        std::vector<double2>& pointpos,
+        std::vector<int>& zonestart,
+        std::vector<int>& zonesize,
+        std::vector<int>& zonepoints,
+        std::vector<int>& slavemstrpes,
+        std::vector<int>& slavemstrcounts,
+        std::vector<int>& slavepoints,
+        std::vector<int>& masterslvpes,
+        std::vector<int>& masterslvcounts,
+        std::vector<int>& masterpoints) {
+
+    using Parallel::numpe;
+    using Parallel::mype;
+
+    const int nz = nzx * nzy;
+    const int npx = nzx + 1;
+    const int npy = nzy + 1;
+    const int np = npx * npy;
+
+    // generate point coordinates
+    pointpos.reserve(np);
+    double dx = lenx / (double) gnzx;
+    double dy = leny / (double) gnzy;
+    for (int j = 0; j < npy; ++j) {
+        double y = dy * (double) (j + zyoffset);
+        for (int i = 0; i < npx; ++i) {
+            double x = dx * (double) (i + zxoffset);
+            pointpos.push_back(make_double2(x, y));
+        }
+    }
+
+    // generate zone adjacency lists
+    zonestart.reserve(nz);
+    zonesize.reserve(nz);
+    zonepoints.reserve(4 * nz);
+    for (int j = 0; j < nzy; ++j) {
+        for (int i = 0; i < nzx; ++i) {
+            zonestart.push_back(zonepoints.size());
+            zonesize.push_back(4);
+            int p0 = j * npx + i;
+            zonepoints.push_back(p0);
+            zonepoints.push_back(p0 + 1);
+            zonepoints.push_back(p0 + npx + 1);
+            zonepoints.push_back(p0 + npx);
+       }
+    }
+
+    if (numpe == 1) return;
+
+    // estimate sizes of slave/master arrays
+    slavepoints.reserve((mypey != 0) * npx + (mypex != 0) * npy);
+    masterpoints.reserve((mypey != numpey - 1) * npx +
+            (mypex != numpex - 1) * npy + 1);
+
+    // enumerate slave points
+    // slave point with master at lower left
+    if (mypex != 0 && mypey != 0) {
+        int mstrpe = mype - numpex - 1;
+        slavepoints.push_back(0);
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(1);
+    }
+    // slave points with master below
+    if (mypey != 0) {
+        int mstrpe = mype - numpex;
+        int oldsize = slavepoints.size();
+        int p = 0;
+        for (int i = 0; i < npx; ++i) {
+            if (i == 0 && mypex != 0) { p++; continue; }
+            slavepoints.push_back(p);
+            p++;
+        }
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(slavepoints.size() - oldsize);
+    }
+    // slave points with master to left
+    if (mypex != 0) {
+        int mstrpe = mype - 1;
+        int oldsize = slavepoints.size();
+        int p = 0;
+        for (int j = 0; j < npy; ++j) {
+            if (j == 0 && mypey != 0) { p += npx; continue; }
+            slavepoints.push_back(p);
+            p += npx;
+        }
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(slavepoints.size() - oldsize);
+    }
+
+    // enumerate master points
+    // master points with slave to right
+    if (mypex != numpex - 1) {
+        int slvpe = mype + 1;
+        int oldsize = masterpoints.size();
+        int p = npx - 1;
+        for (int j = 0; j < npy; ++j) {
+            if (j == 0 && mypey != 0) { p += npx; continue; }
+            masterpoints.push_back(p);
+            p += npx;
+        }
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(masterpoints.size() - oldsize);
+    }
+    // master points with slave above
+    if (mypey != numpey - 1) {
+        int slvpe = mype + numpex;
+        int oldsize = masterpoints.size();
+        int p = (npy - 1) * npx;
+        for (int i = 0; i < npx; ++i) {
+            if (i == 0 && mypex != 0) { p++; continue; }
+            masterpoints.push_back(p);
+            p++;
+        }
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(masterpoints.size() - oldsize);
+    }
+    // master point with slave at upper right
+    if (mypex != numpex - 1 && mypey != numpey - 1) {
+        int slvpe = mype + numpex + 1;
+        int p = npx * npy - 1;
+        masterpoints.push_back(p);
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(1);
+    }
+
+}
+
+
+void GenMesh::generatePie(
+        std::vector<double2>& pointpos,
+        std::vector<int>& zonestart,
+        std::vector<int>& zonesize,
+        std::vector<int>& zonepoints,
+        std::vector<int>& slavemstrpes,
+        std::vector<int>& slavemstrcounts,
+        std::vector<int>& slavepoints,
+        std::vector<int>& masterslvpes,
+        std::vector<int>& masterslvcounts,
+        std::vector<int>& masterpoints) {
+
+    using Parallel::numpe;
+    using Parallel::mype;
+
+    const int nz = nzx * nzy;
+    const int npx = nzx + 1;
+    const int npy = nzy + 1;
+    const int np = (mypey == 0 ? npx * (npy - 1) + 1 : npx * npy);
+
+    // generate point coordinates
+    pointpos.reserve(np);
+    double dth = lenx / (double) gnzx;
+    double dr  = leny / (double) gnzy;
+    for (int j = 0; j < npy; ++j) {
+        if (j + zyoffset == 0) {
+            pointpos.push_back(make_double2(0., 0.));
+            continue;
+        }
+        double r = dr * (double) (j + zyoffset);
+        for (int i = 0; i < npx; ++i) {
+            double th = dth * (double) (gnzx - (i + zxoffset));
+            double x = r * cos(th);
+            double y = r * sin(th);
+            pointpos.push_back(make_double2(x, y));
+        }
+    }
+
+    // generate zone adjacency lists
+    zonestart.reserve(nz);
+    zonesize.reserve(nz);
+    zonepoints.reserve(4 * nz);
+    for (int j = 0; j < nzy; ++j) {
+        for (int i = 0; i < nzx; ++i) {
+            zonestart.push_back(zonepoints.size());
+            int p0 = j * npx + i;
+            if (mypey == 0) p0 -= npx - 1;
+            if (j + zyoffset == 0) {
+                zonesize.push_back(3);
+                zonepoints.push_back(0);
+            }
+            else {
+                zonesize.push_back(4);
+                zonepoints.push_back(p0);
+                zonepoints.push_back(p0 + 1);
+            }
+            zonepoints.push_back(p0 + npx + 1);
+            zonepoints.push_back(p0 + npx);
+        }
+    }
+
+    if (numpe == 1) return;
+
+    // estimate sizes of slave/master arrays
+    slavepoints.reserve((mypey != 0) * npx + (mypex != 0) * npy);
+    masterpoints.reserve((mypey != numpey - 1) * npx +
+            (mypex != numpex - 1) * npy + 1);
+
+    // enumerate slave points
+    // slave point with master at lower left
+    if (mypex != 0 && mypey != 0) {
+        int mstrpe = mype - numpex - 1;
+        slavepoints.push_back(0);
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(1);
+    }
+    // slave points with master below
+    if (mypey != 0) {
+        int mstrpe = mype - numpex;
+        int oldsize = slavepoints.size();
+        int p = 0;
+        for (int i = 0; i < npx; ++i) {
+            if (i == 0 && mypex != 0) { p++; continue; }
+            slavepoints.push_back(p);
+            p++;
+        }
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(slavepoints.size() - oldsize);
+    }
+    // slave points with master to left
+    if (mypex != 0) {
+        int mstrpe = mype - 1;
+        int oldsize = slavepoints.size();
+        if (mypey == 0) {
+            slavepoints.push_back(0);
+            // special case:
+            // slave point at origin, master not to immediate left
+            if (mypex > 1) {
+                slavemstrpes.push_back(0);
+                slavemstrcounts.push_back(1);
+                oldsize += 1;
+            }
+        }
+        int p = (mypey > 0 ? npx : 1);
+        for (int j = 1; j < npy; ++j) {
+            slavepoints.push_back(p);
+            p += npx;
+        }
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(slavepoints.size() - oldsize);
+    }
+
+    // enumerate master points
+    // master points with slave to right
+    if (mypex != numpex - 1) {
+        int slvpe = mype + 1;
+        int oldsize = masterpoints.size();
+        // special case:  origin as master for slave on PE 1
+        if (mypex == 0 && mypey == 0) {
+            masterpoints.push_back(0);
+        }
+        int p = (mypey > 0 ? 2 * npx - 1 : npx);
+        for (int j = 1; j < npy; ++j) {
+            masterpoints.push_back(p);
+            p += npx;
+        }
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(masterpoints.size() - oldsize);
+        // special case:  origin as master for slaves on PEs > 1
+        if (mypex == 0 && mypey == 0) {
+            for (int slvpe = 2; slvpe < numpex; ++slvpe) {
+                masterpoints.push_back(0);
+                masterslvpes.push_back(slvpe);
+                masterslvcounts.push_back(1);
+            }
+        }
+    }
+    // master points with slave above
+    if (mypey != numpey - 1) {
+        int slvpe = mype + numpex;
+        int oldsize = masterpoints.size();
+        int p = (npy - 1) * npx;
+        if (mypey == 0) p -= npx - 1;
+        for (int i = 0; i < npx; ++i) {
+            if (i == 0 && mypex != 0) { p++; continue; }
+            masterpoints.push_back(p);
+            p++;
+        }
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(masterpoints.size() - oldsize);
+    }
+    // master point with slave at upper right
+    if (mypex != numpex - 1 && mypey != numpey - 1) {
+        int slvpe = mype + numpex + 1;
+        int p = npx * npy - 1;
+        if (mypey == 0) p -= npx - 1;
+        masterpoints.push_back(p);
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(1);
+    }
+
+}
+
+
+void GenMesh::generateHex(
+        std::vector<double2>& pointpos,
+        std::vector<int>& zonestart,
+        std::vector<int>& zonesize,
+        std::vector<int>& zonepoints,
+        std::vector<int>& slavemstrpes,
+        std::vector<int>& slavemstrcounts,
+        std::vector<int>& slavepoints,
+        std::vector<int>& masterslvpes,
+        std::vector<int>& masterslvcounts,
+        std::vector<int>& masterpoints) {
+
+    using Parallel::numpe;
+    using Parallel::mype;
+
+    const int nz = nzx * nzy;
+    const int npx = nzx + 1;
+    const int npy = nzy + 1;
+
+    // generate point coordinates
+    pointpos.reserve(2 * npx * npy);  // upper bound
+    double dx = lenx / (double) (gnzx - 1);
+    double dy = leny / (double) (gnzy - 1);
+
+    vector<int> pbase(npy);
+    for (int j = 0; j < npy; ++j) {
+        pbase[j] = pointpos.size();
+        int gj = j + zyoffset;
+        double y = dy * ((double) gj - 0.5);
+        y = max(0., min(leny, y));
+        for (int i = 0; i < npx; ++i) {
+            int gi = i + zxoffset;
+            double x = dx * ((double) gi - 0.5);
+            x = max(0., min(lenx, x));
+            if (gi == 0 || gi == gnzx || gj == 0 || gj == gnzy)
+                pointpos.push_back(make_double2(x, y));
+            else if (i == nzx && j == 0)
+                pointpos.push_back(
+                        make_double2(x - dx / 6., y + dy / 6.));
+            else if (i == 0 && j == nzy)
+                pointpos.push_back(
+                        make_double2(x + dx / 6., y - dy / 6.));
+            else {
+                pointpos.push_back(
+                        make_double2(x - dx / 6., y + dy / 6.));
+                pointpos.push_back(
+                        make_double2(x + dx / 6., y - dy / 6.));
+            }
+        } // for i
+    } // for j
+    int np = pointpos.size();
+
+    // generate zone adjacency lists
+    zonestart.reserve(nz);
+    zonesize.reserve(nz);
+    zonepoints.reserve(6 * nz);  // upper bound
+    for (int j = 0; j < nzy; ++j) {
+        int gj = j + zyoffset;
+        int pbasel = pbase[j];
+        int pbaseh = pbase[j+1];
+        if (mypex > 0) {
+            if (gj > 0) pbasel += 1;
+            if (j < nzy - 1) pbaseh += 1;
+        }
+        for (int i = 0; i < nzx; ++i) {
+            int gi = i + zxoffset;
+            vector<int> v(6);
+            v[1] = pbasel + 2 * i;
+            v[0] = v[1] - 1;
+            v[2] = v[1] + 1;
+            v[5] = pbaseh + 2 * i;
+            v[4] = v[5] + 1;
+            v[3] = v[4] + 1;
+            if (gj == 0) {
+                v[0] = pbasel + i;
+                v[2] = v[0] + 1;
+                if (gi == gnzx - 1) v.erase(v.begin()+3);
+                v.erase(v.begin()+1);
+            } // if j
+            else if (gj == gnzy - 1) {
+                v[5] = pbaseh + i;
+                v[3] = v[5] + 1;
+                v.erase(v.begin()+4);
+                if (gi == 0) v.erase(v.begin()+0);
+            } // else if j
+            else if (gi == 0)
+                v.erase(v.begin()+0);
+            else if (gi == gnzx - 1)
+                v.erase(v.begin()+3);
+            zonestart.push_back(zonepoints.size());
+            zonesize.push_back(v.size());
+            zonepoints.insert(zonepoints.end(), v.begin(), v.end());
+        } // for i
+    } // for j
+
+    if (numpe == 1) return;
+
+    // estimate upper bounds for sizes of slave/master arrays
+    slavepoints.reserve((mypey != 0) * 2 * npx +
+            (mypex != 0) * 2 * npy);
+    masterpoints.reserve((mypey != numpey - 1) * 2 * npx +
+            (mypex != numpex - 1) * 2 * npy + 2);
+
+    // enumerate slave points
+    // slave points with master at lower left
+    if (mypex != 0 && mypey != 0) {
+        int mstrpe = mype - numpex - 1;
+        slavepoints.push_back(0);
+        slavepoints.push_back(1);
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(2);
+    }
+    // slave points with master below
+    if (mypey != 0) {
+        int p = 0;
+        int mstrpe = mype - numpex;
+        int oldsize = slavepoints.size();
+        for (int i = 0; i < npx; ++i) {
+            if (i == 0 && mypex != 0) {
+                p += 2;
+                continue;
+            }
+            if (i == 0 || i == nzx)
+                slavepoints.push_back(p++);
+            else {
+                slavepoints.push_back(p++);
+                slavepoints.push_back(p++);
+            }
+        }  // for i
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(slavepoints.size() - oldsize);
+    }  // if mypey != 0
+    // slave points with master to left
+    if (mypex != 0) {
+        int mstrpe = mype - 1;
+        int oldsize = slavepoints.size();
+        for (int j = 0; j < npy; ++j) {
+            if (j == 0 && mypey != 0) continue;
+            int p = pbase[j];
+            if (j == 0 || j == nzy)
+                slavepoints.push_back(p++);
+            else {
+                slavepoints.push_back(p++);
+                slavepoints.push_back(p++);
+           }
+        }  // for j
+        slavemstrpes.push_back(mstrpe);
+        slavemstrcounts.push_back(slavepoints.size() - oldsize);
+    }  // if mypex != 0
+
+    // enumerate master points
+    // master points with slave to right
+    if (mypex != numpex - 1) {
+        int slvpe = mype + 1;
+        int oldsize = masterpoints.size();
+        for (int j = 0; j < npy; ++j) {
+            if (j == 0 && mypey != 0) continue;
+            int p = (j == nzy ? np : pbase[j+1]);
+            if (j == 0 || j == nzy)
+                masterpoints.push_back(p-1);
+            else {
+                masterpoints.push_back(p-2);
+                masterpoints.push_back(p-1);
+           }
+        }
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(masterpoints.size() - oldsize);
+    }  // if mypex != numpex - 1
+    // master points with slave above
+    if (mypey != numpey - 1) {
+        int p = pbase[nzy];
+        int slvpe = mype + numpex;
+        int oldsize = masterpoints.size();
+        for (int i = 0; i < npx; ++i) {
+            if (i == 0 && mypex != 0) {
+                p++;
+                continue;
+            }
+            if (i == 0 || i == nzx)
+                masterpoints.push_back(p++);
+            else {
+                masterpoints.push_back(p++);
+                masterpoints.push_back(p++);
+            }
+        }  // for i
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(masterpoints.size() - oldsize);
+    }  // if mypey != numpey - 1
+    // master points with slave at upper right
+    if (mypex != numpex - 1 && mypey != numpey - 1) {
+        int slvpe = mype + numpex + 1;
+        masterpoints.push_back(np-2);
+        masterpoints.push_back(np-1);
+        masterslvpes.push_back(slvpe);
+        masterslvcounts.push_back(2);
+    }
+
+}
+
+
+void GenMesh::calcNumPE() {
+
+    using Parallel::numpe;
+    using Parallel::mype;
+
+    // pick numpex, numpey such that PE blocks are as close to square
+    // as possible
+    // we would like:  gnzx / numpex == gnzy / numpey,
+    // where numpex * numpey = numpe (total number of PEs available)
+    // this solves to:  numpex = sqrt(numpe * gnzx / gnzy)
+    // we compute this, assuming gnzx <= gnzy (swap if necessary)
+    double nx = static_cast<double>(gnzx);
+    double ny = static_cast<double>(gnzy);
+    bool swapflag = (nx > ny);
+    if (swapflag) swap(nx, ny);
+    double n = sqrt(numpe * nx / ny);
+    // need to constrain n to be an integer with numpe % n == 0
+    // try rounding n both up and down
+    int n1 = floor(n + 1.e-12);
+    n1 = max(n1, 1);
+    while (numpe % n1 != 0) --n1;
+    int n2 = ceil(n - 1.e-12);
+    while (numpe % n2 != 0) ++n2;
+    // pick whichever of n1 and n2 gives blocks closest to square,
+    // i.e. gives the shortest long side
+    double longside1 = max(nx / n1, ny / (numpe/n1));
+    double longside2 = max(nx / n2, ny / (numpe/n2));
+    numpex = (longside1 <= longside2 ? n1 : n2);
+    numpey = numpe / numpex;
+    if (swapflag) swap(numpex, numpey);
+    mypex = mype % numpex;
+    mypey = mype / numpex;
+
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Hydro.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Hydro.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Hydro.hh
@@ -0,0 +1,219 @@
+/*
+ * Hydro.hh
+ *
+ *  Created on: Dec 22, 2011
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef HYDRO_HH_
+#define HYDRO_HH_
+
+#include <string>
+#include <vector>
+
+#include "Vec2.hh"
+
+// forward declarations
+class InputFile;
+class Mesh;
+class PolyGas;
+class TTS;
+class QCS;
+class HydroBC;
+
+
+class Hydro {
+public:
+
+    // associated mesh object
+    Mesh* mesh;
+
+    // children of this object
+    PolyGas* pgas;
+    TTS* tts;
+    QCS* qcs;
+    std::vector<HydroBC*> bcs;
+
+    double cfl;                 // Courant number, limits timestep
+    double cflv;                // volume change limit for timestep
+    double rinit;               // initial density for main mesh
+    double einit;               // initial energy for main mesh
+    double rinitsub;            // initial density in subregion
+    double einitsub;            // initial energy in subregion
+    double uinitradial;         // initial velocity in radial direction
+    std::vector<double> bcx;    // x values of x-plane fixed boundaries
+    std::vector<double> bcy;    // y values of y-plane fixed boundaries
+
+    double dtrec;               // maximum timestep for hydro
+    char msgdtrec[80];          // message:  reason for dtrec
+
+    double2* pu;       // point velocity
+    double2* pu0;      // point velocity, start of cycle
+    double2* pap;      // point acceleration
+    double2* pf;       // point force
+    double* pmaswt;    // point mass, weighted by 1/r
+    double* cmaswt;    // corner contribution to pmaswt
+
+    double* zm;        // zone mass
+    double* zr;        // zone density
+    double* zrp;       // zone density, middle of cycle
+    double* ze;        // zone specific internal energy
+                       // (energy per unit mass)
+    double* zetot;     // zone total internal energy
+    double* zw;        // zone work done in cycle
+    double* zwrate;    // zone work rate
+    double* zp;        // zone pressure
+    double* zss;       // zone sound speed
+    double* zdu;       // zone velocity difference
+
+    double2* sfp;      // side force from pressure
+    double2* sfq;      // side force from artificial visc.
+    double2* sft;      // side force from tts
+    double2* cftot;    // corner force, total from all sources
+
+    Hydro(const InputFile* inp, Mesh* m);
+    ~Hydro();
+
+    void init();
+
+    void initRadialVel(
+            const double vel,
+            const int pfirst,
+            const int plast);
+
+    void doCycle(const double dt);
+
+    void advPosHalf(
+            const double2* px0,
+            const double2* pu0,
+            const double dt,
+            double2* pxp,
+            const int pfirst,
+            const int plast);
+
+    void advPosFull(
+            const double2* px0,
+            const double2* pu0,
+            const double2* pa,
+            const double dt,
+            double2* px,
+            double2* pu,
+            const int pfirst,
+            const int plast);
+
+    void calcCrnrMass(
+            const double* zr,
+            const double* zarea,
+            const double* smf,
+            double* cmaswt,
+            const int sfirst,
+            const int slast);
+
+    void sumCrnrForce(
+            const double2* sf,
+            const double2* sf2,
+            const double2* sf3,
+            double2* cftot,
+            const int sfirst,
+            const int slast);
+
+    void calcAccel(
+            const double2* pf,
+            const double* pmass,
+            double2* pa,
+            const int pfirst,
+            const int plast);
+
+    void calcRho(
+            const double* zm,
+            const double* zvol,
+            double* zr,
+            const int zfirst,
+            const int zlast);
+
+    void calcWork(
+            const double2* sf,
+            const double2* sf2,
+            const double2* pu0,
+            const double2* pu,
+            const double2* px0,
+            const double dt,
+            double* zw,
+            double* zetot,
+            const int sfirst,
+            const int slast);
+
+    void calcWorkRate(
+            const double* zvol0,
+            const double* zvol,
+            const double* zw,
+            const double* zp,
+            const double dt,
+            double* zwrate,
+            const int zfirst,
+            const int zlast);
+
+    void calcEnergy(
+            const double* zetot,
+            const double* zm,
+            double* ze,
+            const int zfirst,
+            const int zlast);
+
+    void sumEnergy(
+            const double* zetot,
+            const double* zarea,
+            const double* zvol,
+            const double* zm,
+            const double* smf,
+            const double2* px,
+            const double2* pu,
+            double& ei,
+            double& ek,
+            const int zfirst,
+            const int zlast,
+            const int sfirst,
+            const int slast);
+
+    void calcDtCourant(
+            const double* zdl,
+            double& dtrec,
+            char* msgdtrec,
+            const int zfirst,
+            const int zlast);
+
+    void calcDtVolume(
+            const double* zvol,
+            const double* zvol0,
+            const double dtlast,
+            double& dtrec,
+            char* msgdtrec,
+            const int zfirst,
+            const int zlast);
+
+    void calcDtHydro(
+            const double* zdl,
+            const double* zvol,
+            const double* zvol0,
+            const double dtlast,
+            const int zfirst,
+            const int zlast);
+
+    void getDtHydro(
+            double& dtnew,
+            std::string& msgdtnew);
+
+    void resetDtHydro();
+
+    void writeEnergyCheck();
+
+}; // class Hydro
+
+
+
+#endif /* HYDRO_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Hydro.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Hydro.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Hydro.cc
@@ -0,0 +1,666 @@
+/*
+ * Hydro.cc
+ *
+ *  Created on: Dec 22, 2011
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "Hydro.hh"
+
+#include <string>
+#include <vector>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+#include <iomanip>
+
+#include "Parallel.hh"
+#include "Memory.hh"
+#include "InputFile.hh"
+#include "Mesh.hh"
+#include "PolyGas.hh"
+#include "TTS.hh"
+#include "QCS.hh"
+#include "HydroBC.hh"
+
+using namespace std;
+
+
+Hydro::Hydro(const InputFile* inp, Mesh* m) : mesh(m) {
+    cfl = inp->getDouble("cfl", 0.6);
+    cflv = inp->getDouble("cflv", 0.1);
+    rinit = inp->getDouble("rinit", 1.);
+    einit = inp->getDouble("einit", 0.);
+    rinitsub = inp->getDouble("rinitsub", 1.);
+    einitsub = inp->getDouble("einitsub", 0.);
+    uinitradial = inp->getDouble("uinitradial", 0.);
+    bcx = inp->getDoubleList("bcx", vector<double>());
+    bcy = inp->getDoubleList("bcy", vector<double>());
+
+    pgas = new PolyGas(inp, this);
+    tts = new TTS(inp, this);
+    qcs = new QCS(inp, this);
+
+    const double2 vfixx = double2(1., 0.);
+    const double2 vfixy = double2(0., 1.);
+    for (int i = 0; i < bcx.size(); ++i)
+        bcs.push_back(new HydroBC(mesh, vfixx, mesh->getXPlane(bcx[i])));
+    for (int i = 0; i < bcy.size(); ++i)
+        bcs.push_back(new HydroBC(mesh, vfixy, mesh->getYPlane(bcy[i])));
+
+    init();
+}
+
+
+Hydro::~Hydro() {
+
+    delete tts;
+    delete qcs;
+    for (int i = 0; i < bcs.size(); ++i) {
+        delete bcs[i];
+    }
+}
+
+
+void Hydro::init() {
+
+    const int numpch = mesh->numpch;
+    const int numzch = mesh->numzch;
+    const int nump = mesh->nump;
+    const int numz = mesh->numz;
+    const int nums = mesh->nums;
+
+    const double2* zx = mesh->zx;
+    const double* zvol = mesh->zvol;
+
+    // allocate arrays
+    pu = Memory::alloc<double2>(nump);
+    pu0 = Memory::alloc<double2>(nump);
+    pap = Memory::alloc<double2>(nump);
+    pf = Memory::alloc<double2>(nump);
+    pmaswt = Memory::alloc<double>(nump);
+    cmaswt = Memory::alloc<double>(nums);
+    zm = Memory::alloc<double>(numz);
+    zr = Memory::alloc<double>(numz);
+    zrp = Memory::alloc<double>(numz);
+    ze = Memory::alloc<double>(numz);
+    zetot = Memory::alloc<double>(numz);
+    zw = Memory::alloc<double>(numz);
+    zwrate = Memory::alloc<double>(numz);
+    zp = Memory::alloc<double>(numz);
+    zss = Memory::alloc<double>(numz);
+    zdu = Memory::alloc<double>(numz);
+    sfp = Memory::alloc<double2>(nums);
+    sfq = Memory::alloc<double2>(nums);
+    sft = Memory::alloc<double2>(nums);
+    cftot = Memory::alloc<double2>(nums);
+
+    // initialize hydro vars
+    #pragma omp parallel for schedule(static)
+    for (int zch = 0; zch < numzch; ++zch) {
+        int zfirst = mesh->zchzfirst[zch];
+        int zlast = mesh->zchzlast[zch];
+
+        fill(&zr[zfirst], &zr[zlast], rinit);
+        fill(&ze[zfirst], &ze[zlast], einit);
+        fill(&zwrate[zfirst], &zwrate[zlast], 0.);
+
+        const vector<double>& subrgn = mesh->subregion;
+        if (!subrgn.empty()) {
+            const double eps = 1.e-12;
+            #pragma ivdep
+            for (int z = zfirst; z < zlast; ++z) {
+                if (zx[z].x > (subrgn[0] - eps) &&
+                    zx[z].x < (subrgn[1] + eps) &&
+                    zx[z].y > (subrgn[2] - eps) &&
+                    zx[z].y < (subrgn[3] + eps)) {
+                    zr[z] = rinitsub;
+                    ze[z] = einitsub;
+                }
+            }
+        }
+
+        #pragma ivdep
+        for (int z = zfirst; z < zlast; ++z) {
+            zm[z] = zr[z] * zvol[z];
+            zetot[z] = ze[z] * zm[z];
+        }
+    }  // for sch
+
+    #pragma omp parallel for schedule(static)
+    for (int pch = 0; pch < numpch; ++pch) {
+        int pfirst = mesh->pchpfirst[pch];
+        int plast = mesh->pchplast[pch];
+        if (uinitradial != 0.)
+            initRadialVel(uinitradial, pfirst, plast);
+        else
+            fill(&pu[pfirst], &pu[plast], double2(0., 0.));
+    }  // for pch
+
+    resetDtHydro();
+
+}
+
+
+void Hydro::initRadialVel(
+        const double vel,
+        const int pfirst,
+        const int plast) {
+    const double2* px = mesh->px;
+    const double eps = 1.e-12;
+
+    #pragma ivdep
+    for (int p = pfirst; p < plast; ++p) {
+        double pmag = length(px[p]);
+        if (pmag > eps)
+            pu[p] = vel * px[p] / pmag;
+        else
+            pu[p] = double2(0., 0.);
+    }
+}
+
+
+void Hydro::doCycle(
+            const double dt) {
+
+    const int numpch = mesh->numpch;
+    const int numsch = mesh->numsch;
+    double2* px = mesh->px;
+    double2* ex = mesh->ex;
+    double2* zx = mesh->zx;
+    double* sarea = mesh->sarea;
+    double* svol = mesh->svol;
+    double* zarea = mesh->zarea;
+    double* zvol = mesh->zvol;
+    double* sareap = mesh->sareap;
+    double* svolp = mesh->svolp;
+    double* zareap = mesh->zareap;
+    double* zvolp = mesh->zvolp;
+    double* zvol0 = mesh->zvol0;
+    double2* ssurfp = mesh->ssurfp;
+    double* elen = mesh->elen;
+    double2* px0 = mesh->px0;
+    double2* pxp = mesh->pxp;
+    double2* exp = mesh->exp;
+    double2* zxp = mesh->zxp;
+    double* smf = mesh->smf;
+    double* zdl = mesh->zdl;
+
+    // Begin hydro cycle
+    #pragma omp parallel for schedule(static)
+    for (int pch = 0; pch < numpch; ++pch) {
+        int pfirst = mesh->pchpfirst[pch];
+        int plast = mesh->pchplast[pch];
+
+        // save off point variable values from previous cycle
+        copy(&px[pfirst], &px[plast], &px0[pfirst]);
+        copy(&pu[pfirst], &pu[plast], &pu0[pfirst]);
+
+        // ===== Predictor step =====
+        // 1. advance mesh to center of time step
+        advPosHalf(px0, pu0, dt, pxp, pfirst, plast);
+    } // for pch
+
+    #pragma omp parallel for schedule(static)
+    for (int sch = 0; sch < numsch; ++sch) {
+        int sfirst = mesh->schsfirst[sch];
+        int slast = mesh->schslast[sch];
+        int zfirst = mesh->schzfirst[sch];
+        int zlast = mesh->schzlast[sch];
+
+        // save off zone variable values from previous cycle
+        copy(&zvol[zfirst], &zvol[zlast], &zvol0[zfirst]);
+
+        // 1a. compute new mesh geometry
+        mesh->calcCtrs(pxp, exp, zxp, sfirst, slast);
+        mesh->calcVols(pxp, zxp, sareap, svolp, zareap, zvolp,
+                sfirst, slast);
+        mesh->calcSurfVecs(zxp, exp, ssurfp, sfirst, slast);
+        mesh->calcEdgeLen(pxp, elen, sfirst, slast);
+        mesh->calcCharLen(sareap, zdl, sfirst, slast);
+
+        // 2. compute point masses
+        calcRho(zm, zvolp, zrp, zfirst, zlast);
+        calcCrnrMass(zrp, zareap, smf, cmaswt, sfirst, slast);
+
+        // 3. compute material state (half-advanced)
+        pgas->calcStateAtHalf(zr, zvolp, zvol0, ze, zwrate, zm, dt,
+                zp, zss, zfirst, zlast);
+
+        // 4. compute forces
+        pgas->calcForce(zp, ssurfp, sfp, sfirst, slast);
+        tts->calcForce(zareap, zrp, zss, sareap, smf, ssurfp, sft,
+                sfirst, slast);
+        qcs->calcForce(sfq, sfirst, slast);
+        sumCrnrForce(sfp, sfq, sft, cftot, sfirst, slast);
+    }  // for sch
+    mesh->checkBadSides();
+
+    // sum corner masses, forces to points
+    mesh->sumToPoints(cmaswt, pmaswt);
+    mesh->sumToPoints(cftot, pf);
+
+    #pragma omp parallel for schedule(static)
+    for (int pch = 0; pch < numpch; ++pch) {
+        int pfirst = mesh->pchpfirst[pch];
+        int plast = mesh->pchplast[pch];
+
+        // 4a. apply boundary conditions
+        for (int i = 0; i < bcs.size(); ++i) {
+            int bfirst = bcs[i]->pchbfirst[pch];
+            int blast = bcs[i]->pchblast[pch];
+            bcs[i]->applyFixedBC(pu0, pf, bfirst, blast);
+        }
+
+        // 5. compute accelerations
+        calcAccel(pf, pmaswt, pap, pfirst, plast);
+
+        // ===== Corrector step =====
+        // 6. advance mesh to end of time step
+        advPosFull(px0, pu0, pap, dt, px, pu, pfirst, plast);
+    }  // for pch
+
+    resetDtHydro();
+
+    #pragma omp parallel for schedule(static)
+    for (int sch = 0; sch < numsch; ++sch) {
+        int sfirst = mesh->schsfirst[sch];
+        int slast = mesh->schslast[sch];
+        int zfirst = mesh->schzfirst[sch];
+        int zlast = mesh->schzlast[sch];
+
+        // 6a. compute new mesh geometry
+        mesh->calcCtrs(px, ex, zx, sfirst, slast);
+        mesh->calcVols(px, zx, sarea, svol, zarea, zvol,
+                sfirst, slast);
+
+        // 7. compute work
+        fill(&zw[zfirst], &zw[zlast], 0.);
+        calcWork(sfp, sfq, pu0, pu, pxp, dt, zw, zetot,
+                sfirst, slast);
+    }  // for sch
+    mesh->checkBadSides();
+
+    #pragma omp parallel for schedule(static)
+    for (int zch = 0; zch < mesh->numzch; ++zch) {
+        int zfirst = mesh->zchzfirst[zch];
+        int zlast = mesh->zchzlast[zch];
+
+        // 7a. compute work rate
+        calcWorkRate(zvol0, zvol, zw, zp, dt, zwrate, zfirst, zlast);
+
+        // 8. update state variables
+        calcEnergy(zetot, zm, ze, zfirst, zlast);
+        calcRho(zm, zvol, zr, zfirst, zlast);
+
+        // 9.  compute timestep for next cycle
+        calcDtHydro(zdl, zvol, zvol0, dt, zfirst, zlast);
+    }  // for zch
+
+}
+
+
+void Hydro::advPosHalf(
+        const double2* px0,
+        const double2* pu0,
+        const double dt,
+        double2* pxp,
+        const int pfirst,
+        const int plast) {
+
+    double dth = 0.5 * dt;
+
+    #pragma ivdep
+    for (int p = pfirst; p < plast; ++p) {
+        pxp[p] = px0[p] + pu0[p] * dth;
+    }
+}
+
+
+void Hydro::advPosFull(
+        const double2* px0,
+        const double2* pu0,
+        const double2* pa,
+        const double dt,
+        double2* px,
+        double2* pu,
+        const int pfirst,
+        const int plast) {
+
+    #pragma ivdep
+    for (int p = pfirst; p < plast; ++p) {
+        pu[p] = pu0[p] + pa[p] * dt;
+        px[p] = px0[p] + 0.5 * (pu[p] + pu0[p]) * dt;
+    }
+
+}
+
+
+void Hydro::calcCrnrMass(
+        const double* zr,
+        const double* zarea,
+        const double* smf,
+        double* cmaswt,
+        const int sfirst,
+        const int slast) {
+
+    #pragma ivdep
+    for (int s = sfirst; s < slast; ++s) {
+        int s3 = mesh->mapss3[s];
+        int z = mesh->mapsz[s];
+
+        double m = zr[z] * zarea[z] * 0.5 * (smf[s] + smf[s3]);
+        cmaswt[s] = m;
+    }
+}
+
+
+void Hydro::sumCrnrForce(
+        const double2* sf,
+        const double2* sf2,
+        const double2* sf3,
+        double2* cftot,
+        const int sfirst,
+        const int slast) {
+
+    #pragma ivdep
+    for (int s = sfirst; s < slast; ++s) {
+        int s3 = mesh->mapss3[s];
+
+        double2 f = (sf[s] + sf2[s] + sf3[s]) -
+                    (sf[s3] + sf2[s3] + sf3[s3]);
+        cftot[s] = f;
+    }
+}
+
+
+void Hydro::calcAccel(
+        const double2* pf,
+        const double* pmass,
+        double2* pa,
+        const int pfirst,
+        const int plast) {
+
+    const double fuzz = 1.e-99;
+
+    #pragma ivdep
+    for (int p = pfirst; p < plast; ++p) {
+        pa[p] = pf[p] / max(pmass[p], fuzz);
+    }
+
+}
+
+
+void Hydro::calcRho(
+        const double* zm,
+        const double* zvol,
+        double* zr,
+        const int zfirst,
+        const int zlast) {
+
+    #pragma ivdep
+    for (int z = zfirst; z < zlast; ++z) {
+        zr[z] = zm[z] / zvol[z];
+    }
+
+}
+
+
+void Hydro::calcWork(
+        const double2* sf,
+        const double2* sf2,
+        const double2* pu0,
+        const double2* pu,
+        const double2* px,
+        const double dt,
+        double* zw,
+        double* zetot,
+        const int sfirst,
+        const int slast) {
+
+    // Compute the work done by finding, for each element/node pair,
+    //   dwork= force * vavg
+    // where force is the force of the element on the node
+    // and vavg is the average velocity of the node over the time period
+
+    const double dth = 0.5 * dt;
+
+    for (int s = sfirst; s < slast; ++s) {
+        int p1 = mesh->mapsp1[s];
+        int p2 = mesh->mapsp2[s];
+        int z = mesh->mapsz[s];
+
+        double2 sftot = sf[s] + sf2[s];
+        double sd1 = dot( sftot, (pu0[p1] + pu[p1]));
+        double sd2 = dot(-sftot, (pu0[p2] + pu[p2]));
+        double dwork = -dth * (sd1 * px[p1].x + sd2 * px[p2].x);
+
+        zetot[z] += dwork;
+        zw[z] += dwork;
+
+    }
+
+}
+
+
+void Hydro::calcWorkRate(
+        const double* zvol0,
+        const double* zvol,
+        const double* zw,
+        const double* zp,
+        const double dt,
+        double* zwrate,
+        const int zfirst,
+        const int zlast) {
+    double dtinv = 1. / dt;
+    #pragma ivdep
+    for (int z = zfirst; z < zlast; ++z) {
+        double dvol = zvol[z] - zvol0[z];
+        zwrate[z] = (zw[z] + zp[z] * dvol) * dtinv;
+    }
+
+}
+
+
+void Hydro::calcEnergy(
+        const double* zetot,
+        const double* zm,
+        double* ze,
+        const int zfirst,
+        const int zlast) {
+
+    const double fuzz = 1.e-99;
+    #pragma ivdep
+    for (int z = zfirst; z < zlast; ++z) {
+        ze[z] = zetot[z] / (zm[z] + fuzz);
+    }
+
+}
+
+
+void Hydro::sumEnergy(
+        const double* zetot,
+        const double* zarea,
+        const double* zvol,
+        const double* zm,
+        const double* smf,
+        const double2* px,
+        const double2* pu,
+        double& ei,
+        double& ek,
+        const int zfirst,
+        const int zlast,
+        const int sfirst,
+        const int slast) {
+
+    // compute internal energy
+    double sumi = 0.; 
+    for (int z = zfirst; z < zlast; ++z) {
+        sumi += zetot[z];
+    }
+    // multiply by 2\pi for cylindrical geometry
+    ei += sumi * 2 * M_PI;
+
+    // compute kinetic energy
+    // in each individual zone:
+    // zone ke = zone mass * (volume-weighted average of .5 * u ^ 2)
+    //         = zm sum(c in z) [cvol / zvol * .5 * u ^ 2]
+    //         = sum(c in z) [zm * cvol / zvol * .5 * u ^ 2]
+    double sumk = 0.; 
+    for (int s = sfirst; s < slast; ++s) {
+        int s3 = mesh->mapss3[s];
+        int p1 = mesh->mapsp1[s];
+        int z = mesh->mapsz[s];
+
+        double cvol = zarea[z] * px[p1].x * 0.5 * (smf[s] + smf[s3]);
+        double cke = zm[z] * cvol / zvol[z] * 0.5 * length2(pu[p1]);
+        sumk += cke;
+    }
+    // multiply by 2\pi for cylindrical geometry
+    ek += sumk * 2 * M_PI;
+
+}
+
+
+void Hydro::calcDtCourant(
+        const double* zdl,
+        double& dtrec,
+        char* msgdtrec,
+        const int zfirst,
+        const int zlast) {
+
+    const double fuzz = 1.e-99;
+    double dtnew = 1.e99;
+    int zmin = -1;
+    for (int z = zfirst; z < zlast; ++z) {
+        double cdu = max(zdu[z], max(zss[z], fuzz));
+        double zdthyd = zdl[z] * cfl / cdu;
+        zmin = (zdthyd < dtnew ? z : zmin);
+        dtnew = (zdthyd < dtnew ? zdthyd : dtnew);
+    }
+
+    if (dtnew < dtrec) {
+        dtrec = dtnew;
+        snprintf(msgdtrec, 80, "Hydro Courant limit for z = %d", zmin);
+    }
+
+}
+
+
+void Hydro::calcDtVolume(
+        const double* zvol,
+        const double* zvol0,
+        const double dtlast,
+        double& dtrec,
+        char* msgdtrec,
+        const int zfirst,
+        const int zlast) {
+
+    double dvovmax = 1.e-99;
+    int zmax = -1;
+    for (int z = zfirst; z < zlast; ++z) {
+        double zdvov = abs((zvol[z] - zvol0[z]) / zvol0[z]);
+        zmax = (zdvov > dvovmax ? z : zmax);
+        dvovmax = (zdvov > dvovmax ? zdvov : dvovmax);
+    }
+    double dtnew = dtlast * cflv / dvovmax;
+    if (dtnew < dtrec) {
+        dtrec = dtnew;
+        snprintf(msgdtrec, 80, "Hydro dV/V limit for z = %d", zmax);
+    }
+
+}
+
+
+void Hydro::calcDtHydro(
+        const double* zdl,
+        const double* zvol,
+        const double* zvol0,
+        const double dtlast,
+        const int zfirst,
+        const int zlast) {
+
+    double dtchunk = 1.e99;
+    char msgdtchunk[80];
+
+    calcDtCourant(zdl, dtchunk, msgdtchunk, zfirst, zlast);
+    calcDtVolume(zvol, zvol0, dtlast, dtchunk, msgdtchunk,
+            zfirst, zlast);
+    if (dtchunk < dtrec) {
+        #pragma omp critical
+        {
+            // redundant test needed to avoid race condition
+            if (dtchunk < dtrec) {
+                dtrec = dtchunk;
+                strncpy(msgdtrec, msgdtchunk, 80);
+            }
+        }
+    }
+
+}
+
+
+void Hydro::getDtHydro(
+        double& dtnew,
+        string& msgdtnew) {
+
+    if (dtrec < dtnew) {
+        dtnew = dtrec;
+        msgdtnew = string(msgdtrec);
+    }
+
+}
+
+
+void Hydro::resetDtHydro() {
+
+    dtrec = 1.e99;
+    strcpy(msgdtrec, "Hydro default");
+
+}
+
+
+void Hydro::writeEnergyCheck() {
+
+    using Parallel::mype;
+    
+    double ei = 0.;
+    double ek = 0.;
+    #pragma omp parallel for schedule(static)
+    for (int sch = 0; sch < mesh->numsch; ++sch) {
+        int sfirst = mesh->schsfirst[sch];
+        int slast = mesh->schslast[sch];
+        int zfirst = mesh->schzfirst[sch];
+        int zlast = mesh->schzlast[sch];
+
+        double eichunk = 0.;
+        double ekchunk = 0.;
+        sumEnergy(zetot, mesh->zarea, mesh->zvol, zm, mesh->smf,
+                mesh->px, pu, eichunk, ekchunk,
+                zfirst, zlast, sfirst, slast);
+        #pragma omp critical
+        {
+            ei += eichunk;
+            ek += ekchunk;
+        }
+    }
+
+    Parallel::globalSum(ei);
+    Parallel::globalSum(ek);
+
+    if (mype == 0) {
+        cout << scientific << setprecision(6);
+        cout << "Energy check:  "
+             << "total energy  = " << setw(14) << ei + ek << endl;
+        cout << "(internal = " << setw(14) << ei
+             << ", kinetic = " << setw(14) << ek << ")" << endl;
+    }
+
+ }
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/HydroBC.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/HydroBC.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/HydroBC.hh
@@ -0,0 +1,52 @@
+/*
+ * HydroBC.hh
+ *
+ *  Created on: Jan 13, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef HYDROBC_HH_
+#define HYDROBC_HH_
+
+#include <vector>
+
+#include "Vec2.hh"
+
+// forward declarations
+class Mesh;
+
+
+class HydroBC {
+public:
+
+    // associated mesh object
+    Mesh* mesh;
+
+    int numb;                      // number of bdy points
+    double2 vfix;                  // vector perp. to fixed plane
+    int* mapbp;                    // map: bdy point -> point
+    std::vector<int> pchbfirst;    // start/stop index for bdy pt chunks
+    std::vector<int> pchblast;
+
+    HydroBC(
+            Mesh* msh,
+            const double2 v,
+            const std::vector<int>& mbp);
+
+    ~HydroBC();
+
+    void applyFixedBC(
+            double2* pu,
+            double2* pf,
+            const int bfirst,
+            const int blast);
+
+}; // class HydroBC
+
+
+#endif /* HYDROBC_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/HydroBC.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/HydroBC.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/HydroBC.cc
@@ -0,0 +1,53 @@
+/*
+ * HydroBC.cc
+ *
+ *  Created on: Jan 13, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "HydroBC.hh"
+
+#include "Memory.hh"
+#include "Mesh.hh"
+
+using namespace std;
+
+
+HydroBC::HydroBC(
+        Mesh* msh,
+        const double2 v,
+        const vector<int>& mbp)
+    : mesh(msh), numb(mbp.size()), vfix(v) {
+
+    mapbp = Memory::alloc<int>(numb);
+    copy(mbp.begin(), mbp.end(), mapbp);
+
+    mesh->getPlaneChunks(numb, mapbp, pchbfirst, pchblast);
+
+}
+
+
+HydroBC::~HydroBC() {}
+
+
+void HydroBC::applyFixedBC(
+        double2* pu,
+        double2* pf,
+        const int bfirst,
+        const int blast) {
+
+    #pragma ivdep
+    for (int b = bfirst; b < blast; ++b) {
+        int p = mapbp[b];
+
+        pu[p] = project(pu[p], vfix);
+        pf[p] = project(pf[p], vfix);
+    }
+
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/InputFile.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/InputFile.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/InputFile.hh
@@ -0,0 +1,45 @@
+/*
+ * InputFile.hh
+ *
+ *  Created on: Mar 20, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef INPUTFILE_HH_
+#define INPUTFILE_HH_
+
+#include <string>
+#include <vector>
+#include <map>
+
+
+class InputFile {
+public:
+    InputFile(const char* filename);
+    ~InputFile();
+    int getInt(const std::string& key, const int dflt) const;
+    double getDouble(const std::string& key, const double dflt) const;
+    std::string getString(const std::string& key,
+            const std::string& dflt) const;
+    std::vector<double> getDoubleList(
+            const std::string& key,
+            const std::vector<double>& dflt) const;
+
+private:
+    typedef std::map<std::string, std::string> pairstype;
+
+    pairstype pairs;               // map of key-value string pairs
+
+    template <typename T>
+    T get(const std::string& key, const T& dflt) const;
+
+
+}; // class InputFile
+
+
+#endif /* INPUTFILE_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/InputFile.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/InputFile.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/InputFile.cc
@@ -0,0 +1,109 @@
+/*
+ * InputFile.cc
+ *
+ *  Created on: Mar 20, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "InputFile.hh"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <cstdlib>
+
+#include "Parallel.hh"
+
+using namespace std;
+
+
+InputFile::InputFile(const char* filename) {
+
+    using Parallel::mype;
+
+    ifstream ifs(filename);
+    if (!ifs.good())
+    {
+        if (mype == 0)
+            cerr << "File " << filename << " not found" << endl;
+        exit(1);
+    }
+
+    while (true)
+    {
+        string line;
+        getline(ifs, line);
+        if (ifs.eof()) break;
+
+        istringstream iss(line);
+        string key;
+
+        iss >> key;
+        if (key.empty() || key[0] == '#')
+          continue;
+
+        if (pairs.find(key) != pairs.end()) {
+            if (mype == 0)
+                cerr << "Duplicate key " << key << " in input file" << endl;
+            exit(1);
+        }
+
+        string val;
+        getline(iss, val);
+        pairs[key] = val;
+
+    } // while true
+
+    ifs.close();
+
+}
+
+
+InputFile::~InputFile() {}
+
+
+template <typename T>
+T InputFile::get(const string& key, const T& dflt) const {
+    pairstype::const_iterator itr = pairs.find(key);
+    if (itr == pairs.end())
+        return dflt;
+    istringstream iss(itr->second);
+    T val;
+    iss >> val;
+    return val;
+}
+
+
+int InputFile::getInt(const string& key, const int dflt) const {
+    return get(key, dflt);
+}
+
+
+double InputFile::getDouble(const string& key, const double dflt) const {
+    return get(key, dflt);
+}
+
+
+string InputFile::getString(const string& key, const string& dflt) const {
+    return get(key, dflt);
+}
+
+
+vector<double> InputFile::getDoubleList(
+        const string& key,
+        const vector<double>& dflt) const {
+    pairstype::const_iterator itr = pairs.find(key);
+    if (itr == pairs.end())
+        return dflt;
+    istringstream iss(itr->second);
+    vector<double> vallist;
+    double val;
+    while (iss >> val) vallist.push_back(val);
+    return vallist;
+}
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/LICENSE
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/LICENSE
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/LICENSE
@@ -0,0 +1,44 @@
+Copyright (c) 2012, Los Alamos National Security, LLC.
+All rights reserved.
+
+Copyright 2012. Los Alamos National Security, LLC.
+This software was produced under U.S. Government contract
+DE-AC52-06NA25396 for Los Alamos National Laboratory (LANL), which is
+operated by Los Alamos National Security, LLC for the U.S. Department
+of Energy. The U.S. Government has rights to use, reproduce, and
+distribute this software. NEITHER THE GOVERNMENT NOR LOS ALAMOS
+NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is
+modified to produce derivative works, such modified software should be
+clearly marked, so as not to confuse it with the version available from
+LANL.
+
+Additionally, redistribution and use in source and binary forms, with
+or without modification, are permitted provided that the following
+conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+3. Neither the name of Los Alamos National Security, LLC, Los Alamos
+   National Laboratory, LANL, the U.S. Government, nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
+BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS
+NATIONAL SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Memory.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Memory.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Memory.hh
@@ -0,0 +1,49 @@
+/*
+ * Memory.hh
+ *
+ *  Created on: Jul 3, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef MEMORY_HH_
+#define MEMORY_HH_
+
+#include <cstdlib>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+
+// Namespace Memory provides functions to allocate and free memory.
+// Currently these are just wrappers around std::malloc and free,
+// but they are abstracted here to make it easier to replace them
+// if needed.
+
+namespace Memory {
+
+template<typename T>
+inline T* alloc(const int count) {
+#if defined(_OPENMP) && defined(__INTEL_COMPILER)
+    return (T*) kmp_malloc(count * sizeof(T));
+#else
+    return (T*) std::malloc(count * sizeof(T));
+#endif
+}
+
+template<typename T>
+inline void free(T* ptr) {
+#if defined(_OPENMP) && defined(__INTEL_COMPILER)
+    kmp_free(ptr);
+#else
+    std::free(ptr);
+#endif
+}
+
+};  // namespace Memory
+
+#endif /* MEMORY_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Mesh.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Mesh.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Mesh.hh
@@ -0,0 +1,245 @@
+/*
+ * Mesh.hh
+ *
+ *  Created on: Jan 5, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef MESH_HH_
+#define MESH_HH_
+
+#include <string>
+#include <vector>
+
+#include "Vec2.hh"
+
+// forward declarations
+class InputFile;
+class GenMesh;
+class WriteXY;
+class ExportGold;
+
+
+class Mesh {
+public:
+
+    // children
+    GenMesh* gmesh;
+    WriteXY* wxy;
+    ExportGold* egold;
+
+    // parameters
+    int chunksize;                 // max size for processing chunks
+    std::vector<double> subregion; // bounding box for a subregion
+                                   // if nonempty, should have 4 entries:
+                                   // xmin, xmax, ymin, ymax
+    bool writexy;                  // flag:  write .xy file?
+    bool writegold;                // flag:  write Ensight file?
+
+    // mesh variables
+    // (See documentation for more details on the mesh
+    //  data structures...)
+    int nump, nume, numz, nums, numc;
+                       // number of points, edges, zones,
+                       // sides, corners, resp.
+    int numsbad;       // number of bad sides (negative volume)
+    int* mapsp1;       // maps: side -> points 1 and 2
+    int* mapsp2;
+    int* mapsz;        // map: side -> zone
+    int* mapse;        // map: side -> edge
+    int* mapss3;       // map: side -> previous side
+    int* mapss4;       // map: side -> next side
+
+    // point-to-corner inverse map is stored as a linked list...
+    int* mappcfirst;   // map:  point -> first corner
+    int* mapccnext;    // map:  corner -> next corner
+
+    // mpi comm variables
+    int nummstrpe;     // number of messages mype sends to master pes
+    int numslvpe;      // number of messages mype receives from slave pes
+    int numprx;        // number of proxies on mype
+    int numslv;        // number of slaves on mype
+    int* mapslvpepe;   // map: slave pe -> (global) pe
+    int* mapslvpeprx1; // map: slave pe -> first proxy in proxy buffer
+    int* mapprxp;      // map: proxy -> corresponding (master) point
+    int* slvpenumprx;  // number of proxies for each slave pe
+    int* mapmstrpepe;  // map: master pe -> (global) pe
+    int* mstrpenumslv; // number of slaves for each master pe
+    int* mapmstrpeslv1;// map: master pe -> first slave in slave buffer
+    int* mapslvp;      // map: slave -> corresponding (slave) point
+
+    int* znump;        // number of points in zone
+
+    double2* px;       // point coordinates
+    double2* ex;       // edge center coordinates
+    double2* zx;       // zone center coordinates
+    double2* pxp;      // point coords, middle of cycle
+    double2* exp;      // edge ctr coords, middle of cycle
+    double2* zxp;      // zone ctr coords, middle of cycle
+    double2* px0;      // point coords, start of cycle
+
+    double* sarea;     // side area
+    double* svol;      // side volume
+    double* zarea;     // zone area
+    double* zvol;      // zone volume
+    double* sareap;    // side area, middle of cycle
+    double* svolp;     // side volume, middle of cycle
+    double* zareap;    // zone area, middle of cycle
+    double* zvolp;     // zone volume, middle of cycle
+    double* zvol0;     // zone volume, start of cycle
+
+    double2* ssurfp;   // side surface vector
+    double* elen;      // edge length
+    double* smf;       // side mass fraction
+    double* zdl;       // zone characteristic length
+
+    int numsch;                    // number of side chunks
+    std::vector<int> schsfirst;    // start/stop index for side chunks
+    std::vector<int> schslast;
+    std::vector<int> schzfirst;    // start/stop index for zone chunks
+    std::vector<int> schzlast;
+    int numpch;                    // number of point chunks
+    std::vector<int> pchpfirst;    // start/stop index for point chunks
+    std::vector<int> pchplast;
+    int numzch;                    // number of zone chunks
+    std::vector<int> zchzfirst;    // start/stop index for zone chunks
+    std::vector<int> zchzlast;
+
+    Mesh(const InputFile* inp);
+    ~Mesh();
+
+    void init();
+
+    // populate mapping arrays
+    void initSides(
+            const std::vector<int>& cellstart,
+            const std::vector<int>& cellsize,
+            const std::vector<int>& cellnodes);
+    void initEdges();
+
+    // populate chunk information
+    void initChunks();
+
+    // populate inverse map
+    void initInvMap();
+
+    void initParallel(
+            const std::vector<int>& slavemstrpes,
+            const std::vector<int>& slavemstrcounts,
+            const std::vector<int>& slavepoints,
+            const std::vector<int>& masterslvpes,
+            const std::vector<int>& masterslvcounts,
+            const std::vector<int>& masterpoints);
+
+    // write mesh statistics
+    void writeStats();
+
+    // write mesh
+    void write(
+            const std::string& probname,
+            const int cycle,
+            const double time,
+            const double* zr,
+            const double* ze,
+            const double* zp);
+
+    // find plane with constant x, y value
+    std::vector<int> getXPlane(const double c);
+    std::vector<int> getYPlane(const double c);
+
+    // compute chunks for a given plane
+    void getPlaneChunks(
+            const int numb,
+            const int* mapbp,
+            std::vector<int>& pchbfirst,
+            std::vector<int>& pchblast);
+
+    // compute edge, zone centers
+    void calcCtrs(
+            const double2* px,
+            double2* ex,
+            double2* zx,
+            const int sfirst,
+            const int slast);
+
+    // compute side, corner, zone volumes
+    void calcVols(
+            const double2* px,
+            const double2* zx,
+            double* sarea,
+            double* svol,
+            double* zarea,
+            double* zvol,
+            const int sfirst,
+            const int slast);
+
+    // check to see if previous volume computation had any
+    // sides with negative volumes
+    void checkBadSides();
+
+    // compute side mass fractions
+    void calcSideFracs(
+            const double* sarea,
+            const double* zarea,
+            double* smf,
+            const int sfirst,
+            const int slast);
+
+    // compute surface vectors for median mesh
+    void calcSurfVecs(
+            const double2* zx,
+            const double2* ex,
+            double2* ssurf,
+            const int sfirst,
+            const int slast);
+
+    // compute edge lengths
+    void calcEdgeLen(
+            const double2* px,
+            double* elen,
+            const int sfirst,
+            const int slast);
+
+    // compute characteristic lengths
+    void calcCharLen(
+            const double* sarea,
+            double* zdl,
+            const int sfirst,
+            const int slast);
+
+    // sum corner variables to points (double or double2)
+    template <typename T>
+    void sumToPoints(
+            const T* cvar,
+            T* pvar);
+
+    // helper routines for sumToPoints
+    template <typename T>
+    void sumOnProc(
+            const T* cvar,
+            T* pvar);
+    template <typename T>
+    void sumAcrossProcs(T* pvar);
+    template <typename T>
+    void parallelGather(
+            const T* pvar,
+            T* prxvar);
+    template <typename T>
+    void parallelSum(
+            T* pvar,
+            T* prxvar);
+    template <typename T>
+    void parallelScatter(
+            T* pvar,
+            const T* prxvar);
+
+}; // class Mesh
+
+
+
+#endif /* MESH_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Mesh.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Mesh.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Mesh.cc
@@ -0,0 +1,790 @@
+/*
+ * Mesh.cc
+ *
+ *  Created on: Jan 5, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "Mesh.hh"
+
+#include <stdint.h>
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+
+#include "Vec2.hh"
+#include "Memory.hh"
+#include "Parallel.hh"
+#include "InputFile.hh"
+#include "GenMesh.hh"
+#include "WriteXY.hh"
+#include "ExportGold.hh"
+
+using namespace std;
+
+
+Mesh::Mesh(const InputFile* inp) :
+    gmesh(NULL), egold(NULL), wxy(NULL) {
+
+    using Parallel::mype;
+
+    chunksize = inp->getInt("chunksize", 0);
+    if (chunksize < 0) {
+        if (mype == 0)
+            cerr << "Error: bad chunksize " << chunksize << endl;
+        exit(1);
+    }
+
+    subregion = inp->getDoubleList("subregion", vector<double>());
+    if (subregion.size() != 0 && subregion.size() != 4) {
+        if (mype == 0)
+            cerr << "Error:  subregion must have 4 entries" << endl;
+        exit(1);
+    }
+
+    writexy = inp->getInt("writexy", 0);
+    writegold = inp->getInt("writegold", 0);
+
+    gmesh = new GenMesh(inp);
+    wxy = new WriteXY(this);
+    egold = new ExportGold(this);
+
+    init();
+}
+
+
+Mesh::~Mesh() {
+    delete gmesh;
+    delete wxy;
+    delete egold;
+}
+
+
+void Mesh::init() {
+
+    // generate mesh
+    vector<double2> nodepos;
+    vector<int> cellstart, cellsize, cellnodes;
+    vector<int> slavemstrpes, slavemstrcounts, slavepoints;
+    vector<int> masterslvpes, masterslvcounts, masterpoints;
+    gmesh->generate(nodepos, cellstart, cellsize, cellnodes,
+            slavemstrpes, slavemstrcounts, slavepoints,
+            masterslvpes, masterslvcounts, masterpoints);
+
+    nump = nodepos.size();
+    numz = cellstart.size();
+    nums = cellnodes.size();
+    numc = nums;
+
+    // copy cell sizes to mesh
+    znump = Memory::alloc<int>(numz);
+    copy(cellsize.begin(), cellsize.end(), znump);
+
+    // populate maps:
+    // use the cell* arrays to populate the side maps
+    initSides(cellstart, cellsize, cellnodes);
+    // release memory from cell* arrays
+    cellstart.resize(0);
+    cellsize.resize(0);
+    cellnodes.resize(0);
+    // now populate edge maps using side maps
+    initEdges();
+
+    // populate chunk information
+    initChunks();
+
+    // create inverse map for corner-to-point gathers
+    initInvMap();
+
+    // calculate parallel data structures
+    initParallel(slavemstrpes, slavemstrcounts, slavepoints,
+            masterslvpes, masterslvcounts, masterpoints);
+    // release memory from parallel-related arrays
+    slavemstrpes.resize(0);
+    slavemstrcounts.resize(0);
+    slavepoints.resize(0);
+    masterslvpes.resize(0);
+    masterslvcounts.resize(0);
+    masterpoints.resize(0);
+
+    // write mesh statistics
+    writeStats();
+
+    // allocate remaining arrays
+    px = Memory::alloc<double2>(nump);
+    ex = Memory::alloc<double2>(nume);
+    zx = Memory::alloc<double2>(numz);
+    px0 = Memory::alloc<double2>(nump);
+    pxp = Memory::alloc<double2>(nump);
+    exp = Memory::alloc<double2>(nume);
+    zxp = Memory::alloc<double2>(numz);
+    sarea = Memory::alloc<double>(nums);
+    svol = Memory::alloc<double>(nums);
+    zarea = Memory::alloc<double>(numz);
+    zvol = Memory::alloc<double>(numz);
+    sareap = Memory::alloc<double>(nums);
+    svolp = Memory::alloc<double>(nums);
+    zareap = Memory::alloc<double>(numz);
+    zvolp = Memory::alloc<double>(numz);
+    zvol0 = Memory::alloc<double>(numz);
+    ssurfp = Memory::alloc<double2>(nums);
+    elen = Memory::alloc<double>(nume);
+    zdl = Memory::alloc<double>(numz);
+    smf = Memory::alloc<double>(nums);
+
+    // do a few initial calculations
+    #pragma omp parallel for schedule(static)
+    for (int pch = 0; pch < numpch; ++pch) {
+        int pfirst = pchpfirst[pch];
+        int plast = pchplast[pch];
+        // copy nodepos into px, distributed across threads
+        for (int p = pfirst; p < plast; ++p)
+            px[p] = nodepos[p];
+
+    }
+
+    numsbad = 0;
+    #pragma omp parallel for schedule(static)
+    for (int sch = 0; sch < numsch; ++sch) {
+        int sfirst = schsfirst[sch];
+        int slast = schslast[sch];
+        calcCtrs(px, ex, zx, sfirst, slast);
+        calcVols(px, zx, sarea, svol, zarea, zvol, sfirst, slast);
+        calcSideFracs(sarea, zarea, smf, sfirst, slast);
+    }
+    checkBadSides();
+
+}
+
+
+void Mesh::initSides(
+        const vector<int>& cellstart,
+        const vector<int>& cellsize,
+        const vector<int>& cellnodes) {
+
+    mapsp1 = Memory::alloc<int>(nums);
+    mapsp2 = Memory::alloc<int>(nums);
+    mapsz  = Memory::alloc<int>(nums);
+    mapss3 = Memory::alloc<int>(nums);
+    mapss4 = Memory::alloc<int>(nums);
+
+    for (int z = 0; z < numz; ++z) {
+        int sbase = cellstart[z];
+        int size = cellsize[z];
+        for (int n = 0; n < size; ++n) {
+            int s = sbase + n;
+            int snext = sbase + (n + 1 == size ? 0 : n + 1);
+            int slast = sbase + (n == 0 ? size : n) - 1;
+            mapsz[s] = z;
+            mapsp1[s] = cellnodes[s];
+            mapsp2[s] = cellnodes[snext];
+            mapss3[s] = slast;
+            mapss4[s] = snext;
+        } // for n
+    } // for z
+
+}
+
+
+void Mesh::initEdges() {
+
+    vector<vector<int> > edgepp(nump), edgepe(nump);
+
+    mapse = Memory::alloc<int>(nums);
+
+    int e = 0;
+    for (int s = 0; s < nums; ++s) {
+        int p1 = min(mapsp1[s], mapsp2[s]);
+        int p2 = max(mapsp1[s], mapsp2[s]);
+
+        vector<int>& vpp = edgepp[p1];
+        vector<int>& vpe = edgepe[p1];
+        int i = find(vpp.begin(), vpp.end(), p2) - vpp.begin();
+        if (i == vpp.size()) {
+            // (p, p2) isn't in the edge list - add it
+            vpp.push_back(p2);
+            vpe.push_back(e);
+            ++e;
+        }
+        mapse[s] = vpe[i];
+    }  // for s
+
+    nume = e;
+
+}
+
+
+void Mesh::initChunks() {
+
+    if (chunksize == 0) chunksize = max(nump, nums);
+
+    // compute side chunks
+    // use 'chunksize' for maximum chunksize; decrease as needed
+    // to ensure that no zone has its sides split across chunk
+    // boundaries
+    int s1, s2 = 0;
+    while (s2 < nums) {
+        s1 = s2;
+        s2 = min(s2 + chunksize, nums);
+        while (s2 < nums && mapsz[s2] == mapsz[s2-1])
+            --s2;
+        schsfirst.push_back(s1);
+        schslast.push_back(s2);
+        schzfirst.push_back(mapsz[s1]);
+        schzlast.push_back(mapsz[s2-1] + 1);
+    }
+    numsch = schsfirst.size();
+
+    // compute point chunks
+    int p1, p2 = 0;
+    while (p2 < nump) {
+        p1 = p2;
+        p2 = min(p2 + chunksize, nump);
+        pchpfirst.push_back(p1);
+        pchplast.push_back(p2);
+    }
+    numpch = pchpfirst.size();
+
+    // compute zone chunks
+    int z1, z2 = 0;
+    while (z2 < numz) {
+        z1 = z2;
+        z2 = min(z2 + chunksize, numz);
+        zchzfirst.push_back(z1);
+        zchzlast.push_back(z2);
+    }
+    numzch = zchzfirst.size();
+
+}
+
+
+void Mesh::initInvMap() {
+    mappcfirst = Memory::alloc<int>(nump);
+    mapccnext = Memory::alloc<int>(nums);
+
+    vector<pair<int, int> > pcpair(nums);
+    for (int c = 0; c < numc; ++c)
+        pcpair[c] = make_pair(mapsp1[c], c);
+    sort(pcpair.begin(), pcpair.end());
+    for (int i = 0; i < numc; ++i) {
+        int p = pcpair[i].first;
+        int pp = pcpair[i+1].first;
+        int pm = pcpair[i-1].first;
+        int c = pcpair[i].second;
+        int cp = pcpair[i+1].second;
+
+        if (i == 0 || p != pm)  mappcfirst[p] = c;
+        if (i+1 == numc || p != pp)
+            mapccnext[c] = -1;
+        else
+            mapccnext[c] = cp;
+    }
+
+}
+
+
+void Mesh::initParallel(
+        const vector<int>& slavemstrpes,
+        const vector<int>& slavemstrcounts,
+        const vector<int>& slavepoints,
+        const vector<int>& masterslvpes,
+        const vector<int>& masterslvcounts,
+        const vector<int>& masterpoints) {
+    if (Parallel::numpe == 1) return;
+
+    nummstrpe = slavemstrpes.size();
+    mapmstrpepe = Memory::alloc<int>(nummstrpe);
+    copy(slavemstrpes.begin(), slavemstrpes.end(), mapmstrpepe);
+    mstrpenumslv = Memory::alloc<int>(nummstrpe);
+    copy(slavemstrcounts.begin(), slavemstrcounts.end(), mstrpenumslv);
+    mapmstrpeslv1 = Memory::alloc<int>(nummstrpe);
+    int count = 0;
+    for (int mstrpe = 0; mstrpe < nummstrpe; ++mstrpe) {
+        mapmstrpeslv1[mstrpe] = count;
+        count += mstrpenumslv[mstrpe];
+    }
+    numslv = slavepoints.size();
+    mapslvp = Memory::alloc<int>(numslv);
+    copy(slavepoints.begin(), slavepoints.end(), mapslvp);
+
+    numslvpe = masterslvpes.size();
+    mapslvpepe = Memory::alloc<int>(numslvpe);
+    copy(masterslvpes.begin(), masterslvpes.end(), mapslvpepe);
+    slvpenumprx = Memory::alloc<int>(numslvpe);
+    copy(masterslvcounts.begin(), masterslvcounts.end(), slvpenumprx);
+    mapslvpeprx1 = Memory::alloc<int>(numslvpe);
+    count = 0;
+    for (int slvpe = 0; slvpe < numslvpe; ++slvpe) {
+        mapslvpeprx1[slvpe] = count;
+        count += slvpenumprx[slvpe];
+    }
+    numprx = masterpoints.size();
+    mapprxp = Memory::alloc<int>(numprx);
+    copy(masterpoints.begin(), masterpoints.end(), mapprxp);
+
+}
+
+
+void Mesh::writeStats() {
+
+    int64_t gnump = nump;
+    // make sure that boundary points aren't double-counted;
+    // only count them if they are masters
+    if (Parallel::numpe > 1) gnump -= numslv;
+    int64_t gnumz = numz;
+    int64_t gnums = nums;
+    int64_t gnume = nume;
+    int gnumpch = numpch;
+    int gnumzch = numzch;
+    int gnumsch = numsch;
+
+    Parallel::globalSum(gnump);
+    Parallel::globalSum(gnumz);
+    Parallel::globalSum(gnums);
+    Parallel::globalSum(gnume);
+    Parallel::globalSum(gnumpch);
+    Parallel::globalSum(gnumzch);
+    Parallel::globalSum(gnumsch);
+
+    if (Parallel::mype > 0) return;
+
+    cout << "--- Mesh Information ---" << endl;
+    cout << "Points:  " << gnump << endl;
+    cout << "Zones:  "  << gnumz << endl;
+    cout << "Sides:  "  << gnums << endl;
+    cout << "Edges:  "  << gnume << endl;
+    cout << "Side chunks:  " << gnumsch << endl;
+    cout << "Point chunks:  " << gnumpch << endl;
+    cout << "Zone chunks:  " << gnumzch << endl;
+    cout << "Chunk size:  " << chunksize << endl;
+    cout << "------------------------" << endl;
+
+}
+
+
+void Mesh::write(
+        const string& probname,
+        const int cycle,
+        const double time,
+        const double* zr,
+        const double* ze,
+        const double* zp) {
+
+    if (writexy) {
+        if (Parallel::mype == 0)
+            cout << "Writing .xy file..." << endl;
+        wxy->write(probname, zr, ze, zp);
+    }
+    if (writegold) {
+        if (Parallel::mype == 0) 
+            cout << "Writing gold file..." << endl;
+        egold->write(probname, cycle, time, zr, ze, zp);
+    }
+
+}
+
+
+vector<int> Mesh::getXPlane(const double c) {
+
+    vector<int> mapbp;
+    const double eps = 1.e-12;
+
+    for (int p = 0; p < nump; ++p) {
+        if (fabs(px[p].x - c) < eps) {
+            mapbp.push_back(p);
+        }
+    }
+    return mapbp;
+
+}
+
+
+vector<int> Mesh::getYPlane(const double c) {
+
+    vector<int> mapbp;
+    const double eps = 1.e-12;
+
+    for (int p = 0; p < nump; ++p) {
+        if (fabs(px[p].y - c) < eps) {
+            mapbp.push_back(p);
+        }
+    }
+    return mapbp;
+
+}
+
+
+void Mesh::getPlaneChunks(
+        const int numb,
+        const int* mapbp,
+        vector<int>& pchbfirst,
+        vector<int>& pchblast) {
+
+    pchbfirst.resize(0);
+    pchblast.resize(0);
+
+    // compute boundary point chunks
+    // (boundary points contained in each point chunk)
+    int bf, bl = 0;
+    for (int pch = 0; pch < numpch; ++pch) {
+         int pl = pchplast[pch];
+         bf = bl;
+         bl = lower_bound(&mapbp[bf], &mapbp[numb], pl) - &mapbp[0];
+         pchbfirst.push_back(bf);
+         pchblast.push_back(bl);
+    }
+
+}
+
+
+void Mesh::calcCtrs(
+        const double2* px,
+        double2* ex,
+        double2* zx,
+        const int sfirst,
+        const int slast) {
+
+    int zfirst = mapsz[sfirst];
+    int zlast = (slast < nums ? mapsz[slast] : numz);
+    fill(&zx[zfirst], &zx[zlast], double2(0., 0.));
+
+    for (int s = sfirst; s < slast; ++s) {
+        int p1 = mapsp1[s];
+        int p2 = mapsp2[s];
+        int e = mapse[s];
+        int z = mapsz[s];
+        ex[e] = 0.5 * (px[p1] + px[p2]);
+        zx[z] += px[p1];
+    }
+
+    for (int z = zfirst; z < zlast; ++z) {
+        zx[z] /= (double) znump[z];
+    }
+
+}
+
+
+void Mesh::calcVols(
+        const double2* px,
+        const double2* zx,
+        double* sarea,
+        double* svol,
+        double* zarea,
+        double* zvol,
+        const int sfirst,
+        const int slast) {
+
+    int zfirst = mapsz[sfirst];
+    int zlast = (slast < nums ? mapsz[slast] : numz);
+    fill(&zvol[zfirst], &zvol[zlast], 0.);
+    fill(&zarea[zfirst], &zarea[zlast], 0.);
+
+    const double third = 1. / 3.;
+    int count = 0;
+    for (int s = sfirst; s < slast; ++s) {
+        int p1 = mapsp1[s];
+        int p2 = mapsp2[s];
+        int z = mapsz[s];
+
+        // compute side volumes, sum to zone
+        double sa = 0.5 * cross(px[p2] - px[p1], zx[z] - px[p1]);
+        double sv = third * sa * (px[p1].x + px[p2].x + zx[z].x);
+        sarea[s] = sa;
+        svol[s] = sv;
+        zarea[z] += sa;
+        zvol[z] += sv;
+
+        // check for negative side volumes
+        if (sv <= 0.) count += 1;
+
+    } // for s
+
+    if (count > 0) {
+        #pragma omp atomic
+        numsbad += count;
+    }
+
+}
+
+
+void Mesh::checkBadSides() {
+
+    // if there were negative side volumes, error exit
+    if (numsbad > 0) {
+        cerr << "Error: " << numsbad << " negative side volumes" << endl;
+        cerr << "Exiting..." << endl;
+        exit(1);
+    }
+
+}
+
+
+void Mesh::calcSideFracs(
+        const double* sarea,
+        const double* zarea,
+        double* smf,
+        const int sfirst,
+        const int slast) {
+
+    #pragma ivdep
+    for (int s = sfirst; s < slast; ++s) {
+        int z = mapsz[s];
+        smf[s] = sarea[s] / zarea[z];
+    }
+}
+
+
+void Mesh::calcSurfVecs(
+        const double2* zx,
+        const double2* ex,
+        double2* ssurf,
+        const int sfirst,
+        const int slast) {
+
+    #pragma ivdep
+    for (int s = sfirst; s < slast; ++s) {
+        int z = mapsz[s];
+        int e = mapse[s];
+
+        ssurf[s] = rotateCCW(ex[e] - zx[z]);
+
+    }
+
+}
+
+
+void Mesh::calcEdgeLen(
+        const double2* px,
+        double* elen,
+        const int sfirst,
+        const int slast) {
+
+    for (int s = sfirst; s < slast; ++s) {
+        const int p1 = mapsp1[s];
+        const int p2 = mapsp2[s];
+        const int e = mapse[s];
+
+        elen[e] = length(px[p2] - px[p1]);
+
+    }
+}
+
+
+void Mesh::calcCharLen(
+        const double* sarea,
+        double* zdl,
+        const int sfirst,
+        const int slast) {
+
+    int zfirst = mapsz[sfirst];
+    int zlast = (slast < nums ? mapsz[slast] : numz);
+    fill(&zdl[zfirst], &zdl[zlast], 1.e99);
+
+    for (int s = sfirst; s < slast; ++s) {
+        int z = mapsz[s];
+        int e = mapse[s];
+
+        double area = sarea[s];
+        double base = elen[e];
+        double fac = (znump[z] == 3 ? 3. : 4.);
+        double sdl = fac * area / base;
+        zdl[z] = min(zdl[z], sdl);
+    }
+}
+
+
+template <typename T>
+void Mesh::parallelGather(
+        const T* pvar,
+        T* prxvar) {
+#ifdef USE_MPI
+    // This routine gathers slave values for which MYPE owns the masters.
+    const int tagmpi = 100;
+    const int type_size = sizeof(T);
+//    std::vector<T> slvvar(numslv);
+    T* slvvar = Memory::alloc<T>(numslv);
+
+    // Post receives for incoming messages from slaves.
+    // Store results in proxy buffer.
+//    vector<MPI_Request> request(numslvpe);
+    MPI_Request* request = Memory::alloc<MPI_Request>(numslvpe);
+    for (int slvpe = 0; slvpe < numslvpe; ++slvpe) {
+        int pe = mapslvpepe[slvpe];
+        int nprx = slvpenumprx[slvpe];
+        int prx1 = mapslvpeprx1[slvpe];
+        MPI_Irecv(&prxvar[prx1], nprx * type_size, MPI_BYTE,
+                pe, tagmpi, MPI_COMM_WORLD, &request[slvpe]);
+    }
+
+    // Load slave data buffer from points.
+    for (int slv = 0; slv < numslv; ++slv) {
+        int p = mapslvp[slv];
+        slvvar[slv] = pvar[p];
+    }
+
+    // Send slave data to master PEs.
+    for (int mstrpe = 0; mstrpe < nummstrpe; ++mstrpe) {
+        int pe = mapmstrpepe[mstrpe];
+        int nslv = mstrpenumslv[mstrpe];
+        int slv1 = mapmstrpeslv1[mstrpe];
+        MPI_Send(&slvvar[slv1], nslv * type_size, MPI_BYTE,
+                pe, tagmpi, MPI_COMM_WORLD);
+    }
+
+    // Wait for all receives to complete.
+//    vector<MPI_Status> status(numslvpe);
+    MPI_Status* status = Memory::alloc<MPI_Status>(numslvpe);
+    int ierr = MPI_Waitall(numslvpe, &request[0], &status[0]);
+    if (ierr != 0) {
+        cerr << "Error: parallelGather MPI error " << ierr <<
+                " on PE " << Parallel::mype << endl;
+        cerr << "Exiting..." << endl;
+        exit(1);
+    }
+
+    Memory::free(slvvar);
+    Memory::free(request);
+    Memory::free(status);
+#endif
+}
+
+
+template <typename T>
+void Mesh::parallelSum(
+        T* pvar,
+        T* prxvar) {
+#ifdef USE_MPI
+    // Compute sum of all (proxy/master) sets.
+    // Store results in master.
+    for (int prx = 0; prx < numprx; ++prx) {
+        int p = mapprxp[prx];
+        pvar[p] += prxvar[prx];
+    }
+
+    // Copy updated master data back to proxies.
+    for (int prx = 0; prx < numprx; ++prx) {
+        int p = mapprxp[prx];
+        prxvar[prx] = pvar[p];
+    }
+#endif
+}
+
+
+template <typename T>
+void Mesh::parallelScatter(
+        T* pvar,
+        const T* prxvar) {
+#ifdef USE_MPI
+    // This routine scatters master values on MYPE to all slave copies
+    // owned by other PEs.
+    const int tagmpi = 200;
+    const int type_size = sizeof(T);
+//    std::vector<T> slvvar(numslv);
+    T* slvvar = Memory::alloc<T>(numslv);
+
+    // Post receives for incoming messages from masters.
+    // Store results in slave buffer.
+//    vector<MPI_Request> request(nummstrpe);
+    MPI_Request* request = Memory::alloc<MPI_Request>(nummstrpe);
+    for (int mstrpe = 0; mstrpe < nummstrpe; ++mstrpe) {
+        int pe = mapmstrpepe[mstrpe];
+        int nslv = mstrpenumslv[mstrpe];
+        int slv1 = mapmstrpeslv1[mstrpe];
+        MPI_Irecv(&slvvar[slv1], nslv * type_size, MPI_BYTE,
+                pe, tagmpi, MPI_COMM_WORLD,  &request[mstrpe]);
+    }
+
+    // Send updated slave data from proxy buffer back to slave PEs.
+    for (int slvpe = 0; slvpe < numslvpe; ++slvpe) {
+        int pe = mapslvpepe[slvpe];
+        int nprx = slvpenumprx[slvpe];
+        int prx1 = mapslvpeprx1[slvpe];
+        MPI_Send((void*)&prxvar[prx1], nprx * type_size, MPI_BYTE,
+                pe, tagmpi, MPI_COMM_WORLD);
+    }
+
+    // Wait for all receives to complete.
+//    vector<MPI_Status> status(nummstrpe);
+    MPI_Status* status = Memory::alloc<MPI_Status>(nummstrpe);
+    int ierr = MPI_Waitall(nummstrpe, &request[0], &status[0]);
+    if (ierr != 0) {
+        cerr << "Error: parallelScatter MPI error " << ierr <<
+                " on PE " << Parallel::mype << endl;
+        cerr << "Exiting..." << endl;
+        exit(1);
+    }
+
+    // Store slave data from buffer back to points.
+    for (int slv = 0; slv < numslv; ++slv) {
+        int p = mapslvp[slv];
+        pvar[p] = slvvar[slv];
+    }
+
+    Memory::free(slvvar);
+    Memory::free(request);
+    Memory::free(status);
+#endif
+}
+
+
+template <typename T>
+void Mesh::sumAcrossProcs(T* pvar) {
+    if (Parallel::numpe == 1) return;
+//    std::vector<T> prxvar(numprx);
+    T* prxvar = Memory::alloc<T>(numprx);
+    parallelGather(pvar, &prxvar[0]);
+    parallelSum(pvar, &prxvar[0]);
+    parallelScatter(pvar, &prxvar[0]);
+    Memory::free(prxvar);
+}
+
+
+template <typename T>
+void Mesh::sumOnProc(
+        const T* cvar,
+        T* pvar) {
+
+    #pragma omp parallel for schedule(static)
+    for (int pch = 0; pch < numpch; ++pch) {
+        int pfirst = pchpfirst[pch];
+        int plast = pchplast[pch];
+        for (int p = pfirst; p < plast; ++p) {
+            T x = T();
+            for (int c = mappcfirst[p]; c >= 0; c = mapccnext[c]) {
+                x += cvar[c];
+            }
+            pvar[p] = x;
+        }  // for p
+    }  // for pch
+
+}
+
+
+template <>
+void Mesh::sumToPoints(
+        const double* cvar,
+        double* pvar) {
+
+    sumOnProc(cvar, pvar);
+    if (Parallel::numpe > 1)
+        sumAcrossProcs(pvar);
+
+}
+
+
+template <>
+void Mesh::sumToPoints(
+        const double2* cvar,
+        double2* pvar) {
+
+    sumOnProc(cvar, pvar);
+    if (Parallel::numpe > 1)
+        sumAcrossProcs(pvar);
+
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PENNANT.reference_output
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PENNANT.reference_output
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PENNANT.reference_output
@@ -0,0 +1,26 @@
+********************
+Running PENNANT v0.9
+********************
+
+--- Mesh Information ---
+Points:  91001
+Zones:  90000
+Sides:  360000
+Edges:  181000
+Side chunks:  704
+Point chunks:  178
+Zone chunks:  176
+Chunk size:  512
+------------------------
+Energy check:  total energy  =   9.424778e-01
+(internal =   9.424778e-01, kinetic =   0.000000e+00)
+dt limiter: Initial timestep
+dt limiter: Hydro dV/V limit for z = 30167
+
+Run complete
+cycle =     10,         cstop =     10
+time  =   5.589080e-02, tstop =   6.000000e+00
+
+Energy check:  total energy  =   9.424778e-01
+(internal =   9.406983e-01, kinetic =   1.779474e-03)
+exit 0
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Parallel.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Parallel.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Parallel.hh
@@ -0,0 +1,59 @@
+/*
+ * Parallel.hh
+ *
+ *  Created on: May 31, 2013
+ *      Author: cferenba
+ *
+ * Copyright (c) 2013, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef PARALLEL_HH_
+#define PARALLEL_HH_
+
+#include <stdint.h>
+
+#ifdef USE_MPI
+#include "mpi.h"
+#endif
+
+
+// Namespace Parallel provides helper functions and variables for
+// running in distributed parallel mode using MPI, or for stubbing
+// these out if not using MPI.
+
+namespace Parallel {
+    extern int numpe;           // number of MPI PEs in use
+                                // (1 if not using MPI)
+    extern int mype;            // PE number for my rank
+                                // (0 if not using MPI)
+
+    void init();                // initialize MPI
+    void final();               // finalize MPI
+
+    void globalMinLoc(double& x, int& xpe);
+                                // find minimum over all PEs, and
+                                // report which PE had the minimum
+    void globalSum(int& x);     // find sum over all PEs - overloaded
+    void globalSum(int64_t& x);
+    void globalSum(double& x);
+    void gather(const int x, int* y);
+                                // gather list of ints from all PEs
+    void scatter(const int* x, int& y);
+                                // gather list of ints from all PEs
+
+    template<typename T>
+    void gatherv(               // gather variable-length list
+            const T *x, const int numx,
+            T* y, const int* numy);
+    template<typename T>
+    void gathervImpl(           // helper function for gatherv
+            const T *x, const int numx,
+            T* y, const int* numy);
+
+}  // namespace Parallel
+
+
+#endif /* PARALLEL_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Parallel.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Parallel.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Parallel.cc
@@ -0,0 +1,181 @@
+/*
+ * Parallel.cc
+ *
+ *  Created on: May 31, 2013
+ *      Author: cferenba
+ *
+ * Copyright (c) 2013, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "Parallel.hh"
+
+#include <vector>
+#include <algorithm>
+#include <numeric>
+
+#include "Vec2.hh"
+
+
+namespace Parallel {
+
+#ifdef USE_MPI
+// We're running under MPI, so set these to dummy values
+// that will be overwritten on MPI_Init.
+int numpe = 0;
+int mype = -1;
+#else
+// We're in serial mode, so only 1 PE.
+int numpe = 1;
+int mype = 0;
+#endif
+
+
+void init() {
+#ifdef USE_MPI
+    MPI_Init(0, 0);
+    MPI_Comm_size(MPI_COMM_WORLD, &numpe);
+    MPI_Comm_rank(MPI_COMM_WORLD, &mype);
+#endif
+}  // init
+
+
+void final() {
+#ifdef USE_MPI
+    MPI_Finalize();
+#endif
+}  // final
+
+
+void globalMinLoc(double& x, int& xpe) {
+    if (numpe == 1) {
+        xpe = 0;
+        return;
+    }
+#ifdef USE_MPI
+    struct doubleInt {
+        double d;
+        int i;
+    } xdi, ydi;
+    xdi.d = x;
+    xdi.i = mype;
+    MPI_Allreduce(&xdi, &ydi, 1, MPI_DOUBLE_INT, MPI_MINLOC,
+            MPI_COMM_WORLD);
+    x = ydi.d;
+    xpe = ydi.i;
+#endif
+}
+
+
+void globalSum(int& x) {
+    if (numpe == 1) return;
+#ifdef USE_MPI
+    int y;
+    MPI_Allreduce(&x, &y, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    x = y;
+#endif
+}
+
+
+void globalSum(int64_t& x) {
+    if (numpe == 1) return;
+#ifdef USE_MPI
+    int64_t y;
+    MPI_Allreduce(&x, &y, 1, MPI_INT64_T, MPI_SUM, MPI_COMM_WORLD);
+    x = y;
+#endif
+}
+
+
+void globalSum(double& x) {
+    if (numpe == 1) return;
+#ifdef USE_MPI
+    double y;
+    MPI_Allreduce(&x, &y, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    x = y;
+#endif
+}
+
+
+void gather(int x, int* y) {
+    if (numpe == 1) {
+        y[0] = x;
+        return;
+    }
+#ifdef USE_MPI
+    MPI_Gather(&x, 1, MPI_INT, y, 1, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+}
+
+
+void scatter(const int* x, int& y) {
+    if (numpe == 1) {
+        y = x[0];
+        return;
+    }
+#ifdef USE_MPI
+    MPI_Scatter((void*) x, 1, MPI_INT, &y, 1, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+}
+
+
+template<typename T>
+void gathervImpl(
+        const T *x, const int numx,
+        T* y, const int* numy) {
+
+    if (numpe == 1) {
+        std::copy(x, x + numx, y);
+        return;
+    }
+#ifdef USE_MPI
+    const int type_size = sizeof(T);
+    int sendcount = type_size * numx;
+    std::vector<int> recvcount, disp;
+    if (mype == 0) {
+        recvcount.resize(numpe);
+        for (int pe = 0; pe < numpe; ++pe) {
+            recvcount[pe] = type_size * numy[pe];
+        }
+        // exclusive scan isn't available in the standard library,
+        // so we use an inclusive scan and displace it by one place
+        disp.resize(numpe + 1);
+        std::partial_sum(recvcount.begin(), recvcount.end(), &disp[1]);
+    } // if mype
+
+    MPI_Gatherv((void*) x, sendcount, MPI_BYTE,
+            y, &recvcount[0], &disp[0], MPI_BYTE,
+            0, MPI_COMM_WORLD);
+#endif
+
+}
+
+
+template<>
+void gatherv(
+        const double2 *x, const int numx,
+        double2* y, const int* numy) {
+    gathervImpl(x, numx, y, numy);
+}
+
+
+template<>
+void gatherv(
+        const double *x, const int numx,
+        double* y, const int* numy) {
+    gathervImpl(x, numx, y, numy);
+}
+
+
+template<>
+void gatherv(
+        const int *x, const int numx,
+        int* y, const int* numy) {
+    gathervImpl(x, numx, y, numy);
+}
+
+
+}  // namespace Parallel
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PolyGas.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PolyGas.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PolyGas.hh
@@ -0,0 +1,67 @@
+/*
+ * PolyGas.hh
+ *
+ *  Created on: Mar 23, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef POLYGAS_HH_
+#define POLYGAS_HH_
+
+#include "Vec2.hh"
+
+// forward declarations
+class InputFile;
+class Hydro;
+
+
+class PolyGas {
+public:
+
+    // parent hydro object
+    Hydro* hydro;
+
+    double gamma;                  // coeff. for ideal gas equation
+    double ssmin;                  // minimum sound speed for gas
+
+    PolyGas(const InputFile* inp, Hydro* h);
+    ~PolyGas();
+
+    void calcStateAtHalf(
+            const double* zr0,
+            const double* zvolp,
+            const double* zvol0,
+            const double* ze,
+            const double* zwrate,
+            const double* zm,
+            const double dt,
+            double* zp,
+            double* zss,
+            const int zfirst,
+            const int zlast);
+
+    void calcEOS(
+            const double* zr,
+            const double* ze,
+            double* zp,
+            double* z0per,
+            double* zss,
+            const int zfirst,
+            const int zlast);
+
+    void calcForce(
+            const double* zp,
+            const double2* ssurfp,
+            double2* sf,
+            const int sfirst,
+            const int slast);
+
+};  // class PolyGas
+
+
+#endif /* POLYGAS_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PolyGas.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PolyGas.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/PolyGas.cc
@@ -0,0 +1,115 @@
+/*
+ * PolyGas.cc
+ *
+ *  Created on: Mar 26, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "PolyGas.hh"
+
+#include "Memory.hh"
+#include "InputFile.hh"
+#include "Hydro.hh"
+#include "Mesh.hh"
+
+using namespace std;
+
+
+PolyGas::PolyGas(const InputFile* inp, Hydro* h) : hydro(h) {
+    gamma = inp->getDouble("gamma", 5. / 3.);
+    ssmin = inp->getDouble("ssmin", 0.);
+
+}
+
+
+void PolyGas::calcStateAtHalf(
+        const double* zr0,
+        const double* zvolp,
+        const double* zvol0,
+        const double* ze,
+        const double* zwrate,
+        const double* zm,
+        const double dt,
+        double* zp,
+        double* zss,
+        const int zfirst,
+        const int zlast) {
+
+    double* z0per = Memory::alloc<double>(zlast - zfirst);
+
+    const double dth = 0.5 * dt;
+
+    // compute EOS at beginning of time step
+    calcEOS(zr0, ze, zp, z0per, zss, zfirst, zlast);
+
+    // now advance pressure to the half-step
+    #pragma ivdep
+    for (int z = zfirst; z < zlast; ++z) {
+        int z0 = z - zfirst;
+        double zminv = 1. / zm[z];
+        double dv = (zvolp[z] - zvol0[z]) * zminv;
+        double bulk = zr0[z] * zss[z] * zss[z];
+        double denom = 1. + 0.5 * z0per[z0] * dv;
+        double src = zwrate[z] * dth * zminv;
+        zp[z] += (z0per[z0] * src - zr0[z] * bulk * dv) / denom;
+    }
+
+    Memory::free(z0per);
+}
+
+
+void PolyGas::calcEOS(
+        const double* zr,
+        const double* ze,
+        double* zp,
+        double* z0per,
+        double* zss,
+        const int zfirst,
+        const int zlast) {
+
+    const double gm1 = gamma - 1.;
+    const double ss2 = max(ssmin * ssmin, 1.e-99);
+
+    #pragma ivdep
+    for (int z = zfirst; z < zlast; ++z) {
+        int z0 = z - zfirst;
+        double rx = zr[z];
+        double ex = max(ze[z], 0.0);
+        double px = gm1 * rx * ex;
+        double prex = gm1 * ex;
+        double perx = gm1 * rx;
+        double csqd = max(ss2, prex + perx * px / (rx * rx));
+        zp[z] = px;
+        z0per[z0] = perx;
+        zss[z] = sqrt(csqd);
+    }
+
+}
+
+
+void PolyGas::calcForce(
+        const double* zp,
+        const double2* ssurfp,
+        double2* sf,
+        const int sfirst,
+        const int slast) {
+
+    const Mesh* mesh = hydro->mesh;
+
+    #pragma ivdep
+    for (int s = sfirst; s < slast; ++s) {
+        int z = mesh->mapsz[s];
+        double2 sfx = -zp[z] * ssurfp[s];
+        sf[s] = sfx;
+
+    }
+}
+
+
+
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/QCS.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/QCS.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/QCS.hh
@@ -0,0 +1,73 @@
+/*
+ * QCS.hh
+ *
+ *  Created on: Feb 21, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef QCS_HH_
+#define QCS_HH_
+
+#include "Vec2.hh"
+
+// forward declarations
+class InputFile;
+class Hydro;
+
+
+class QCS {
+public:
+
+    // parent hydro object
+    Hydro* hydro;
+
+    double qgamma;                 // gamma coefficient for Q model
+    double q1, q2;                 // linear and quadratic coefficients
+                                   // for Q model
+
+    QCS(const InputFile* inp, Hydro* h);
+    ~QCS();
+
+    void calcForce(
+            double2* sf,
+            const int sfirst,
+            const int slast);
+
+    void setCornerDiv(
+            double* c0area,
+            double* c0div,
+            double* c0evol,
+            double* c0du,
+            double* c0cos,
+            const int sfirst,
+            const int slast);
+
+    void setQCnForce(
+            const double* c0div,
+            const double* c0du,
+            const double* c0evol,
+            double2* c0qe,
+            const int sfirst,
+            const int slast);
+
+    void setForce(
+            const double* c0area,
+            const double2* c0qe,
+            double* c0cos,
+            double2* sfqq,
+            const int sfirst,
+            const int slast);
+
+    void setVelDiff(
+            const int sfirst,
+            const int slast);
+
+};  // class QCS
+
+
+#endif /* QCS_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/QCS.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/QCS.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/QCS.cc
@@ -0,0 +1,369 @@
+/*
+ * QCS.cc
+ *
+ *  Created on: Feb 21, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "QCS.hh"
+
+#include <cmath>
+#include "Memory.hh"
+#include "InputFile.hh"
+#include "Vec2.hh"
+#include "Mesh.hh"
+#include "Hydro.hh"
+
+using namespace std;
+
+
+QCS::QCS(const InputFile* inp, Hydro* h) : hydro(h) {
+    qgamma = inp->getDouble("qgamma", 5. / 3.);
+    q1 = inp->getDouble("q1", 0.);
+    q2 = inp->getDouble("q2", 2.);
+
+}
+
+QCS::~QCS() {}
+
+
+void QCS::calcForce(
+        double2* sf,
+        const int sfirst,
+        const int slast) {
+    int cfirst = sfirst;
+    int clast = slast;
+
+    // declare temporary variables
+    double* c0area = Memory::alloc<double>(clast - cfirst);
+    double* c0evol = Memory::alloc<double>(clast - cfirst);
+    double* c0du = Memory::alloc<double>(clast - cfirst);
+    double* c0div = Memory::alloc<double>(clast - cfirst);
+    double* c0cos = Memory::alloc<double>(clast - cfirst);
+    double2* c0qe = Memory::alloc<double2>(2 * (clast - cfirst));
+
+    // [1] Find the right, left, top, bottom  edges to use for the
+    //     limiters
+    // *** NOT IMPLEMENTED IN PENNANT ***
+
+    // [2] Compute corner divergence and related quantities
+    // [2.1] Find the corner divergence
+    // [2.2] Compute the cos angle for c
+    // [2.3] Find the evolution factor c0evol(c) and the Delta u(c) = du(c)
+    // [2.4] Find the weights c0w(c)
+    setCornerDiv(c0area, c0div, c0evol, c0du, c0cos, sfirst, slast);
+
+    // [3] Find the limiters Psi(c)
+    // *** NOT IMPLEMENTED IN PENNANT ***
+
+    // [4] Compute the Q vector (corner based)
+    // [4.1] Compute cmu = (1-psi) . crho . zKUR . c0evol
+    // [4.2] Compute the q vector associated with c on edges
+    //       e1=[n0,n1], e2=[n1,n2]
+    //       c0qe(2,c) = cmu(c).( u(n2)-u(n1) ) / l_{n1->n2}
+    //       c0qe(1,c) = cmu(c).( u(n1)-u(n0) ) / l_{n0->n1}
+    setQCnForce(c0div, c0du, c0evol, c0qe, sfirst, slast);
+
+    // [5] Compute the Q forces
+    setForce(c0area, c0qe, c0cos, sf, sfirst, slast);
+
+    // [6] Set velocity difference to use to compute timestep
+    setVelDiff(sfirst, slast);
+
+    Memory::free(c0area);
+    Memory::free(c0evol);
+    Memory::free(c0du);
+    Memory::free(c0div);
+    Memory::free(c0cos);
+    Memory::free(c0qe);
+}
+
+
+// Routine number [2]  in the full algorithm
+//     [2.1] Find the corner divergence
+//     [2.2] Compute the cos angle for c
+//     [2.3] Find the evolution factor c0evol(c)
+//           and the Delta u(c) = du(c)
+void QCS::setCornerDiv(
+            double* c0area,
+            double* c0div,
+            double* c0evol,
+            double* c0du,
+            double* c0cos,
+            const int sfirst,
+            const int slast) {
+
+    const Mesh* mesh = hydro->mesh;
+    const int nums = mesh->nums;
+    const int numz = mesh->numz;
+
+    const double2* pu = hydro->pu;
+    const double2* px = mesh->pxp;
+    const double2* ex = mesh->exp;
+    const double2* zx = mesh->zxp;
+    const double* elen = mesh->elen;
+    const int* znump = mesh->znump;
+
+    int cfirst = sfirst;
+    int clast = slast;
+    int zfirst = mesh->mapsz[sfirst];
+    int zlast = (slast < nums ? mesh->mapsz[slast] : numz);
+
+    double2* z0uc = Memory::alloc<double2>(zlast - zfirst);
+    double2 up0, up1, up2, up3;
+    double2 xp0, xp1, xp2, xp3;
+
+    // [1] Compute a zone-centered velocity
+    fill(&z0uc[0], &z0uc[zlast-zfirst], double2(0., 0.));
+    for (int c = cfirst; c < clast; ++c) {
+        int p = mesh->mapsp1[c];
+        int z = mesh->mapsz[c];
+        int z0 = z - zfirst;
+        z0uc[z0] += pu[p];
+    }
+
+    for (int z = zfirst; z < zlast; ++z) {
+        int z0 = z - zfirst;
+        z0uc[z0] /= (double) znump[z];
+    }
+
+    // [2] Divergence at the corner
+    #pragma ivdep
+    for (int c = cfirst; c < clast; ++c) {
+        int s2 = c;
+        int s = mesh->mapss3[s2];
+        // Associated zone, corner, point
+        int z = mesh->mapsz[s];
+        int z0 = z - zfirst;
+        int c0 = c - cfirst;
+        int p = mesh->mapsp2[s];
+        // Points
+        int p1 = mesh->mapsp1[s];
+        int p2 = mesh->mapsp2[s2];
+        // Edges
+        int e1 = mesh->mapse[s];
+        int e2 = mesh->mapse[s2];
+
+        // Velocities and positions
+        // 0 = point p
+        up0 = pu[p];
+        xp0 = px[p];
+        // 1 = edge e2
+        up1 = 0.5 * (pu[p] + pu[p2]);
+        xp1 = ex[e2];
+        // 2 = zone center z
+        up2 = z0uc[z0];
+        xp2 = zx[z];
+        // 3 = edge e1
+        up3 = 0.5 * (pu[p1] + pu[p]);
+        xp3 = ex[e1];
+
+        // compute 2d cartesian volume of corner
+        double cvolume = 0.5 * cross(xp2 - xp0, xp3 - xp1);
+        c0area[c0] = cvolume;
+
+        // compute cosine angle
+        double2 v1 = xp3 - xp0;
+        double2 v2 = xp1 - xp0;
+        double de1 = elen[e1];
+        double de2 = elen[e2];
+        double minelen = min(de1, de2);
+        c0cos[c0] = ((minelen < 1.e-12) ?
+                0. :
+                4. * dot(v1, v2) / (de1 * de2));
+
+        // compute divergence of corner
+        c0div[c0] = (cross(up2 - up0, xp3 - xp1) -
+                cross(up3 - up1, xp2 - xp0)) /
+                (2.0 * cvolume);
+
+        // compute evolution factor
+        double2 dxx1 = 0.5 * (xp1 + xp2 - xp0 - xp3);
+        double2 dxx2 = 0.5 * (xp2 + xp3 - xp0 - xp1);
+        double dx1 = length(dxx1);
+        double dx2 = length(dxx2);
+
+        // average corner-centered velocity
+        double2 duav = 0.25 * (up0 + up1 + up2 + up3);
+
+        double test1 = abs(dot(dxx1, duav) * dx2);
+        double test2 = abs(dot(dxx2, duav) * dx1);
+        double num = (test1 > test2 ? dx1 : dx2);
+        double den = (test1 > test2 ? dx2 : dx1);
+        double r = num / den;
+        double evol = sqrt(4.0 * cvolume * r);
+        evol = min(evol, 2.0 * minelen);
+
+        // compute delta velocity
+        double dv1 = length2(up1 + up2 - up0 - up3);
+        double dv2 = length2(up2 + up3 - up0 - up1);
+        double du = sqrt(max(dv1, dv2));
+
+        c0evol[c0] = (c0div[c0] < 0.0 ? evol : 0.);
+        c0du[c0]   = (c0div[c0] < 0.0 ? du   : 0.);
+    }  // for s
+
+    Memory::free(z0uc);
+}
+
+
+// Routine number [4]  in the full algorithm CS2DQforce(...)
+void QCS::setQCnForce(
+        const double* c0div,
+        const double* c0du,
+        const double* c0evol,
+        double2* c0qe,
+        const int sfirst,
+        const int slast) {
+
+    const Mesh* mesh = hydro->mesh;
+
+    const double2* pu = hydro->pu;
+    const double* zrp = hydro->zrp;
+    const double* zss = hydro->zss;
+    const double* elen = mesh->elen;
+
+    int cfirst = sfirst;
+    int clast = slast;
+
+    double* c0rmu = Memory::alloc<double>(clast - cfirst);
+
+    const double gammap1 = qgamma + 1.0;
+
+    // [4.1] Compute the c0rmu (real Kurapatenko viscous scalar)
+    #pragma ivdep
+    for (int c = cfirst; c < clast; ++c) {
+        int c0 = c - cfirst;
+        int z = mesh->mapsz[c];
+
+        // Kurapatenko form of the viscosity
+        double ztmp2 = q2 * 0.25 * gammap1 * c0du[c0];
+        double ztmp1 = q1 * zss[z];
+        double zkur = ztmp2 + sqrt(ztmp2 * ztmp2 + ztmp1 * ztmp1);
+        // Compute c0rmu for each corner
+        double rmu = zkur * zrp[z] * c0evol[c0];
+        c0rmu[c0] = ((c0div[c0] > 0.0) ? 0. : rmu);
+
+    } // for c
+
+    // [4.2] Compute the c0qe for each corner
+    #pragma ivdep
+    for (int c = cfirst; c < clast; ++c) {
+        int s4 = c;
+        int s = mesh->mapss3[s4];
+        int c0 = c - cfirst;
+        int p = mesh->mapsp2[s];
+        // Associated point and edge 1
+        int p1 = mesh->mapsp1[s];
+        int e1 = mesh->mapse[s];
+        // Associated point and edge 2
+        int p2 = mesh->mapsp2[s4];
+        int e2 = mesh->mapse[s4];
+
+        // Compute: c0qe(1,2,3)=edge 1, y component (2nd), 3rd corner
+        //          c0qe(2,1,3)=edge 2, x component (1st)
+        c0qe[2 * c0]     = c0rmu[c0] * (pu[p] - pu[p1]) / elen[e1];
+        c0qe[2 * c0 + 1] = c0rmu[c0] * (pu[p2] - pu[p]) / elen[e2];
+
+    } // for s
+
+    Memory::free(c0rmu);
+}
+
+
+// Routine number [5]  in the full algorithm CS2DQforce(...)
+void QCS::setForce(
+        const double* c0area,
+        const double2* c0qe,
+        double* c0cos,
+        double2* sfq,
+        const int sfirst,
+        const int slast) {
+
+    const Mesh* mesh = hydro->mesh;
+    const double* elen = mesh->elen;
+
+    int cfirst = sfirst;
+    int clast = slast;
+
+    double* c0w = Memory::alloc<double>(clast - cfirst);
+
+    // [5.1] Preparation of extra variables
+    #pragma ivdep
+    for (int c = cfirst; c < clast; ++c) {
+        int c0 = c - cfirst;
+        double csin2 = 1.0 - c0cos[c0] * c0cos[c0];
+        c0w[c0]   = ((csin2 < 1.e-4) ? 0. : c0area[c0] / csin2);
+        c0cos[c0] = ((csin2 < 1.e-4) ? 0. : c0cos[c0]);
+    } // for c
+
+    // [5.2] Set-Up the forces on corners
+    #pragma ivdep
+    for (int s = sfirst; s < slast; ++s) {
+        // Associated corners 1 and 2, and edge
+        int c1 = s;
+        int c10 = c1 - cfirst;
+        int c2 = mesh->mapss4[s];
+        int c20 = c2 - cfirst;
+        int e = mesh->mapse[s];
+        // Edge length for c1, c2 contribution to s
+        double el = elen[e];
+
+        sfq[s] = (c0w[c10] * (c0qe[2*c10+1] + c0cos[c10] * c0qe[2*c10]) +
+                  c0w[c20] * (c0qe[2*c20] + c0cos[c20] * c0qe[2*c20+1]))
+            / el;
+
+    } // for s
+
+    Memory::free(c0w);
+}
+
+
+// Routine number [6] in the full algorithm
+void QCS::setVelDiff(
+        const int sfirst,
+        const int slast) {
+
+    const Mesh* mesh = hydro->mesh;
+    const int nums = mesh->nums;
+    const int numz = mesh->numz;
+    int zfirst = mesh->mapsz[sfirst];
+    int zlast = (slast < nums ? mesh->mapsz[slast] : numz);
+    const double2* px = mesh->pxp;
+    const double2* pu = hydro->pu;
+    const double* zss = hydro->zss;
+    double* zdu = hydro->zdu;
+    const double* elen = mesh->elen;
+
+    double* z0tmp = Memory::alloc<double>(zlast - zfirst);
+
+    fill(&z0tmp[0], &z0tmp[zlast-zfirst], 0.);
+    for (int s = sfirst; s < slast; ++s) {
+        int p1 = mesh->mapsp1[s];
+        int p2 = mesh->mapsp2[s];
+        int z = mesh->mapsz[s];
+        int e = mesh->mapse[s];
+        int z0 = z - zfirst;
+
+        double2 dx = px[p2] - px[p1];
+        double2 du = pu[p2] - pu[p1];
+        double lenx = elen[e];
+        double dux = dot(du, dx);
+        dux = (lenx > 0. ? abs(dux) / lenx : 0.);
+
+        z0tmp[z0] = max(z0tmp[z0], dux);
+    }
+
+    for (int z = zfirst; z < zlast; ++z) {
+        int z0 = z - zfirst;
+        zdu[z] = q1 * zss[z] + 2. * q2 * z0tmp[z0];
+    }
+
+    Memory::free(z0tmp);
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/README
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/README
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/README
@@ -0,0 +1,60 @@
+The PENNANT Mini-App
+
+Charles R. Ferenbaugh
+Los Alamos National Laboratory
+cferenba@lanl.gov
+
+Version 0.9 -- February 2016
+LA-CC-12-021
+https://github.com/losalamos/PENNANT
+
+
+Description:
+
+PENNANT is an unstructured mesh physics mini-app designed for advanced
+architecture research.  It contains mesh data structures and a few
+physics algorithms adapted from the LANL rad-hydro code FLAG, and gives
+a sample of the typical memory access patterns of FLAG.
+
+Further documentation can be found in the 'doc' directory of the
+PENNANT distribution.
+
+
+Version Log:
+
+0.9, February 2016:
+     Added leblancx64 problem.  Added energy check diagnostic
+     for verifying large problems.
+
+0.8, November 2015:
+     Added multi-node test problems.  Added information for
+     APEX benchmark testing.
+
+0.7, February 2015:
+     Further optimizations for MPI+OpenMP.
+
+0.6, February 2014:
+     First MPI version.  MPI capability is working and mostly
+     optimized; MPI+OpenMP is working but needs optimization.
+     Replaced GMV mesh reader with internal mesh generators.
+     Added QCS velocity difference routine to reflect a recent
+     bugfix in FLAG.  Increased size of big test problems.
+
+0.5, May 2013:
+     Further optimizations.
+
+0.4, January 2013:
+     First open-source release.  Fixed a bug in QCS and added some
+     optimizations.  Added Sedov and Leblanc test problems, and some
+     new input keywords to support them.
+
+0.3, July 2012:
+     Added OpenMP pragmas and point chunk processing.  Modified physics
+     state arrays to be flat arrays instead of STL vectors.
+
+0.2, June 2012:
+     Added side chunk processing.  Miscellaneous minor cleanup.
+
+0.1, March 2012:
+     Initial release, internal LANL only.
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/TTS.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/TTS.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/TTS.hh
@@ -0,0 +1,49 @@
+/*
+ * TTS.hh
+ *
+ *  Created on: Feb 2, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef TTS_HH_
+#define TTS_HH_
+
+#include "Vec2.hh"
+
+// forward declarations
+class InputFile;
+class Hydro;
+
+
+class TTS {
+public:
+
+    // parent hydro object
+    Hydro* hydro;
+
+    double alfa;                   // alpha coefficient for TTS model
+    double ssmin;                  // minimum sound speed
+
+    TTS(const InputFile* inp, Hydro* h);
+    ~TTS();
+
+void calcForce(
+        const double* zarea,
+        const double* zr,
+        const double* zss,
+        const double* sarea,
+        const double* smf,
+        const double2* ssurfp,
+        double2* sf,
+        const int sfirst,
+        const int slast);
+
+}; // class TTS
+
+
+#endif /* TTS_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/TTS.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/TTS.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/TTS.cc
@@ -0,0 +1,73 @@
+/*
+ * TTS.cc
+ *
+ *  Created on: Feb 2, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "TTS.hh"
+
+#include "Vec2.hh"
+#include "InputFile.hh"
+#include "Mesh.hh"
+#include "Hydro.hh"
+
+using namespace std;
+
+
+TTS::TTS(const InputFile* inp, Hydro* h) : hydro(h) {
+    alfa = inp->getDouble("alfa", 0.5);
+    ssmin = inp->getDouble("ssmin", 0.);
+
+}
+
+
+TTS::~TTS() {}
+
+
+void TTS::calcForce(
+        const double* zarea,
+        const double* zr,
+        const double* zss,
+        const double* sarea,
+        const double* smf,
+        const double2* ssurfp,
+        double2* sf,
+        const int sfirst,
+        const int slast) {
+
+    //  Side density:
+    //    srho = sm/sv = zr (sm/zm) / (sv/zv)
+    //  Side pressure:
+    //    sp   = zp + alfa dpdr (srho-zr)
+    //         = zp + sdp
+    //  Side delta pressure:
+    //    sdp  = alfa dpdr (srho-zr)
+    //         = alfa c**2 (srho-zr)
+    //
+    //    Notes: smf stores (sm/zm)
+    //           svfac stores (sv/zv)
+
+    const Mesh* mesh = hydro->mesh;
+
+    #pragma ivdep
+    for (int s = sfirst; s < slast; ++s) {
+        int z = mesh->mapsz[s];
+
+        double svfacinv = zarea[z] / sarea[s];
+        double srho = zr[z] * smf[s] * svfacinv;
+        double sstmp = max(zss[z], ssmin);
+        sstmp = alfa * sstmp * sstmp;
+        double sdp = sstmp * (srho - zr[z]);
+        double2 sqq = -sdp * ssurfp[s];
+        sf[s] = sqq;
+
+    }
+
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Vec2.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Vec2.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/Vec2.hh
@@ -0,0 +1,185 @@
+/*
+ * Vec2.hh
+ *
+ *  Created on: Dec 21, 2011
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef VEC2_HH_
+#define VEC2_HH_
+
+#include <cmath>
+
+
+// This struct is defined with all functions inline,
+// to give the compiler maximum opportunity to optimize.
+
+struct double2
+{
+    typedef double value_type;
+    double x, y;
+    inline double2() : x(0.), y(0.) {}
+    inline double2(const double& x_, const double& y_) : x(x_), y(y_) {}
+    inline double2(const double2& v2) : x(v2.x), y(v2.y) {}
+    inline ~double2() {}
+
+    inline double2& operator=(const double2& v2)
+    {
+        x = v2.x;
+        y = v2.y;
+        return(*this);
+    }
+
+    inline double2& operator+=(const double2& v2)
+    {
+        x += v2.x;
+        y += v2.y;
+        return(*this);
+    }
+
+    inline double2& operator-=(const double2& v2)
+    {
+        x -= v2.x;
+        y -= v2.y;
+        return(*this);
+    }
+
+    inline double2& operator*=(const double& r)
+    {
+        x *= r;
+        y *= r;
+        return(*this);
+    }
+
+    inline double2& operator/=(const double& r)
+    {
+        x /= r;
+        y /= r;
+        return(*this);
+    }
+
+}; // double2
+
+inline double2 make_double2(const double& x_, const double& y_) {
+    return(double2(x_, y_));
+}
+
+
+
+// comparison operators:
+
+// equals
+inline bool operator==(const double2& v1, const double2& v2)
+{
+    return((v1.x == v2.x) && (v1.y == v2.y));
+}
+
+// not-equals
+inline bool operator!=(const double2& v1, const double2& v2)
+{
+    return(!(v1 == v2));
+}
+
+
+// unary operators:
+
+// unary plus
+inline double2 operator+(const double2& v)
+{
+    return(v);
+}
+
+// unary minus
+inline double2 operator-(const double2& v)
+{
+    return(double2(-v.x, -v.y));
+}
+
+
+// binary operators:
+
+// add
+inline double2 operator+(const double2& v1, const double2& v2)
+{
+    return(double2(v1.x + v2.x, v1.y + v2.y));
+}
+
+// subtract
+inline double2 operator-(const double2& v1, const double2& v2)
+{
+    return(double2(v1.x - v2.x, v1.y - v2.y));
+}
+
+// multiply vector by scalar
+inline double2 operator*(const double2& v, const double& r)
+{
+    return(double2(v.x * r, v.y * r));
+}
+
+// multiply scalar by vector
+inline double2 operator*(const double& r, const double2& v)
+{
+    return(double2(v.x * r, v.y * r));
+}
+
+// divide vector by scalar
+inline double2 operator/(const double2& v, const double& r)
+{
+    double rinv = (double) 1. / r;
+    return(double2(v.x * rinv, v.y * rinv));
+}
+
+
+// other vector operations:
+
+// dot product
+inline double dot(const double2& v1, const double2& v2)
+{
+    return(v1.x * v2.x + v1.y * v2.y);
+}
+
+// cross product (2D)
+inline double cross(const double2& v1, const double2& v2)
+{
+    return(v1.x * v2.y - v1.y * v2.x);
+}
+
+// length
+inline double length(const double2& v)
+{
+    return(std::sqrt(v.x * v.x + v.y * v.y));
+}
+
+// length squared
+inline double length2(const double2& v)
+{
+    return(v.x * v.x + v.y * v.y);
+}
+
+// rotate 90 degrees counterclockwise
+inline double2 rotateCCW(const double2& v)
+{
+    return(double2(-v.y, v.x));
+}
+
+// rotate 90 degrees clockwise
+inline double2 rotateCW(const double2& v)
+{
+    return(double2(v.y, -v.x));
+}
+
+// project v onto subspace perpendicular to u
+// u must be a unit vector
+inline double2 project(double2& v, const double2& u)
+{
+    // assert(length2(u) == 1.);
+    return v - dot(v, u) * u;
+}
+
+
+#endif /* VEC2_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/WriteXY.hh
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/WriteXY.hh
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/WriteXY.hh
@@ -0,0 +1,39 @@
+/*
+ * WriteXY.hh
+ *
+ *  Created on: Dec 16, 2013
+ *      Author: cferenba
+ *
+ * Copyright (c) 2013, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#ifndef WRITEXY_HH_
+#define WRITEXY_HH_
+
+#include <string>
+
+// forward declarations
+class Mesh;
+
+
+class WriteXY {
+public:
+
+    Mesh* mesh;
+
+    WriteXY(Mesh* m);
+    ~WriteXY();
+
+    void write(
+            const std::string& basename,
+            const double* zr,
+            const double* ze,
+            const double* zp);
+
+};
+
+
+#endif /* WRITEXY_HH_ */
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/WriteXY.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/WriteXY.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/WriteXY.cc
@@ -0,0 +1,71 @@
+/*
+ * WriteXY.cc
+ *
+ *  Created on: Dec 16, 2013
+ *      Author: cferenba
+ *
+ * Copyright (c) 2013, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include "WriteXY.hh"
+
+#include <fstream>
+#include <iomanip>
+
+#include "Parallel.hh"
+#include "Mesh.hh"
+
+using namespace std;
+
+
+WriteXY::WriteXY(Mesh* m) : mesh(m) {}
+
+WriteXY::~WriteXY() {}
+
+
+void WriteXY::write(
+        const string& basename,
+        const double* zr,
+        const double* ze,
+        const double* zp) {
+
+    using Parallel::numpe;
+    using Parallel::mype;
+    const int numz = mesh->numz;
+
+    int gnumz = numz;
+    Parallel::globalSum(gnumz);
+    gnumz = (mype == 0 ? gnumz : 0);
+    vector<int> penumz(mype == 0 ? numpe : 0);
+    Parallel::gather(numz, &penumz[0]);
+
+    vector<double> gzr(gnumz), gze(gnumz), gzp(gnumz);
+    Parallel::gatherv(&zr[0], numz, &gzr[0], &penumz[0]);
+    Parallel::gatherv(&ze[0], numz, &gze[0], &penumz[0]);
+    Parallel::gatherv(&zp[0], numz, &gzp[0], &penumz[0]);
+
+    if (mype == 0) {
+        string xyname = basename + ".xy";
+        ofstream ofs(xyname.c_str());
+        ofs << scientific << setprecision(8);
+        ofs << "#  zr" << endl;
+        for (int z = 0; z < gnumz; ++z) {
+            ofs << setw(5) << (z + 1) << setw(18) << gzr[z] << endl;
+        }
+        ofs << "#  ze" << endl;
+        for (int z = 0; z < gnumz; ++z) {
+            ofs << setw(5) << (z + 1) << setw(18) << gze[z] << endl;
+        }
+        ofs << "#  zp" << endl;
+        for (int z = 0; z < gnumz; ++z) {
+            ofs << setw(5) << (z + 1) << setw(18) << gzp[z] << endl;
+        }
+        ofs.close();
+
+    } // if mype
+
+}
+
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/intermediate_leblanc.pnt
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/intermediate_leblanc.pnt
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/intermediate_leblanc.pnt
@@ -0,0 +1,17 @@
+cstop   10
+tstop   6.0
+meshtype rect
+meshparams 100 900  1.0 9.0
+subregion 0.0 1.0  3.0 9.0
+rinit   1.0
+einit   0.1
+rinitsub 1.0e-3
+einitsub 1.0e-7
+bcx     0.0 1.0
+bcy     0.0 9.0
+ssmin   0.1
+q1      0.3
+q2      2.0
+dtinit  1.e-2
+writexy 0
+chunksize 512
Index: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/main.cc
===================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/main.cc
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/main.cc
@@ -0,0 +1,54 @@
+/*
+ * main.cc
+ *
+ *  Created on: Jan 23, 2012
+ *      Author: cferenba
+ *
+ * Copyright (c) 2012, Los Alamos National Security, LLC.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style open-source
+ * license; see top-level LICENSE file for full license text.
+ */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+
+#include "Parallel.hh"
+#include "InputFile.hh"
+#include "Driver.hh"
+
+using namespace std;
+
+
+int main(const int argc, const char** argv)
+{
+    Parallel::init();
+
+    if (argc != 2) {
+        if (Parallel::mype == 0)
+            cerr << "Usage: pennant <filename>" << endl;
+        exit(1);
+    }
+
+    const char* filename = argv[1];
+    InputFile inp(filename);
+
+    string probname(filename);
+    // strip .pnt suffix from filename
+    int len = probname.length();
+    if (probname.substr(len - 4, 4) == ".pnt")
+        probname = probname.substr(0, len - 4);
+
+    Driver drv(&inp, probname);
+
+    drv.run();
+
+    Parallel::final();
+
+    return 0;
+
+}
+
+
+