Index: test-suite/trunk/Bitcode/Benchmarks/CMakeLists.txt
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/CMakeLists.txt
+++ test-suite/trunk/Bitcode/Benchmarks/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Halide)
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/CMakeLists.txt
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/CMakeLists.txt
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/CMakeLists.txt
@@ -0,0 +1,22 @@
+if (NOT WIN32)
+  list(APPEND LDFLAGS -lpthread -ldl)
+endif()
+if (NOT MSVC)
+  list(APPEND CXXFLAGS "-std=c++11")
+endif()
+
+macro(test_img_input img)
+  set(imgpath "${CMAKE_CURRENT_SOURCE_DIR}/../images/${img}")
+  llvm_test_run(${imgpath}.bytes ${ARGN}
+	        ${CMAKE_CURRENT_BINARY_DIR}/${img}_out.bytes)
+  llvm_test_verify(${FPCMP}
+    ${CMAKE_CURRENT_SOURCE_DIR}/output/${img}_out.bytes
+    ${CMAKE_CURRENT_BINARY_DIR}/${img}_out.bytes
+  )
+endmacro()
+
+if(ARCH STREQUAL "x86")
+  add_subdirectory(local_laplacian)
+  add_subdirectory(bilateral_grid)
+  add_subdirectory(blur)
+endif()
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/CMakeLists.txt
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/CMakeLists.txt
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/CMakeLists.txt
@@ -0,0 +1,14 @@
+file(GLOB bcsources ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86_halide_runtime.bc ${CMAKE_CURRENT_SOURCE_DIR}/bilateral_grid.bc)
+SET_SOURCE_FILES_PROPERTIES(${bcsources} PROPERTIES LANGUAGE CXX)
+
+set(Source ${CMAKE_CURRENT_SOURCE_DIR}/driver.cpp ${bcsources})
+set(PROG halide_bilateral_grid)
+
+test_img_input(rgb 0.1 10)
+test_img_input(rgba 0.1 10)
+
+llvm_multisource()
+
+
+
+
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/bilateral_grid.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/bilateral_grid.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/bilateral_grid.h
@@ -0,0 +1,42 @@
+#ifndef HALIDE__bilateral_grid_h
+#define HALIDE__bilateral_grid_h
+#ifndef HALIDE_ATTRIBUTE_ALIGN
+  #ifdef _MSC_VER
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x))
+  #else
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x)))
+  #endif
+#endif
+#ifndef BUFFER_T_DEFINED
+#define BUFFER_T_DEFINED
+#include <stdbool.h>
+#include <stdint.h>
+typedef struct buffer_t {
+    uint64_t dev;
+    uint8_t* host;
+    int32_t extent[4];
+    int32_t stride[4];
+    int32_t min[4];
+    int32_t elem_size;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)];
+} buffer_t;
+#endif
+struct halide_filter_metadata_t;
+#ifndef HALIDE_FUNCTION_ATTRS
+#define HALIDE_FUNCTION_ATTRS
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int bilateral_grid(float _r_sigma, buffer_t *_input_buffer, buffer_t *_bilateral_grid_buffer) HALIDE_FUNCTION_ATTRS;
+int bilateral_grid_argv(void **args) HALIDE_FUNCTION_ATTRS;
+// Result is never null and points to constant static data
+const struct halide_filter_metadata_t *bilateral_grid_metadata() HALIDE_FUNCTION_ATTRS;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/driver.cpp
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/driver.cpp
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/driver.cpp
@@ -0,0 +1,37 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+
+#include "../common/benchmark.h"
+#include "../common/halide_image.h"
+#include "../common/halide_image_io.h"
+#include "bilateral_grid.h"
+
+using namespace Halide::Tools;
+
+int main(int argc, char **argv) {
+
+    if (argc < 5) {
+        printf("Usage: ./filter input.png range_sigma timing_iterations output.png\n"
+               "e.g. ./filter input.png 0.1 10 output.png\n");
+        return 0;
+    }
+
+    int timing_iterations = atoi(argv[3]);
+
+    Image<float> input = load_image(argv[1]);
+    Image<float> output(input.width(), input.height(), 1);
+
+    bilateral_grid(atof(argv[2]), input, output);
+
+    // Timing code. Timing doesn't include copying the input data to
+    // the gpu or copying the output back.
+    double min_t = benchmark(timing_iterations, 10, [&]() {
+        bilateral_grid(atof(argv[2]), input, output);
+    });
+    printf("%gms\n", min_t * 1e3);
+
+    save_image(output, argv[4]);
+
+    return 0;
+}
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/blur/CMakeLists.txt
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/blur/CMakeLists.txt
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/blur/CMakeLists.txt
@@ -0,0 +1,8 @@
+file(GLOB bcsources ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86_halide_runtime.bc ${CMAKE_CURRENT_SOURCE_DIR}/halide_blur.bc)
+SET_SOURCE_FILES_PROPERTIES(${bcsources} PROPERTIES LANGUAGE CXX)
+
+set(Source ${CMAKE_CURRENT_SOURCE_DIR}/driver.cpp ${bcsources})
+set(PROG halide_blur)
+
+llvm_multisource()
+
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/blur/driver.cpp
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/blur/driver.cpp
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/blur/driver.cpp
@@ -0,0 +1,119 @@
+#include <emmintrin.h>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+
+#include "../common/benchmark.h"
+#include "../common/halide_image.h"
+#include "../common/halide_image_io.h"
+#include "halide_blur.h"
+
+using namespace Halide::Tools;
+
+double t;
+
+Image<uint16_t> blur(Image<uint16_t> in) {
+    Image<uint16_t> tmp(in.width()-8, in.height());
+    Image<uint16_t> out(in.width()-8, in.height()-2);
+
+    t = benchmark(10, 1, [&]() {
+        for (int y = 0; y < tmp.height(); y++)
+            for (int x = 0; x < tmp.width(); x++)
+                tmp(x, y) = (in(x, y) + in(x+1, y) + in(x+2, y))/3;
+
+        for (int y = 0; y < out.height(); y++)
+            for (int x = 0; x < out.width(); x++)
+                out(x, y) = (tmp(x, y) + tmp(x, y+1) + tmp(x, y+2))/3;
+    });
+
+    return out;
+}
+
+
+Image<uint16_t> blur_fast(Image<uint16_t> in) {
+    Image<uint16_t> out(in.width()-8, in.height()-2);
+
+    t = benchmark(10, 1, [&]() {
+        __m128i one_third = _mm_set1_epi16(21846);
+#pragma omp parallel for
+        for (int yTile = 0; yTile < out.height(); yTile += 32) {
+            __m128i a, b, c, sum, avg;
+            __m128i tmp[(128/8) * (32 + 2)];
+            for (int xTile = 0; xTile < out.width(); xTile += 128) {
+                __m128i *tmpPtr = tmp;
+                for (int y = 0; y < 32+2; y++) {
+                    const uint16_t *inPtr = &(in(xTile, yTile+y));
+                    for (int x = 0; x < 128; x += 8) {
+                        a = _mm_load_si128((__m128i*)(inPtr));
+                        b = _mm_loadu_si128((__m128i*)(inPtr+1));
+                        c = _mm_loadu_si128((__m128i*)(inPtr+2));
+                        sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
+                        avg = _mm_mulhi_epi16(sum, one_third);
+                        _mm_store_si128(tmpPtr++, avg);
+                        inPtr+=8;
+                    }
+                }
+                tmpPtr = tmp;
+                for (int y = 0; y < 32; y++) {
+                    __m128i *outPtr = (__m128i *)(&(out(xTile, yTile+y)));
+                    for (int x = 0; x < 128; x += 8) {
+                        a = _mm_load_si128(tmpPtr+(2*128)/8);
+                        b = _mm_load_si128(tmpPtr+128/8);
+                        c = _mm_load_si128(tmpPtr++);
+                        sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
+                        avg = _mm_mulhi_epi16(sum, one_third);
+                        _mm_store_si128(outPtr++, avg);
+                    }
+                }
+            }
+        }
+    });
+
+    return out;
+}
+
+Image<uint16_t> blur_halide(Image<uint16_t> in) {
+    Image<uint16_t> out(in.width()-8, in.height()-2);
+
+    // Call it once to initialize the halide runtime stuff
+    halide_blur(in, out);
+
+    t = benchmark(10, 1, [&]() {
+        // Compute the same region of the output as blur_fast (i.e., we're
+        // still being sloppy with boundary conditions)
+        halide_blur(in, out);
+    });
+
+    return out;
+}
+
+int main(int argc, char **argv) {
+
+    Image<uint16_t> input(6408, 4802);
+
+    for (int y = 0; y < input.height(); y++) {
+        for (int x = 0; x < input.width(); x++) {
+            input(x, y) = rand() & 0xfff;
+        }
+    }
+
+    Image<uint16_t> blurry = blur(input);
+    double slow_time = t;
+
+    Image<uint16_t> speedy = blur_fast(input);
+    double fast_time = t;
+
+    Image<uint16_t> halide = blur_halide(input);
+    double halide_time = t;
+
+    printf("times: %f %f %f\n", slow_time, fast_time, halide_time);
+
+    for (int y = 64; y < input.height() - 64; y++) {
+        for (int x = 64; x < input.width() - 64; x++) {
+            if (blurry(x, y) != speedy(x, y) || blurry(x, y) != halide(x, y))
+                printf("difference at (%d,%d): %d %d %d\n", x, y, blurry(x, y), speedy(x, y), halide(x, y));
+        }
+    }
+
+    return 0;
+}
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/blur/halide_blur.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/blur/halide_blur.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/blur/halide_blur.h
@@ -0,0 +1,42 @@
+#ifndef HALIDE__halide_blur_h
+#define HALIDE__halide_blur_h
+#ifndef HALIDE_ATTRIBUTE_ALIGN
+  #ifdef _MSC_VER
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x))
+  #else
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x)))
+  #endif
+#endif
+#ifndef BUFFER_T_DEFINED
+#define BUFFER_T_DEFINED
+#include <stdbool.h>
+#include <stdint.h>
+typedef struct buffer_t {
+    uint64_t dev;
+    uint8_t* host;
+    int32_t extent[4];
+    int32_t stride[4];
+    int32_t min[4];
+    int32_t elem_size;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)];
+} buffer_t;
+#endif
+struct halide_filter_metadata_t;
+#ifndef HALIDE_FUNCTION_ATTRS
+#define HALIDE_FUNCTION_ATTRS
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int halide_blur(buffer_t *_p0_buffer, buffer_t *_blur_y_buffer) HALIDE_FUNCTION_ATTRS;
+int halide_blur_argv(void **args) HALIDE_FUNCTION_ATTRS;
+// Result is never null and points to constant static data
+const struct halide_filter_metadata_t *halide_blur_metadata() HALIDE_FUNCTION_ATTRS;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/benchmark.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/common/benchmark.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/benchmark.h
@@ -0,0 +1,58 @@
+#ifndef BENCHMARK_H
+#define BENCHMARK_H
+
+#include <limits>
+
+// Benchmark the operation 'op'. The number of iterations refers to
+// how many times the operation is run for each time measurement, the
+// result is the minimum over a number of samples runs. The result is the
+// amount of time in seconds for one iteration.
+#ifdef _WIN32
+
+union _LARGE_INTEGER;
+typedef union _LARGE_INTEGER LARGE_INTEGER;
+extern "C" int __stdcall QueryPerformanceCounter(LARGE_INTEGER*);
+extern "C" int __stdcall QueryPerformanceFrequency(LARGE_INTEGER*);
+
+template <typename F>
+double benchmark(int samples, int iterations, F op) {
+    int64_t freq;
+    QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
+
+    double best = std::numeric_limits<double>::infinity();
+    for (int i = 0; i < samples; i++) {
+        int64_t t1;
+        QueryPerformanceCounter((LARGE_INTEGER*)&t1);
+        for (int j = 0; j < iterations; j++) {
+            op();
+        }
+        int64_t t2;
+        QueryPerformanceCounter((LARGE_INTEGER*)&t2);
+        double dt = (t2 - t1) / static_cast<double>(freq);
+        if (dt < best) best = dt;
+    }
+    return best / iterations;
+}
+
+#else
+
+#include <chrono>
+
+template <typename F>
+double benchmark(int samples, int iterations, F op) {
+    double best = std::numeric_limits<double>::infinity();
+    for (int i = 0; i < samples; i++) {
+        auto t1 = std::chrono::high_resolution_clock::now();
+        for (int j = 0; j < iterations; j++) {
+            op();
+        }
+        auto t2 = std::chrono::high_resolution_clock::now();
+        double dt = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1e6;
+        if (dt < best) best = dt;
+    }
+    return best / iterations;
+}
+
+#endif
+
+#endif
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_buffer.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_buffer.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_buffer.h
@@ -0,0 +1,27 @@
+#ifndef HALIDE_ATTRIBUTE_ALIGN
+  #ifdef _MSC_VER
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x))
+  #else
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x)))
+  #endif
+#endif
+#ifndef BUFFER_T_DEFINED
+#define BUFFER_T_DEFINED
+#include <stdbool.h>
+#include <stdint.h>
+typedef struct buffer_t {
+    uint64_t dev;
+    uint8_t* host;
+    int32_t extent[4];
+    int32_t stride[4];
+    int32_t min[4];
+    int32_t elem_size;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)];
+} buffer_t;
+#endif
+struct halide_filter_metadata_t;
+#ifndef HALIDE_FUNCTION_ATTRS
+#define HALIDE_FUNCTION_ATTRS
+#endif
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image.h
@@ -0,0 +1,234 @@
+// This header defines a simple Image class which wraps a buffer_t. This is
+// useful when interacting with a statically-compiled Halide pipeline emitted by
+// Func::compile_to_file, when you do not want to link your processing program
+// against Halide.h/libHalide.a.
+
+#ifndef HALIDE_TOOLS_IMAGE_H
+#define HALIDE_TOOLS_IMAGE_H
+
+#include <cassert>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <stdint.h>  // <cstdint> requires C++11
+
+//#include "HalideRuntime.h"
+#include "halide_buffer.h"
+
+namespace Halide {
+namespace Tools {
+
+template<typename T>
+class Image {
+    struct Contents {
+        Contents(const buffer_t &b, uint8_t *a) : buf(b), ref_count(1), alloc(a) {}
+        buffer_t buf;
+        int ref_count;
+        uint8_t *alloc;
+
+        void dev_free() {
+            //no-op, currently running on x86
+            //halide_device_free(NULL, &buf);
+        }
+
+        ~Contents() {
+            if (buf.dev) {
+                dev_free();
+            }
+            delete[] alloc;
+        }
+    };
+
+    Contents *contents;
+
+    void initialize(int x, int y, int z, int w, bool interleaved) {
+        buffer_t buf = {0};
+        buf.extent[0] = x;
+        buf.extent[1] = y;
+        buf.extent[2] = z;
+        buf.extent[3] = w;
+        if (interleaved) {
+            buf.stride[0] = z;
+            buf.stride[1] = x*z;
+            buf.stride[2] = 1;
+            buf.stride[3] = x*y*z;
+        } else {
+            buf.stride[0] = 1;
+            buf.stride[1] = x;
+            buf.stride[2] = x*y;
+            buf.stride[3] = x*y*z;
+        }
+        buf.elem_size = sizeof(T);
+
+        size_t size = 1;
+        if (x) size *= x;
+        if (y) size *= y;
+        if (z) size *= z;
+        if (w) size *= w;
+
+        uint8_t *ptr = new uint8_t[sizeof(T)*size + 40];
+        buf.host = ptr;
+        buf.host_dirty = false;
+        buf.dev_dirty = false;
+        buf.dev = 0;
+        while ((size_t)buf.host & 0x1f) buf.host++;
+        contents = new Contents(buf, ptr);
+    }
+
+public:
+    typedef T ElemType;
+
+    Image() : contents(NULL) {
+    }
+
+    Image(int x, int y = 0, int z = 0, int w = 0, bool interleaved = false) {
+        initialize(x, y, z, w, interleaved);
+    }
+
+    Image(const Image &other) : contents(other.contents) {
+        if (contents) {
+            contents->ref_count++;
+        }
+    }
+
+    ~Image() {
+        if (contents) {
+            contents->ref_count--;
+            if (contents->ref_count == 0) {
+                delete contents;
+                contents = NULL;
+            }
+        }
+    }
+
+    Image &operator=(const Image &other) {
+        Contents *p = other.contents;
+        if (p) {
+            p->ref_count++;
+        }
+        if (contents) {
+            contents->ref_count--;
+            if (contents->ref_count == 0) {
+                delete contents;
+                contents = NULL;
+            }
+        }
+        contents = p;
+        return *this;
+    }
+
+    T *data() { return (T*)contents->buf.host; }
+
+    const T *data() const { return (T*)contents->buf.host; }
+
+    void set_host_dirty(bool dirty = true) {
+        // If you use data directly, you must also call this so that
+        // gpu-side code knows that it needs to copy stuff over.
+        contents->buf.host_dirty = dirty;
+    }
+
+    void copy_to_host() {
+        if (contents->buf.dev_dirty) {
+            //halide_copy_to_host(NULL, &contents->buf);
+            contents->buf.dev_dirty = false;
+        }
+    }
+
+    void copy_to_device(const struct halide_device_interface *device_interface) {
+        if (contents->buf.host_dirty) {
+            // If host
+            //halide_copy_to_device(NULL, &contents->buf, device_interface);
+            contents->buf.host_dirty = false;
+        }
+    }
+
+    void dev_free() {
+        assert(!contents->buf.dev_dirty);
+        contents->dev_free();
+    }
+
+    Image(T vals[]) {
+        initialize(sizeof(vals)/sizeof(T));
+        for (int i = 0; i < sizeof(vals); i++) (*this)(i) = vals[i];
+    }
+
+    /** Make sure you've called copy_to_host before you start
+     * accessing pixels directly. */
+    T &operator()(int x, int y = 0, int z = 0, int w = 0) {
+        T *ptr = (T *)contents->buf.host;
+        x -= contents->buf.min[0];
+        y -= contents->buf.min[1];
+        z -= contents->buf.min[2];
+        w -= contents->buf.min[3];
+        size_t s0 = contents->buf.stride[0];
+        size_t s1 = contents->buf.stride[1];
+        size_t s2 = contents->buf.stride[2];
+        size_t s3 = contents->buf.stride[3];
+        return ptr[s0 * x + s1 * y + s2 * z + s3 * w];
+    }
+
+    /** Make sure you've called copy_to_host before you start
+     * accessing pixels directly */
+    const T &operator()(int x, int y = 0, int z = 0, int w = 0) const {
+        const T *ptr = (const T *)contents->buf.host;
+        x -= contents->buf.min[0];
+        y -= contents->buf.min[1];
+        z -= contents->buf.min[2];
+        w -= contents->buf.min[3];
+        size_t s0 = contents->buf.stride[0];
+        size_t s1 = contents->buf.stride[1];
+        size_t s2 = contents->buf.stride[2];
+        size_t s3 = contents->buf.stride[3];
+        return ptr[s0 * x + s1 * y + s2 * z + s3 * w];
+    }
+
+    operator buffer_t *() const {
+        return &(contents->buf);
+    }
+
+    int width() const {
+        return dimensions() > 0 ? contents->buf.extent[0] : 1;
+    }
+
+    int height() const {
+        return dimensions() > 1 ? contents->buf.extent[1] : 1;
+    }
+
+    int channels() const {
+        return dimensions() > 2 ? contents->buf.extent[2] : 1;
+    }
+
+    int dimensions() const {
+        for (int i = 0; i < 4; i++) {
+            if (contents->buf.extent[i] == 0) {
+                return i;
+            }
+        }
+        return 4;
+    }
+
+    int stride(int dim) const {
+        return contents->buf.stride[dim];
+    }
+
+    int min(int dim) const {
+        return contents->buf.min[dim];
+    }
+
+    int extent(int dim) const {
+        return contents->buf.extent[dim];
+    }
+
+    void set_min(int x, int y = 0, int z = 0, int w = 0) {
+        contents->buf.min[0] = x;
+        contents->buf.min[1] = y;
+        contents->buf.min[2] = z;
+        contents->buf.min[3] = w;
+    }
+};
+
+}  // namespace Tools
+}  // namespace Halide
+
+#include "halide_image_info.h"
+#endif  // HALIDE_TOOLS_IMAGE_H
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_info.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_info.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_info.h
@@ -0,0 +1,314 @@
+// This header defines several methods useful for debugging programs that
+// operate on the Image class supporting images with arbitrary dimensions.
+//
+//   Image<uint16_t> input = load_image(argv[1]);
+//
+//   info(input, "input");  // Output the Image header info
+//   dump(input, "input");  // Dump the Image data
+//   stats(input, "input"); // Report statistics for the Image
+//
+//
+#ifndef HALIDE_TOOLS_IMAGE_INFO_H
+#define HALIDE_TOOLS_IMAGE_INFO_H
+
+#include <cassert>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <stdint.h>
+
+#include "halide_buffer.h"
+
+namespace Halide {
+namespace Tools {
+
+static inline void print_dimid(int d, int val) {
+    static const char *dimid[] = {"x", "y", "z", "w"};
+    int numdimid = 4;
+    if (d < numdimid) {
+        std::cout << " " << dimid[d] << ":" << val;
+    } else {
+        std::cout << " extent[" << d << "]:" << val;
+    }
+}
+
+static inline void print_loc(int32_t *loc, int dim, int32_t *min) {
+    for (int d = 0; d < dim; d++) {
+        if (d) { 
+            std::cout << ",";
+        }
+        std::cout << loc[d] + min[d];
+    }
+}
+
+static inline void print_memalign(intptr_t val) {
+    intptr_t align_chk = 1024*1024;
+    while (align_chk > 0) {
+        if ((val & (align_chk-1)) == 0) {
+            char aunit = ' ';
+            if (align_chk >= 1024) {
+                align_chk >>= 10;
+                aunit = 'K';
+            }
+            if (align_chk >= 1024) {
+                align_chk >>= 10;
+                aunit = 'M';
+            }
+            std::cout << "align:" << align_chk;
+            if (aunit != ' ') {
+                std::cout << aunit;
+            }
+            break;
+        }
+        align_chk >>= 1;
+    }
+}
+
+template<typename T>
+void info(Image<T> &img, const char *tag = "Image") {
+    buffer_t *buf = &(*img);
+    int32_t *min = buf->min;
+    int32_t *extent = buf->extent;
+    int32_t *stride = buf->stride;
+    int dim = img.dimensions();
+    int img_bpp = buf->elem_size;
+    int img_tsize = sizeof(T);
+    int img_csize = sizeof(Image<T>);
+    int img_bsize = sizeof(buffer_t);
+    int32_t size = 1;
+    uint64_t dev = buf->dev;
+    bool host_dirty = buf->host_dirty;
+    bool dev_dirty = buf->dev_dirty;
+
+    std::cout << std::endl
+              << "-----------------------------------------------------------------------------";
+    std::cout << std::endl << "Image info: " << tag
+              << " dim:" << dim << " bpp:" << img_bpp;
+    for (int d = 0; d < dim; d++) {
+        print_dimid(d, extent[d]);
+        size *= extent[d];
+    }
+    std::cout << std::endl;
+    std::cout << tag << " class       = 0x" << std::left << std::setw(10) << (void*)img
+                     << std::right << " # ";
+    print_memalign((intptr_t)&img); std::cout << std::endl;
+    std::cout << tag << " class size  = "<< img_csize
+                     << " (0x"<< std::hex << img_csize << std::dec <<")\n";
+    std::cout << tag << "-class => [ 0x" << (void*)&img
+                     << ", 0x" << (void*)(((char*)&img)+img_csize-1)
+                     << " ], # size:" << img_csize << ", ";
+    print_memalign((intptr_t)&img); std::cout << std::endl;
+    std::cout << tag << " buf_t size  = "<< img_bsize
+                     << " (0x"<< std::hex << img_bsize << std::dec <<")\n";
+    std::cout << tag << "-buf_t => [ 0x" << (void*)&buf
+                     << ", 0x" << (void*)(((char*)&buf)+img_bsize-1)
+                     << " ], # size:" << img_bsize << ", ";
+    print_memalign((intptr_t)&buf); std::cout << std::endl;
+    if (img_bpp != img_tsize) {
+        std::cout << tag << " sizeof(T)   = " << img_tsize << std::endl;
+    }
+    std::cout << tag << " host_dirty  = " << host_dirty << std::endl;
+    std::cout << tag << " dev_dirty   = " << dev_dirty << std::endl;
+    std::cout << tag << " dev handle  = " << dev << std::endl;
+    std::cout << tag << " elem_size   = " << img_bpp << std::endl;
+    std::cout << tag << " img_dim     = " << dim << std::endl;
+    std::cout << tag << " width       = " << img.width() << std::endl;
+    std::cout << tag << " height      = " << img.height() << std::endl;
+    std::cout << tag << " channels    = " << img.channels() << std::endl;
+    std::cout << tag << " extent[]    = ";
+    for (int d = 0; d < dim; d++) {
+        std::cout << extent[d] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << tag << " min[]       = ";
+    for (int d = 0; d < dim; d++) {
+        std::cout << min[d] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << tag << " stride[]    = ";
+    for (int d = 0; d < dim; d++) {
+        std::cout << stride[d] << " ";
+    }
+    std::cout << std::endl;
+    if (img_bpp > 1) {
+        for (int d = 0; d < dim; d++) {
+            std::cout << tag << " str[" << d << "]*bpp  = "
+                             << std::left << std::setw(12) << stride[d] * img_bpp
+                             << std::right << " # ";
+            print_memalign(stride[d] * img_bpp); std::cout << std::endl;
+        }
+    }
+
+    const T *img_data = img.data();
+    const T *img_next = img_data + size;
+    int32_t img_size = size * img_bpp;
+    int32_t data_size = (char*)img_next - (char*)img_data;
+    std::cout << tag << " size        = " << size << " (0x"
+                             << std::hex << size << ")" << std::dec << std::endl;
+    std::cout << tag << " img_size    = " << img_size << " (0x"
+                             << std::hex << img_size << ")" << std::dec << std::endl;
+    std::cout << tag << " data        = 0x" << std::left << std::setw(10) << (void *)img_data
+                     << std::right << " # ";
+    print_memalign((intptr_t)img_data); std::cout << std::endl;
+    std::cout << tag << " next        = 0x" << std::left << std::setw(10) << (void *)img_next
+                     << std::right << " # ";
+    print_memalign((intptr_t)img_next); std::cout << std::endl;
+    std::cout << tag << " data_size   = " << data_size  << " (0x"
+                             << std::hex << data_size  << ")" << std::dec << std::endl;
+    std::cout << tag << " => [ 0x" << (void *)img_data
+                         << ", 0x" << (void *)(((char*)img_next)-1)
+                         << "], # size:" << data_size << ", ";
+    print_memalign((intptr_t)img_data); std::cout << std::endl;
+}
+
+template<typename T>
+void dump(Image<T> &img, const char *tag = "Image") {
+    buffer_t *buf = &(*img);
+    int32_t *min = buf->min;
+    int32_t *extent = buf->extent;
+    int32_t *stride = buf->stride;
+    int dim = img.dimensions();
+    int bpp = buf->elem_size;
+    int32_t size = 1;
+
+    std::cout << std::endl << "Image dump: " << tag
+              << " dim:" << dim << " bpp:" << bpp;
+    for (int d = 0; d < dim; d++) {
+        print_dimid(d, extent[d]);
+        size *= extent[d];
+    }
+
+    // Arbitrary dimension image traversal
+    const T *ptr = img.data();
+    int32_t curloc[dim];
+    for (int d = 1; d < dim; d++) {
+        curloc[d] = -1;
+    }
+    curloc[0] = 0;
+
+    for (int32_t i = 0; i < size; i++) {
+        // Track changes in position in higher dimensions
+        for (int d = 1; d < dim; d++) {
+            if ((i % stride[d]) == 0) {
+                curloc[d]++;
+                for (int din = 0; din < d; din++) {
+                    curloc[din] = 0;
+                }
+                std::cout << std::endl;
+                // Print separators for dimensions beyond (x0,y1)
+                if (d > 1) {
+                    print_dimid(d, curloc[d]+min[d]);
+                    std::cout << "\n==========================================";
+                }
+            }
+        }
+
+        // Check for start of row (or wrap due to width)
+        if ((curloc[0] % 16) == 0) {
+            int widx = 0;
+            std::ostringstream idx;
+            if (dim > 1) {   // Multi-dim, just report (x0,y1) on each row
+               idx << "(" << curloc[0]+min[0] << "," << curloc[1]+min[1] << ")";
+               widx = 12;
+            } else {         // Single-dim
+               idx << curloc[0]+min[0];
+               widx = 4;
+            }
+            std::cout << std::endl << std::setw(widx) << idx.str() << ": ";
+        }
+
+        // Display data
+        std::cout << std::setw(4) << *ptr++ + 0 << " ";
+
+        curloc[0]++;  // Track position in row
+    }
+    std::cout << std::endl;
+}
+
+template<typename T>
+void stats(Image<T> &img, const char *tag = "Image") {
+    buffer_t *buf = &(*img);
+    int32_t *min = buf->min;
+    int32_t *extent = buf->extent;
+    int32_t *stride = buf->stride;
+    int dim = img.dimensions();
+    int bpp = buf->elem_size;
+    int32_t size = 1;
+    std::cout << std::endl << "Image stats: " << tag
+              << " dim:" << dim << " bpp:" << bpp;
+    for (int d = 0; d < dim; d++) {
+        print_dimid(d, extent[d]);
+        size *= extent[d];
+    }
+
+    // Arbitrary dimension image traversal
+    const T *ptr = img.data();
+    int32_t curloc[dim];
+    for (int d = 1; d < dim; d++) {
+        curloc[d] = -1;
+    }
+    curloc[0] = 0;
+
+    // Statistics
+    int32_t cnt = 0;
+    double sum = 0;
+    T minval = *ptr;
+    T maxval = *ptr;
+    int32_t minloc[dim];
+    int32_t maxloc[dim];
+    for (int d = 0; d < dim; d++) {
+        minloc[d] = 0;
+        maxloc[d] = 0;
+    }
+
+    for (int32_t i = 0; i < size; i++) {
+        // Track changes in position in higher dimensions
+        for (int d = 1; d < dim; d++) {
+            if ((i % stride[d]) == 0) {
+                curloc[d]++;
+                for (int din = 0; din < d; din++) {
+                    curloc[din] = 0;
+                }
+            }
+        }
+
+        // Collect data
+        T val = *ptr++;
+        sum += val;
+        cnt++;
+        if (val < minval) {
+            minval = val;
+            for (int d = 0; d < dim; d++) {
+                minloc[d] = curloc[d];
+            }
+        }
+        if (val > maxval) {
+            maxval = val;
+            for (int d = 0; d < dim; d++) {
+                maxloc[d] = curloc[d];
+            }
+        }
+
+        curloc[0]++;  // Track position in row
+    }
+
+    double avg = sum / cnt;
+    std::cout << std::endl;
+    std::cout << "min        = " << minval + 0 << " @ (";
+    print_loc(minloc, dim, min);
+    std::cout << ")" << std::endl;
+    std::cout << "max        = " << maxval + 0 << " @ (";
+    print_loc(maxloc, dim, min);
+    std::cout << ")" << std::endl;
+    std::cout << "mean       = " << avg << std::endl;
+    std::cout << "N          = " << cnt << std::endl;
+    std::cout << std::endl;
+}
+
+} // namespace Tools
+} // namespace Halide
+
+#endif  // HALIDE_TOOLS_IMAGE_INFO_H
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_io.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_io.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_io.h
@@ -0,0 +1,320 @@
+// This simple PNG IO library works with *both* the Halide::Image<T> type *and*
+// the simple halide_image.h version. Also now includes PPM support for faster load/save.
+
+#ifndef HALIDE_IMAGE_IO_H
+#define HALIDE_IMAGE_IO_H
+
+#include <algorithm>
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+namespace Halide {
+namespace Tools {
+
+namespace Internal {
+
+typedef bool (*CheckFunc)(bool condition, const char* fmt, ...);
+
+inline bool CheckFail(bool condition, const char* fmt, ...) {
+    if (!condition) {
+        char buffer[1024];
+        va_list args;
+        va_start(args, fmt);
+        vsnprintf(buffer, sizeof(buffer), fmt, args);
+        va_end(args);
+        fprintf(stderr, "%s", buffer);
+        exit(-1);
+    }
+    return condition;
+}
+
+inline bool CheckReturn(bool condition, const char* fmt, ...) {
+    return condition;
+}
+
+// Convert to u8
+inline void convert(uint8_t in, uint8_t &out) {out = in;}
+inline void convert(uint16_t in, uint8_t &out) {out = in >> 8;}
+inline void convert(uint32_t in, uint8_t &out) {out = in >> 24;}
+inline void convert(int8_t in, uint8_t &out) {out = in;}
+inline void convert(int16_t in, uint8_t &out) {out = in >> 8;}
+inline void convert(int32_t in, uint8_t &out) {out = in >> 24;}
+inline void convert(float in, uint8_t &out) {out = (uint8_t)(in*255.0f);}
+inline void convert(double in, uint8_t &out) {out = (uint8_t)(in*255.0f);}
+
+// Convert to u16
+inline void convert(uint8_t in, uint16_t &out) {out = in << 8;}
+inline void convert(uint16_t in, uint16_t &out) {out = in;}
+inline void convert(uint32_t in, uint16_t &out) {out = in >> 16;}
+inline void convert(int8_t in, uint16_t &out) {out = in << 8;}
+inline void convert(int16_t in, uint16_t &out) {out = in;}
+inline void convert(int32_t in, uint16_t &out) {out = in >> 16;}
+inline void convert(float in, uint16_t &out) {out = (uint16_t)(in*65535.0f);}
+inline void convert(double in, uint16_t &out) {out = (uint16_t)(in*65535.0f);}
+
+// Convert from u8
+inline void convert(uint8_t in, uint32_t &out) {out = in << 24;}
+inline void convert(uint8_t in, int8_t &out) {out = in;}
+inline void convert(uint8_t in, int16_t &out) {out = in << 8;}
+inline void convert(uint8_t in, int32_t &out) {out = in << 24;}
+inline void convert(uint8_t in, float &out) {out = in/255.0f;}
+inline void convert(uint8_t in, double &out) {out = in/255.0f;}
+
+// Convert from u16
+inline void convert(uint16_t in, uint32_t &out) {out = in << 16;}
+inline void convert(uint16_t in, int8_t &out) {out = in >> 8;}
+inline void convert(uint16_t in, int16_t &out) {out = in;}
+inline void convert(uint16_t in, int32_t &out) {out = in << 16;}
+inline void convert(uint16_t in, float &out) {out = in/65535.0f;}
+inline void convert(uint16_t in, double &out) {out = in/65535.0f;}
+
+
+inline bool ends_with_ignore_case(const std::string &ac, const std::string &bc) {
+    if (ac.length() < bc.length()) { return false; }
+    std::string a = ac, b = bc;
+    std::transform(a.begin(), a.end(), a.begin(), ::tolower);
+    std::transform(b.begin(), b.end(), b.begin(), ::tolower);
+    return a.compare(a.length()-b.length(), b.length(), b) == 0;
+}
+
+inline bool is_little_endian() {
+    int value = 1;
+    return ((char *) &value)[0] == 1;
+}
+
+inline void swap_endian_16(bool little_endian, uint16_t &value) {
+    if (little_endian) {
+        value = ((value & 0xff)<<8)|((value & 0xff00)>>8);
+    }
+}
+
+struct FileOpener {
+    FileOpener(const char* filename, const char* mode) : f(fopen(filename, mode)) {
+        // nothing
+    }
+    ~FileOpener() {
+        if (f != nullptr) {
+            fclose(f);
+        }
+    }
+    FILE * const f;
+};
+
+}  // namespace Internal
+
+
+struct BytesImgStruct {
+  int dims[3]; //width, height, channels
+  float* ptr;
+};
+
+
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_bytes(const std::string &filename, ImageType *im) {
+    Internal::FileOpener f(filename.c_str(), "rb");
+    if (!check(f.f != nullptr, "File %s could not be opened for reading\n", filename.c_str())) return false;
+
+    BytesImgStruct ptrStruct;
+    if (!check(fread(ptrStruct.dims, sizeof(int), 3, f.f) == 3,
+               "Could not read dimensions (width, height, channels) for .bytes image\n")) return false;
+    int img_size = ptrStruct.dims[0]*ptrStruct.dims[1];
+    if (!check(ptrStruct.dims[0] > 0 && ptrStruct.dims[1] > 0 && ptrStruct.dims[2] > 0,
+               "File %s does not have valid input\n", filename.c_str())) return false;
+    ptrStruct.ptr = (float*) malloc(img_size * sizeof(float));
+    if (!check(fread(ptrStruct.ptr, sizeof(float), img_size, f.f) == img_size,
+               "Could not read .bytes image\n")) return false;
+
+    if (ptrStruct.dims[2] != 1) {
+        *im = ImageType(ptrStruct.dims[0], ptrStruct.dims[1], ptrStruct.dims[2]);
+    } else {
+        *im = ImageType(ptrStruct.dims[0], ptrStruct.dims[1]);
+    }
+    typename ImageType::ElemType *ptr = (typename ImageType::ElemType*)im->data();
+    for (int i=0; i<img_size; i++)
+      ptr[i] = (typename ImageType::ElemType) ptrStruct.ptr[i];
+    free(ptrStruct.ptr);
+    return true;
+}
+
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_bytes(ImageType &im, const std::string &filename) {
+    BytesImgStruct ptrStruct;
+    ptrStruct.dims[0] = im.width();
+    ptrStruct.dims[1] = im.height();
+    ptrStruct.dims[2] = im.channels();
+    int img_size = ptrStruct.dims[0]*ptrStruct.dims[1];
+    ptrStruct.ptr = (float*) malloc(img_size * sizeof(float));
+    typename ImageType::ElemType *ptr = (typename ImageType::ElemType*)im.data();
+    for (int i=0; i<img_size; i++)
+      ptrStruct.ptr[i] = (float) ptr[i];
+
+    Internal::FileOpener f(filename.c_str(), "wb");
+    if (!check(f.f != nullptr, "File %s could not be opened for writing\n", filename.c_str())) return false;
+    if (!check(fwrite (ptrStruct.dims, sizeof(int), 3, f.f),
+               "Could not write dimensions (width, height, channels) for .bytes image\n")) return false;
+    if (!check(fwrite (ptrStruct.ptr, sizeof(float), img_size, f.f),
+               "Could not write .bytes image\n")) return false;
+    return true;
+}
+
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_ppm(const std::string &filename, ImageType *im) {
+
+    /* open file and test for it being a ppm */
+    Internal::FileOpener f(filename.c_str(), "rb");
+    if (!check(f.f != nullptr, "File %s could not be opened for reading\n", filename.c_str())) return false;
+
+    int width, height, maxval;
+    char header[256];
+    if (!check(fscanf(f.f, "%255s", header) == 1, "Could not read PPM header\n")) return false;
+    if (!check(fscanf(f.f, "%d %d\n", &width, &height) == 2, "Could not read PPM width and height\n")) return false;
+    if (!check(fscanf(f.f, "%d", &maxval) == 1, "Could not read PPM max value\n")) return false;
+    if (!check(fgetc(f.f) != EOF, "Could not read char from PPM\n")) return false;
+
+    int bit_depth = 0;
+    if (maxval == 255) { bit_depth = 8; }
+    else if (maxval == 65535) { bit_depth = 16; }
+    else { if (!check(false, "Invalid bit depth in PPM\n")) return false; }
+
+    if (!check(header == std::string("P6") || header == std::string("p6"), "Input is not binary PPM\n")) return false;
+
+    int channels = 3;
+    *im = ImageType(width, height, channels);
+
+    // convert the data to ImageType::ElemType
+    if (bit_depth == 8) {
+        std::vector<uint8_t> data(width*height*3);
+        if (!check(fread((void *) data.data(), sizeof(uint8_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not read PPM 8-bit data\n")) return false;
+        typename ImageType::ElemType *im_data = (typename ImageType::ElemType*) im->data();
+        for (int y = 0; y < im->height(); y++) {
+            uint8_t *row = &data[(y*width)*3];
+            for (int x = 0; x < im->width(); x++) {
+                Internal::convert(*row++, im_data[(0*height+y)*width+x]);
+                Internal::convert(*row++, im_data[(1*height+y)*width+x]);
+                Internal::convert(*row++, im_data[(2*height+y)*width+x]);
+            }
+        }
+    } else if (bit_depth == 16) {
+        int little_endian = Internal::is_little_endian();
+        std::vector<uint16_t> data(width*height*3);
+        if (!check(fread((void *) data.data(), sizeof(uint16_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not read PPM 16-bit data\n")) return false;
+        typename ImageType::ElemType *im_data = (typename ImageType::ElemType*) im->data();
+        for (int y = 0; y < im->height(); y++) {
+            uint16_t *row = &data[(y*width)*3];
+            for (int x = 0; x < im->width(); x++) {
+                uint16_t value;
+                value = *row++; Internal::swap_endian_16(little_endian, value); Internal::convert(value, im_data[(0*height+y)*width+x]);
+                value = *row++; Internal::swap_endian_16(little_endian, value); Internal::convert(value, im_data[(1*height+y)*width+x]);
+                value = *row++; Internal::swap_endian_16(little_endian, value); Internal::convert(value, im_data[(2*height+y)*width+x]);
+            }
+        }
+    }
+    (*im)(0,0,0) = (*im)(0,0,0);      /* Mark dirty inside read/write functions. */
+
+    return true;
+}
+
+// "im" is not const-ref because copy_to_host() is not const.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_ppm(ImageType &im, const std::string &filename) {
+    im.copy_to_host();
+
+    unsigned int bit_depth = sizeof(typename ImageType::ElemType) == 1 ? 8: 16;
+
+    Internal::FileOpener f(filename.c_str(), "wb");
+    if (!check(f.f != nullptr, "File %s could not be opened for writing\n", filename.c_str())) return false;
+    fprintf(f.f, "P6\n%d %d\n%d\n", im.width(), im.height(), (1<<bit_depth)-1);
+    int width = im.width(), height = im.height();
+
+    if (bit_depth == 8) {
+        std::vector<uint8_t> data(width*height*3);
+        for (int y = 0; y < im.height(); y++) {
+            for (int x = 0; x < im.width(); x++) {
+                uint8_t *p = &data[(y*width+x)*3];
+                for (int c = 0; c < im.channels(); c++) {
+                    Internal::convert(im(x, y, c), p[c]);
+                }
+            }
+        }
+        if (!check(fwrite((void *) data.data(), sizeof(uint8_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not write PPM 8-bit data\n")) return false;
+    } else if (bit_depth == 16) {
+        int little_endian = Internal::is_little_endian();
+        std::vector<uint16_t> data(width*height*3);
+        for (int y = 0; y < im.height(); y++) {
+            for (int x = 0; x < im.width(); x++) {
+                uint16_t *p = &data[(y*width+x)*3];
+                for (int c = 0; c < im.channels(); c++) {
+                    uint16_t value;
+                    Internal::convert(im(x, y, c), value);
+                    Internal::swap_endian_16(little_endian, value);
+                    p[c] = value;
+                }
+            }
+        }
+        if (!check(fwrite((void *) data.data(), sizeof(uint16_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not write PPM 16-bit data\n")) return false;
+    } else {
+        return check(false, "We only support saving 8- and 16-bit images.");
+    }
+    return true;
+}
+
+// Returns false upon failure.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load(const std::string &filename, ImageType *im) {
+    if (Internal::ends_with_ignore_case(filename, ".ppm")) {
+        return load_ppm<ImageType, check>(filename, im);
+    } else if (Internal::ends_with_ignore_case(filename, ".bytes")) {
+        return (load_bytes<ImageType, check>(filename, im));
+    } else {
+        return check(false, "[load] unsupported file extension (bytes|ppm supported)");
+    }
+}
+
+// Returns false upon failure.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save(ImageType &im, const std::string &filename) {
+    if (Internal::ends_with_ignore_case(filename, ".ppm")) {
+        return save_ppm<ImageType, check>(im, filename);
+    } else if (Internal::ends_with_ignore_case(filename, ".bytes")) {
+        return save_bytes<ImageType, check>(im, filename);
+    } else {
+        return check(false, "[save] unsupported file extension (bytes|ppm supported)");
+    }
+}
+
+// Fancy wrapper to call load() with CheckFail, inferring the return type;
+// this allows you to simply use
+//
+//    Image im = load_image("filename");
+//
+// without bothering to check error results (all errors simply abort).
+class load_image {
+public:
+    load_image(const std::string &f) : filename(f) {}
+    template<typename ImageType>
+    inline operator ImageType() {
+        ImageType im;
+        (void) load<ImageType, Internal::CheckFail>(filename, &im);
+        return im;
+    }
+private:
+  const std::string filename;
+};
+
+// Fancy wrapper to call save() with CheckFail; this allows you to simply use
+//
+//    save_image(im, "filename");
+//
+// without bothering to check error results (all errors simply abort).
+template<typename ImageType>
+void save_image(ImageType &im, const std::string &filename) {
+    (void) save<ImageType, Internal::CheckFail>(im, filename);
+}
+
+}  // namespace Tools
+}  // namespace Halide
+
+#endif  // HALIDE_IMAGE_IO_H
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/CMakeLists.txt
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/CMakeLists.txt
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/CMakeLists.txt
@@ -0,0 +1,14 @@
+file(GLOB bcsources ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86_halide_runtime.bc ${CMAKE_CURRENT_SOURCE_DIR}/local_laplacian.bc)
+SET_SOURCE_FILES_PROPERTIES(${bcsources} PROPERTIES LANGUAGE CXX)
+
+set(Source ${CMAKE_CURRENT_SOURCE_DIR}/driver.cpp ${bcsources})
+set(PROG halide_local_laplacian)
+
+test_img_input(rgb 8 1 1 10)
+test_img_input(rgba 8 1 1 10)
+
+llvm_multisource()
+
+
+
+
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/driver.cpp
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/driver.cpp
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/driver.cpp
@@ -0,0 +1,33 @@
+#include "../common/benchmark.h"
+#include "../common/halide_image.h"
+#include "../common/halide_image_io.h"
+#include "local_laplacian.h"
+
+using namespace Halide::Tools;
+
+int main(int argc, char **argv) {
+    if (argc < 7) {
+        printf("Usage: ./process input.png levels alpha beta timing_iterations output.png\n"
+               "e.g.: ./process input.png 8 1 1 10 output.png\n");
+        return 0;
+    }
+
+    Image<uint16_t> input = load_image(argv[1]);
+    int levels = atoi(argv[2]);
+    float alpha = atof(argv[3]), beta = atof(argv[4]);
+    Image<uint16_t> output(input.width(), input.height(), 3);
+    int timing = atoi(argv[5]);
+
+    local_laplacian(levels, alpha/(levels-1), beta, input, output);
+
+    // Timing code
+    double best = benchmark(timing, 1, [&]() {
+        local_laplacian(levels, alpha/(levels-1), beta, input, output);
+    });
+    printf("%gus\n", best * 1e6);
+
+    save_image(output, argv[6]);
+
+    return 0;
+}
+
Index: test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/local_laplacian.h
===================================================================
--- test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/local_laplacian.h
+++ test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/local_laplacian.h
@@ -0,0 +1,42 @@
+#ifndef HALIDE__local_laplacian_h
+#define HALIDE__local_laplacian_h
+#ifndef HALIDE_ATTRIBUTE_ALIGN
+  #ifdef _MSC_VER
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x))
+  #else
+    #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x)))
+  #endif
+#endif
+#ifndef BUFFER_T_DEFINED
+#define BUFFER_T_DEFINED
+#include <stdbool.h>
+#include <stdint.h>
+typedef struct buffer_t {
+    uint64_t dev;
+    uint8_t* host;
+    int32_t extent[4];
+    int32_t stride[4];
+    int32_t min[4];
+    int32_t elem_size;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty;
+    HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)];
+} buffer_t;
+#endif
+struct halide_filter_metadata_t;
+#ifndef HALIDE_FUNCTION_ATTRS
+#define HALIDE_FUNCTION_ATTRS
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int local_laplacian(int32_t _levels, float _alpha, float _beta, buffer_t *_input_buffer, buffer_t *_local_laplacian_buffer) HALIDE_FUNCTION_ATTRS;
+int local_laplacian_argv(void **args) HALIDE_FUNCTION_ATTRS;
+// Result is never null and points to constant static data
+const struct halide_filter_metadata_t *local_laplacian_metadata() HALIDE_FUNCTION_ATTRS;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif
Index: test-suite/trunk/Bitcode/CMakeLists.txt
===================================================================
--- test-suite/trunk/Bitcode/CMakeLists.txt
+++ test-suite/trunk/Bitcode/CMakeLists.txt
@@ -8,6 +8,7 @@
 
 set(TEST_SUITE_ENABLE_BITCODE_TESTS ${ENABLE_BITCODE_DEFAULT} CACHE BOOL "Enable bitcode tests")
 if(TEST_SUITE_ENABLE_BITCODE_TESTS)
+  llvm_add_subdirectories(Benchmarks)
   if(NOT TEST_SUITE_BENCHMARKING_ONLY)
     llvm_add_subdirectories(Regression)
     if(ARCH STREQUAL "x86" OR ARCH STREQUAL "AArch64" OR ARCH STREQUAL "ARM")