Index: test-suite/trunk/Bitcode/Benchmarks/CMakeLists.txt =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/CMakeLists.txt +++ test-suite/trunk/Bitcode/Benchmarks/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Halide) Index: test-suite/trunk/Bitcode/Benchmarks/Halide/CMakeLists.txt =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/CMakeLists.txt +++ test-suite/trunk/Bitcode/Benchmarks/Halide/CMakeLists.txt @@ -0,0 +1,22 @@ +if (NOT WIN32) + list(APPEND LDFLAGS -lpthread -ldl) +endif() +if (NOT MSVC) + list(APPEND CXXFLAGS "-std=c++11") +endif() + +macro(test_img_input img) + set(imgpath "${CMAKE_CURRENT_SOURCE_DIR}/../images/${img}") + llvm_test_run(${imgpath}.bytes ${ARGN} + ${CMAKE_CURRENT_BINARY_DIR}/${img}_out.bytes) + llvm_test_verify(${FPCMP} + ${CMAKE_CURRENT_SOURCE_DIR}/output/${img}_out.bytes + ${CMAKE_CURRENT_BINARY_DIR}/${img}_out.bytes + ) +endmacro() + +if(ARCH STREQUAL "x86") + add_subdirectory(local_laplacian) + add_subdirectory(bilateral_grid) + add_subdirectory(blur) +endif() Index: test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/CMakeLists.txt =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/CMakeLists.txt +++ test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/CMakeLists.txt @@ -0,0 +1,14 @@ +file(GLOB bcsources ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86_halide_runtime.bc ${CMAKE_CURRENT_SOURCE_DIR}/bilateral_grid.bc) +SET_SOURCE_FILES_PROPERTIES(${bcsources} PROPERTIES LANGUAGE CXX) + +set(Source ${CMAKE_CURRENT_SOURCE_DIR}/driver.cpp ${bcsources}) +set(PROG halide_bilateral_grid) + +test_img_input(rgb 0.1 10) +test_img_input(rgba 0.1 10) + +llvm_multisource() + + + + Index: test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/bilateral_grid.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/bilateral_grid.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/bilateral_grid.h @@ -0,0 +1,42 @@ +#ifndef HALIDE__bilateral_grid_h +#define HALIDE__bilateral_grid_h +#ifndef HALIDE_ATTRIBUTE_ALIGN + #ifdef _MSC_VER + #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x)) + #else + #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x))) + #endif +#endif +#ifndef BUFFER_T_DEFINED +#define BUFFER_T_DEFINED +#include +#include +typedef struct buffer_t { + uint64_t dev; + uint8_t* host; + int32_t extent[4]; + int32_t stride[4]; + int32_t min[4]; + int32_t elem_size; + HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)]; +} buffer_t; +#endif +struct halide_filter_metadata_t; +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif +#ifdef __cplusplus +extern "C" { +#endif + +int bilateral_grid(float _r_sigma, buffer_t *_input_buffer, buffer_t *_bilateral_grid_buffer) HALIDE_FUNCTION_ATTRS; +int bilateral_grid_argv(void **args) HALIDE_FUNCTION_ATTRS; +// Result is never null and points to constant static data +const struct halide_filter_metadata_t *bilateral_grid_metadata() HALIDE_FUNCTION_ATTRS; + +#ifdef __cplusplus +} // extern "C" +#endif +#endif Index: test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/driver.cpp =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/driver.cpp +++ test-suite/trunk/Bitcode/Benchmarks/Halide/bilateral_grid/driver.cpp @@ -0,0 +1,37 @@ +#include +#include +#include + +#include "../common/benchmark.h" +#include "../common/halide_image.h" +#include "../common/halide_image_io.h" +#include "bilateral_grid.h" + +using namespace Halide::Tools; + +int main(int argc, char **argv) { + + if (argc < 5) { + printf("Usage: ./filter input.png range_sigma timing_iterations output.png\n" + "e.g. ./filter input.png 0.1 10 output.png\n"); + return 0; + } + + int timing_iterations = atoi(argv[3]); + + Image input = load_image(argv[1]); + Image output(input.width(), input.height(), 1); + + bilateral_grid(atof(argv[2]), input, output); + + // Timing code. Timing doesn't include copying the input data to + // the gpu or copying the output back. + double min_t = benchmark(timing_iterations, 10, [&]() { + bilateral_grid(atof(argv[2]), input, output); + }); + printf("%gms\n", min_t * 1e3); + + save_image(output, argv[4]); + + return 0; +} Index: test-suite/trunk/Bitcode/Benchmarks/Halide/blur/CMakeLists.txt =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/blur/CMakeLists.txt +++ test-suite/trunk/Bitcode/Benchmarks/Halide/blur/CMakeLists.txt @@ -0,0 +1,8 @@ +file(GLOB bcsources ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86_halide_runtime.bc ${CMAKE_CURRENT_SOURCE_DIR}/halide_blur.bc) +SET_SOURCE_FILES_PROPERTIES(${bcsources} PROPERTIES LANGUAGE CXX) + +set(Source ${CMAKE_CURRENT_SOURCE_DIR}/driver.cpp ${bcsources}) +set(PROG halide_blur) + +llvm_multisource() + Index: test-suite/trunk/Bitcode/Benchmarks/Halide/blur/driver.cpp =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/blur/driver.cpp +++ test-suite/trunk/Bitcode/Benchmarks/Halide/blur/driver.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#include "../common/benchmark.h" +#include "../common/halide_image.h" +#include "../common/halide_image_io.h" +#include "halide_blur.h" + +using namespace Halide::Tools; + +double t; + +Image blur(Image in) { + Image tmp(in.width()-8, in.height()); + Image out(in.width()-8, in.height()-2); + + t = benchmark(10, 1, [&]() { + for (int y = 0; y < tmp.height(); y++) + for (int x = 0; x < tmp.width(); x++) + tmp(x, y) = (in(x, y) + in(x+1, y) + in(x+2, y))/3; + + for (int y = 0; y < out.height(); y++) + for (int x = 0; x < out.width(); x++) + out(x, y) = (tmp(x, y) + tmp(x, y+1) + tmp(x, y+2))/3; + }); + + return out; +} + + +Image blur_fast(Image in) { + Image out(in.width()-8, in.height()-2); + + t = benchmark(10, 1, [&]() { + __m128i one_third = _mm_set1_epi16(21846); +#pragma omp parallel for + for (int yTile = 0; yTile < out.height(); yTile += 32) { + __m128i a, b, c, sum, avg; + __m128i tmp[(128/8) * (32 + 2)]; + for (int xTile = 0; xTile < out.width(); xTile += 128) { + __m128i *tmpPtr = tmp; + for (int y = 0; y < 32+2; y++) { + const uint16_t *inPtr = &(in(xTile, yTile+y)); + for (int x = 0; x < 128; x += 8) { + a = _mm_load_si128((__m128i*)(inPtr)); + b = _mm_loadu_si128((__m128i*)(inPtr+1)); + c = _mm_loadu_si128((__m128i*)(inPtr+2)); + sum = _mm_add_epi16(_mm_add_epi16(a, b), c); + avg = _mm_mulhi_epi16(sum, one_third); + _mm_store_si128(tmpPtr++, avg); + inPtr+=8; + } + } + tmpPtr = tmp; + for (int y = 0; y < 32; y++) { + __m128i *outPtr = (__m128i *)(&(out(xTile, yTile+y))); + for (int x = 0; x < 128; x += 8) { + a = _mm_load_si128(tmpPtr+(2*128)/8); + b = _mm_load_si128(tmpPtr+128/8); + c = _mm_load_si128(tmpPtr++); + sum = _mm_add_epi16(_mm_add_epi16(a, b), c); + avg = _mm_mulhi_epi16(sum, one_third); + _mm_store_si128(outPtr++, avg); + } + } + } + } + }); + + return out; +} + +Image blur_halide(Image in) { + Image out(in.width()-8, in.height()-2); + + // Call it once to initialize the halide runtime stuff + halide_blur(in, out); + + t = benchmark(10, 1, [&]() { + // Compute the same region of the output as blur_fast (i.e., we're + // still being sloppy with boundary conditions) + halide_blur(in, out); + }); + + return out; +} + +int main(int argc, char **argv) { + + Image input(6408, 4802); + + for (int y = 0; y < input.height(); y++) { + for (int x = 0; x < input.width(); x++) { + input(x, y) = rand() & 0xfff; + } + } + + Image blurry = blur(input); + double slow_time = t; + + Image speedy = blur_fast(input); + double fast_time = t; + + Image halide = blur_halide(input); + double halide_time = t; + + printf("times: %f %f %f\n", slow_time, fast_time, halide_time); + + for (int y = 64; y < input.height() - 64; y++) { + for (int x = 64; x < input.width() - 64; x++) { + if (blurry(x, y) != speedy(x, y) || blurry(x, y) != halide(x, y)) + printf("difference at (%d,%d): %d %d %d\n", x, y, blurry(x, y), speedy(x, y), halide(x, y)); + } + } + + return 0; +} Index: test-suite/trunk/Bitcode/Benchmarks/Halide/blur/halide_blur.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/blur/halide_blur.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/blur/halide_blur.h @@ -0,0 +1,42 @@ +#ifndef HALIDE__halide_blur_h +#define HALIDE__halide_blur_h +#ifndef HALIDE_ATTRIBUTE_ALIGN + #ifdef _MSC_VER + #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x)) + #else + #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x))) + #endif +#endif +#ifndef BUFFER_T_DEFINED +#define BUFFER_T_DEFINED +#include +#include +typedef struct buffer_t { + uint64_t dev; + uint8_t* host; + int32_t extent[4]; + int32_t stride[4]; + int32_t min[4]; + int32_t elem_size; + HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)]; +} buffer_t; +#endif +struct halide_filter_metadata_t; +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif +#ifdef __cplusplus +extern "C" { +#endif + +int halide_blur(buffer_t *_p0_buffer, buffer_t *_blur_y_buffer) HALIDE_FUNCTION_ATTRS; +int halide_blur_argv(void **args) HALIDE_FUNCTION_ATTRS; +// Result is never null and points to constant static data +const struct halide_filter_metadata_t *halide_blur_metadata() HALIDE_FUNCTION_ATTRS; + +#ifdef __cplusplus +} // extern "C" +#endif +#endif Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/benchmark.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/common/benchmark.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/benchmark.h @@ -0,0 +1,58 @@ +#ifndef BENCHMARK_H +#define BENCHMARK_H + +#include + +// Benchmark the operation 'op'. The number of iterations refers to +// how many times the operation is run for each time measurement, the +// result is the minimum over a number of samples runs. The result is the +// amount of time in seconds for one iteration. +#ifdef _WIN32 + +union _LARGE_INTEGER; +typedef union _LARGE_INTEGER LARGE_INTEGER; +extern "C" int __stdcall QueryPerformanceCounter(LARGE_INTEGER*); +extern "C" int __stdcall QueryPerformanceFrequency(LARGE_INTEGER*); + +template +double benchmark(int samples, int iterations, F op) { + int64_t freq; + QueryPerformanceFrequency((LARGE_INTEGER*)&freq); + + double best = std::numeric_limits::infinity(); + for (int i = 0; i < samples; i++) { + int64_t t1; + QueryPerformanceCounter((LARGE_INTEGER*)&t1); + for (int j = 0; j < iterations; j++) { + op(); + } + int64_t t2; + QueryPerformanceCounter((LARGE_INTEGER*)&t2); + double dt = (t2 - t1) / static_cast(freq); + if (dt < best) best = dt; + } + return best / iterations; +} + +#else + +#include + +template +double benchmark(int samples, int iterations, F op) { + double best = std::numeric_limits::infinity(); + for (int i = 0; i < samples; i++) { + auto t1 = std::chrono::high_resolution_clock::now(); + for (int j = 0; j < iterations; j++) { + op(); + } + auto t2 = std::chrono::high_resolution_clock::now(); + double dt = std::chrono::duration_cast(t2 - t1).count() / 1e6; + if (dt < best) best = dt; + } + return best / iterations; +} + +#endif + +#endif Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_buffer.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_buffer.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_buffer.h @@ -0,0 +1,27 @@ +#ifndef HALIDE_ATTRIBUTE_ALIGN + #ifdef _MSC_VER + #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x)) + #else + #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x))) + #endif +#endif +#ifndef BUFFER_T_DEFINED +#define BUFFER_T_DEFINED +#include +#include +typedef struct buffer_t { + uint64_t dev; + uint8_t* host; + int32_t extent[4]; + int32_t stride[4]; + int32_t min[4]; + int32_t elem_size; + HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)]; +} buffer_t; +#endif +struct halide_filter_metadata_t; +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image.h @@ -0,0 +1,234 @@ +// This header defines a simple Image class which wraps a buffer_t. This is +// useful when interacting with a statically-compiled Halide pipeline emitted by +// Func::compile_to_file, when you do not want to link your processing program +// against Halide.h/libHalide.a. + +#ifndef HALIDE_TOOLS_IMAGE_H +#define HALIDE_TOOLS_IMAGE_H + +#include +#include +#include +#include +#include // requires C++11 + +//#include "HalideRuntime.h" +#include "halide_buffer.h" + +namespace Halide { +namespace Tools { + +template +class Image { + struct Contents { + Contents(const buffer_t &b, uint8_t *a) : buf(b), ref_count(1), alloc(a) {} + buffer_t buf; + int ref_count; + uint8_t *alloc; + + void dev_free() { + //no-op, currently running on x86 + //halide_device_free(NULL, &buf); + } + + ~Contents() { + if (buf.dev) { + dev_free(); + } + delete[] alloc; + } + }; + + Contents *contents; + + void initialize(int x, int y, int z, int w, bool interleaved) { + buffer_t buf = {0}; + buf.extent[0] = x; + buf.extent[1] = y; + buf.extent[2] = z; + buf.extent[3] = w; + if (interleaved) { + buf.stride[0] = z; + buf.stride[1] = x*z; + buf.stride[2] = 1; + buf.stride[3] = x*y*z; + } else { + buf.stride[0] = 1; + buf.stride[1] = x; + buf.stride[2] = x*y; + buf.stride[3] = x*y*z; + } + buf.elem_size = sizeof(T); + + size_t size = 1; + if (x) size *= x; + if (y) size *= y; + if (z) size *= z; + if (w) size *= w; + + uint8_t *ptr = new uint8_t[sizeof(T)*size + 40]; + buf.host = ptr; + buf.host_dirty = false; + buf.dev_dirty = false; + buf.dev = 0; + while ((size_t)buf.host & 0x1f) buf.host++; + contents = new Contents(buf, ptr); + } + +public: + typedef T ElemType; + + Image() : contents(NULL) { + } + + Image(int x, int y = 0, int z = 0, int w = 0, bool interleaved = false) { + initialize(x, y, z, w, interleaved); + } + + Image(const Image &other) : contents(other.contents) { + if (contents) { + contents->ref_count++; + } + } + + ~Image() { + if (contents) { + contents->ref_count--; + if (contents->ref_count == 0) { + delete contents; + contents = NULL; + } + } + } + + Image &operator=(const Image &other) { + Contents *p = other.contents; + if (p) { + p->ref_count++; + } + if (contents) { + contents->ref_count--; + if (contents->ref_count == 0) { + delete contents; + contents = NULL; + } + } + contents = p; + return *this; + } + + T *data() { return (T*)contents->buf.host; } + + const T *data() const { return (T*)contents->buf.host; } + + void set_host_dirty(bool dirty = true) { + // If you use data directly, you must also call this so that + // gpu-side code knows that it needs to copy stuff over. + contents->buf.host_dirty = dirty; + } + + void copy_to_host() { + if (contents->buf.dev_dirty) { + //halide_copy_to_host(NULL, &contents->buf); + contents->buf.dev_dirty = false; + } + } + + void copy_to_device(const struct halide_device_interface *device_interface) { + if (contents->buf.host_dirty) { + // If host + //halide_copy_to_device(NULL, &contents->buf, device_interface); + contents->buf.host_dirty = false; + } + } + + void dev_free() { + assert(!contents->buf.dev_dirty); + contents->dev_free(); + } + + Image(T vals[]) { + initialize(sizeof(vals)/sizeof(T)); + for (int i = 0; i < sizeof(vals); i++) (*this)(i) = vals[i]; + } + + /** Make sure you've called copy_to_host before you start + * accessing pixels directly. */ + T &operator()(int x, int y = 0, int z = 0, int w = 0) { + T *ptr = (T *)contents->buf.host; + x -= contents->buf.min[0]; + y -= contents->buf.min[1]; + z -= contents->buf.min[2]; + w -= contents->buf.min[3]; + size_t s0 = contents->buf.stride[0]; + size_t s1 = contents->buf.stride[1]; + size_t s2 = contents->buf.stride[2]; + size_t s3 = contents->buf.stride[3]; + return ptr[s0 * x + s1 * y + s2 * z + s3 * w]; + } + + /** Make sure you've called copy_to_host before you start + * accessing pixels directly */ + const T &operator()(int x, int y = 0, int z = 0, int w = 0) const { + const T *ptr = (const T *)contents->buf.host; + x -= contents->buf.min[0]; + y -= contents->buf.min[1]; + z -= contents->buf.min[2]; + w -= contents->buf.min[3]; + size_t s0 = contents->buf.stride[0]; + size_t s1 = contents->buf.stride[1]; + size_t s2 = contents->buf.stride[2]; + size_t s3 = contents->buf.stride[3]; + return ptr[s0 * x + s1 * y + s2 * z + s3 * w]; + } + + operator buffer_t *() const { + return &(contents->buf); + } + + int width() const { + return dimensions() > 0 ? contents->buf.extent[0] : 1; + } + + int height() const { + return dimensions() > 1 ? contents->buf.extent[1] : 1; + } + + int channels() const { + return dimensions() > 2 ? contents->buf.extent[2] : 1; + } + + int dimensions() const { + for (int i = 0; i < 4; i++) { + if (contents->buf.extent[i] == 0) { + return i; + } + } + return 4; + } + + int stride(int dim) const { + return contents->buf.stride[dim]; + } + + int min(int dim) const { + return contents->buf.min[dim]; + } + + int extent(int dim) const { + return contents->buf.extent[dim]; + } + + void set_min(int x, int y = 0, int z = 0, int w = 0) { + contents->buf.min[0] = x; + contents->buf.min[1] = y; + contents->buf.min[2] = z; + contents->buf.min[3] = w; + } +}; + +} // namespace Tools +} // namespace Halide + +#include "halide_image_info.h" +#endif // HALIDE_TOOLS_IMAGE_H Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_info.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_info.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_info.h @@ -0,0 +1,314 @@ +// This header defines several methods useful for debugging programs that +// operate on the Image class supporting images with arbitrary dimensions. +// +// Image input = load_image(argv[1]); +// +// info(input, "input"); // Output the Image header info +// dump(input, "input"); // Dump the Image data +// stats(input, "input"); // Report statistics for the Image +// +// +#ifndef HALIDE_TOOLS_IMAGE_INFO_H +#define HALIDE_TOOLS_IMAGE_INFO_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "halide_buffer.h" + +namespace Halide { +namespace Tools { + +static inline void print_dimid(int d, int val) { + static const char *dimid[] = {"x", "y", "z", "w"}; + int numdimid = 4; + if (d < numdimid) { + std::cout << " " << dimid[d] << ":" << val; + } else { + std::cout << " extent[" << d << "]:" << val; + } +} + +static inline void print_loc(int32_t *loc, int dim, int32_t *min) { + for (int d = 0; d < dim; d++) { + if (d) { + std::cout << ","; + } + std::cout << loc[d] + min[d]; + } +} + +static inline void print_memalign(intptr_t val) { + intptr_t align_chk = 1024*1024; + while (align_chk > 0) { + if ((val & (align_chk-1)) == 0) { + char aunit = ' '; + if (align_chk >= 1024) { + align_chk >>= 10; + aunit = 'K'; + } + if (align_chk >= 1024) { + align_chk >>= 10; + aunit = 'M'; + } + std::cout << "align:" << align_chk; + if (aunit != ' ') { + std::cout << aunit; + } + break; + } + align_chk >>= 1; + } +} + +template +void info(Image &img, const char *tag = "Image") { + buffer_t *buf = &(*img); + int32_t *min = buf->min; + int32_t *extent = buf->extent; + int32_t *stride = buf->stride; + int dim = img.dimensions(); + int img_bpp = buf->elem_size; + int img_tsize = sizeof(T); + int img_csize = sizeof(Image); + int img_bsize = sizeof(buffer_t); + int32_t size = 1; + uint64_t dev = buf->dev; + bool host_dirty = buf->host_dirty; + bool dev_dirty = buf->dev_dirty; + + std::cout << std::endl + << "-----------------------------------------------------------------------------"; + std::cout << std::endl << "Image info: " << tag + << " dim:" << dim << " bpp:" << img_bpp; + for (int d = 0; d < dim; d++) { + print_dimid(d, extent[d]); + size *= extent[d]; + } + std::cout << std::endl; + std::cout << tag << " class = 0x" << std::left << std::setw(10) << (void*)img + << std::right << " # "; + print_memalign((intptr_t)&img); std::cout << std::endl; + std::cout << tag << " class size = "<< img_csize + << " (0x"<< std::hex << img_csize << std::dec <<")\n"; + std::cout << tag << "-class => [ 0x" << (void*)&img + << ", 0x" << (void*)(((char*)&img)+img_csize-1) + << " ], # size:" << img_csize << ", "; + print_memalign((intptr_t)&img); std::cout << std::endl; + std::cout << tag << " buf_t size = "<< img_bsize + << " (0x"<< std::hex << img_bsize << std::dec <<")\n"; + std::cout << tag << "-buf_t => [ 0x" << (void*)&buf + << ", 0x" << (void*)(((char*)&buf)+img_bsize-1) + << " ], # size:" << img_bsize << ", "; + print_memalign((intptr_t)&buf); std::cout << std::endl; + if (img_bpp != img_tsize) { + std::cout << tag << " sizeof(T) = " << img_tsize << std::endl; + } + std::cout << tag << " host_dirty = " << host_dirty << std::endl; + std::cout << tag << " dev_dirty = " << dev_dirty << std::endl; + std::cout << tag << " dev handle = " << dev << std::endl; + std::cout << tag << " elem_size = " << img_bpp << std::endl; + std::cout << tag << " img_dim = " << dim << std::endl; + std::cout << tag << " width = " << img.width() << std::endl; + std::cout << tag << " height = " << img.height() << std::endl; + std::cout << tag << " channels = " << img.channels() << std::endl; + std::cout << tag << " extent[] = "; + for (int d = 0; d < dim; d++) { + std::cout << extent[d] << " "; + } + std::cout << std::endl; + std::cout << tag << " min[] = "; + for (int d = 0; d < dim; d++) { + std::cout << min[d] << " "; + } + std::cout << std::endl; + std::cout << tag << " stride[] = "; + for (int d = 0; d < dim; d++) { + std::cout << stride[d] << " "; + } + std::cout << std::endl; + if (img_bpp > 1) { + for (int d = 0; d < dim; d++) { + std::cout << tag << " str[" << d << "]*bpp = " + << std::left << std::setw(12) << stride[d] * img_bpp + << std::right << " # "; + print_memalign(stride[d] * img_bpp); std::cout << std::endl; + } + } + + const T *img_data = img.data(); + const T *img_next = img_data + size; + int32_t img_size = size * img_bpp; + int32_t data_size = (char*)img_next - (char*)img_data; + std::cout << tag << " size = " << size << " (0x" + << std::hex << size << ")" << std::dec << std::endl; + std::cout << tag << " img_size = " << img_size << " (0x" + << std::hex << img_size << ")" << std::dec << std::endl; + std::cout << tag << " data = 0x" << std::left << std::setw(10) << (void *)img_data + << std::right << " # "; + print_memalign((intptr_t)img_data); std::cout << std::endl; + std::cout << tag << " next = 0x" << std::left << std::setw(10) << (void *)img_next + << std::right << " # "; + print_memalign((intptr_t)img_next); std::cout << std::endl; + std::cout << tag << " data_size = " << data_size << " (0x" + << std::hex << data_size << ")" << std::dec << std::endl; + std::cout << tag << " => [ 0x" << (void *)img_data + << ", 0x" << (void *)(((char*)img_next)-1) + << "], # size:" << data_size << ", "; + print_memalign((intptr_t)img_data); std::cout << std::endl; +} + +template +void dump(Image &img, const char *tag = "Image") { + buffer_t *buf = &(*img); + int32_t *min = buf->min; + int32_t *extent = buf->extent; + int32_t *stride = buf->stride; + int dim = img.dimensions(); + int bpp = buf->elem_size; + int32_t size = 1; + + std::cout << std::endl << "Image dump: " << tag + << " dim:" << dim << " bpp:" << bpp; + for (int d = 0; d < dim; d++) { + print_dimid(d, extent[d]); + size *= extent[d]; + } + + // Arbitrary dimension image traversal + const T *ptr = img.data(); + int32_t curloc[dim]; + for (int d = 1; d < dim; d++) { + curloc[d] = -1; + } + curloc[0] = 0; + + for (int32_t i = 0; i < size; i++) { + // Track changes in position in higher dimensions + for (int d = 1; d < dim; d++) { + if ((i % stride[d]) == 0) { + curloc[d]++; + for (int din = 0; din < d; din++) { + curloc[din] = 0; + } + std::cout << std::endl; + // Print separators for dimensions beyond (x0,y1) + if (d > 1) { + print_dimid(d, curloc[d]+min[d]); + std::cout << "\n=========================================="; + } + } + } + + // Check for start of row (or wrap due to width) + if ((curloc[0] % 16) == 0) { + int widx = 0; + std::ostringstream idx; + if (dim > 1) { // Multi-dim, just report (x0,y1) on each row + idx << "(" << curloc[0]+min[0] << "," << curloc[1]+min[1] << ")"; + widx = 12; + } else { // Single-dim + idx << curloc[0]+min[0]; + widx = 4; + } + std::cout << std::endl << std::setw(widx) << idx.str() << ": "; + } + + // Display data + std::cout << std::setw(4) << *ptr++ + 0 << " "; + + curloc[0]++; // Track position in row + } + std::cout << std::endl; +} + +template +void stats(Image &img, const char *tag = "Image") { + buffer_t *buf = &(*img); + int32_t *min = buf->min; + int32_t *extent = buf->extent; + int32_t *stride = buf->stride; + int dim = img.dimensions(); + int bpp = buf->elem_size; + int32_t size = 1; + std::cout << std::endl << "Image stats: " << tag + << " dim:" << dim << " bpp:" << bpp; + for (int d = 0; d < dim; d++) { + print_dimid(d, extent[d]); + size *= extent[d]; + } + + // Arbitrary dimension image traversal + const T *ptr = img.data(); + int32_t curloc[dim]; + for (int d = 1; d < dim; d++) { + curloc[d] = -1; + } + curloc[0] = 0; + + // Statistics + int32_t cnt = 0; + double sum = 0; + T minval = *ptr; + T maxval = *ptr; + int32_t minloc[dim]; + int32_t maxloc[dim]; + for (int d = 0; d < dim; d++) { + minloc[d] = 0; + maxloc[d] = 0; + } + + for (int32_t i = 0; i < size; i++) { + // Track changes in position in higher dimensions + for (int d = 1; d < dim; d++) { + if ((i % stride[d]) == 0) { + curloc[d]++; + for (int din = 0; din < d; din++) { + curloc[din] = 0; + } + } + } + + // Collect data + T val = *ptr++; + sum += val; + cnt++; + if (val < minval) { + minval = val; + for (int d = 0; d < dim; d++) { + minloc[d] = curloc[d]; + } + } + if (val > maxval) { + maxval = val; + for (int d = 0; d < dim; d++) { + maxloc[d] = curloc[d]; + } + } + + curloc[0]++; // Track position in row + } + + double avg = sum / cnt; + std::cout << std::endl; + std::cout << "min = " << minval + 0 << " @ ("; + print_loc(minloc, dim, min); + std::cout << ")" << std::endl; + std::cout << "max = " << maxval + 0 << " @ ("; + print_loc(maxloc, dim, min); + std::cout << ")" << std::endl; + std::cout << "mean = " << avg << std::endl; + std::cout << "N = " << cnt << std::endl; + std::cout << std::endl; +} + +} // namespace Tools +} // namespace Halide + +#endif // HALIDE_TOOLS_IMAGE_INFO_H Index: test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_io.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_io.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/common/halide_image_io.h @@ -0,0 +1,320 @@ +// This simple PNG IO library works with *both* the Halide::Image type *and* +// the simple halide_image.h version. Also now includes PPM support for faster load/save. + +#ifndef HALIDE_IMAGE_IO_H +#define HALIDE_IMAGE_IO_H + +#include +#include +#include +#include +#include +#include + +namespace Halide { +namespace Tools { + +namespace Internal { + +typedef bool (*CheckFunc)(bool condition, const char* fmt, ...); + +inline bool CheckFail(bool condition, const char* fmt, ...) { + if (!condition) { + char buffer[1024]; + va_list args; + va_start(args, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, args); + va_end(args); + fprintf(stderr, "%s", buffer); + exit(-1); + } + return condition; +} + +inline bool CheckReturn(bool condition, const char* fmt, ...) { + return condition; +} + +// Convert to u8 +inline void convert(uint8_t in, uint8_t &out) {out = in;} +inline void convert(uint16_t in, uint8_t &out) {out = in >> 8;} +inline void convert(uint32_t in, uint8_t &out) {out = in >> 24;} +inline void convert(int8_t in, uint8_t &out) {out = in;} +inline void convert(int16_t in, uint8_t &out) {out = in >> 8;} +inline void convert(int32_t in, uint8_t &out) {out = in >> 24;} +inline void convert(float in, uint8_t &out) {out = (uint8_t)(in*255.0f);} +inline void convert(double in, uint8_t &out) {out = (uint8_t)(in*255.0f);} + +// Convert to u16 +inline void convert(uint8_t in, uint16_t &out) {out = in << 8;} +inline void convert(uint16_t in, uint16_t &out) {out = in;} +inline void convert(uint32_t in, uint16_t &out) {out = in >> 16;} +inline void convert(int8_t in, uint16_t &out) {out = in << 8;} +inline void convert(int16_t in, uint16_t &out) {out = in;} +inline void convert(int32_t in, uint16_t &out) {out = in >> 16;} +inline void convert(float in, uint16_t &out) {out = (uint16_t)(in*65535.0f);} +inline void convert(double in, uint16_t &out) {out = (uint16_t)(in*65535.0f);} + +// Convert from u8 +inline void convert(uint8_t in, uint32_t &out) {out = in << 24;} +inline void convert(uint8_t in, int8_t &out) {out = in;} +inline void convert(uint8_t in, int16_t &out) {out = in << 8;} +inline void convert(uint8_t in, int32_t &out) {out = in << 24;} +inline void convert(uint8_t in, float &out) {out = in/255.0f;} +inline void convert(uint8_t in, double &out) {out = in/255.0f;} + +// Convert from u16 +inline void convert(uint16_t in, uint32_t &out) {out = in << 16;} +inline void convert(uint16_t in, int8_t &out) {out = in >> 8;} +inline void convert(uint16_t in, int16_t &out) {out = in;} +inline void convert(uint16_t in, int32_t &out) {out = in << 16;} +inline void convert(uint16_t in, float &out) {out = in/65535.0f;} +inline void convert(uint16_t in, double &out) {out = in/65535.0f;} + + +inline bool ends_with_ignore_case(const std::string &ac, const std::string &bc) { + if (ac.length() < bc.length()) { return false; } + std::string a = ac, b = bc; + std::transform(a.begin(), a.end(), a.begin(), ::tolower); + std::transform(b.begin(), b.end(), b.begin(), ::tolower); + return a.compare(a.length()-b.length(), b.length(), b) == 0; +} + +inline bool is_little_endian() { + int value = 1; + return ((char *) &value)[0] == 1; +} + +inline void swap_endian_16(bool little_endian, uint16_t &value) { + if (little_endian) { + value = ((value & 0xff)<<8)|((value & 0xff00)>>8); + } +} + +struct FileOpener { + FileOpener(const char* filename, const char* mode) : f(fopen(filename, mode)) { + // nothing + } + ~FileOpener() { + if (f != nullptr) { + fclose(f); + } + } + FILE * const f; +}; + +} // namespace Internal + + +struct BytesImgStruct { + int dims[3]; //width, height, channels + float* ptr; +}; + + +template +bool load_bytes(const std::string &filename, ImageType *im) { + Internal::FileOpener f(filename.c_str(), "rb"); + if (!check(f.f != nullptr, "File %s could not be opened for reading\n", filename.c_str())) return false; + + BytesImgStruct ptrStruct; + if (!check(fread(ptrStruct.dims, sizeof(int), 3, f.f) == 3, + "Could not read dimensions (width, height, channels) for .bytes image\n")) return false; + int img_size = ptrStruct.dims[0]*ptrStruct.dims[1]; + if (!check(ptrStruct.dims[0] > 0 && ptrStruct.dims[1] > 0 && ptrStruct.dims[2] > 0, + "File %s does not have valid input\n", filename.c_str())) return false; + ptrStruct.ptr = (float*) malloc(img_size * sizeof(float)); + if (!check(fread(ptrStruct.ptr, sizeof(float), img_size, f.f) == img_size, + "Could not read .bytes image\n")) return false; + + if (ptrStruct.dims[2] != 1) { + *im = ImageType(ptrStruct.dims[0], ptrStruct.dims[1], ptrStruct.dims[2]); + } else { + *im = ImageType(ptrStruct.dims[0], ptrStruct.dims[1]); + } + typename ImageType::ElemType *ptr = (typename ImageType::ElemType*)im->data(); + for (int i=0; i +bool save_bytes(ImageType &im, const std::string &filename) { + BytesImgStruct ptrStruct; + ptrStruct.dims[0] = im.width(); + ptrStruct.dims[1] = im.height(); + ptrStruct.dims[2] = im.channels(); + int img_size = ptrStruct.dims[0]*ptrStruct.dims[1]; + ptrStruct.ptr = (float*) malloc(img_size * sizeof(float)); + typename ImageType::ElemType *ptr = (typename ImageType::ElemType*)im.data(); + for (int i=0; i +bool load_ppm(const std::string &filename, ImageType *im) { + + /* open file and test for it being a ppm */ + Internal::FileOpener f(filename.c_str(), "rb"); + if (!check(f.f != nullptr, "File %s could not be opened for reading\n", filename.c_str())) return false; + + int width, height, maxval; + char header[256]; + if (!check(fscanf(f.f, "%255s", header) == 1, "Could not read PPM header\n")) return false; + if (!check(fscanf(f.f, "%d %d\n", &width, &height) == 2, "Could not read PPM width and height\n")) return false; + if (!check(fscanf(f.f, "%d", &maxval) == 1, "Could not read PPM max value\n")) return false; + if (!check(fgetc(f.f) != EOF, "Could not read char from PPM\n")) return false; + + int bit_depth = 0; + if (maxval == 255) { bit_depth = 8; } + else if (maxval == 65535) { bit_depth = 16; } + else { if (!check(false, "Invalid bit depth in PPM\n")) return false; } + + if (!check(header == std::string("P6") || header == std::string("p6"), "Input is not binary PPM\n")) return false; + + int channels = 3; + *im = ImageType(width, height, channels); + + // convert the data to ImageType::ElemType + if (bit_depth == 8) { + std::vector data(width*height*3); + if (!check(fread((void *) data.data(), sizeof(uint8_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not read PPM 8-bit data\n")) return false; + typename ImageType::ElemType *im_data = (typename ImageType::ElemType*) im->data(); + for (int y = 0; y < im->height(); y++) { + uint8_t *row = &data[(y*width)*3]; + for (int x = 0; x < im->width(); x++) { + Internal::convert(*row++, im_data[(0*height+y)*width+x]); + Internal::convert(*row++, im_data[(1*height+y)*width+x]); + Internal::convert(*row++, im_data[(2*height+y)*width+x]); + } + } + } else if (bit_depth == 16) { + int little_endian = Internal::is_little_endian(); + std::vector data(width*height*3); + if (!check(fread((void *) data.data(), sizeof(uint16_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not read PPM 16-bit data\n")) return false; + typename ImageType::ElemType *im_data = (typename ImageType::ElemType*) im->data(); + for (int y = 0; y < im->height(); y++) { + uint16_t *row = &data[(y*width)*3]; + for (int x = 0; x < im->width(); x++) { + uint16_t value; + value = *row++; Internal::swap_endian_16(little_endian, value); Internal::convert(value, im_data[(0*height+y)*width+x]); + value = *row++; Internal::swap_endian_16(little_endian, value); Internal::convert(value, im_data[(1*height+y)*width+x]); + value = *row++; Internal::swap_endian_16(little_endian, value); Internal::convert(value, im_data[(2*height+y)*width+x]); + } + } + } + (*im)(0,0,0) = (*im)(0,0,0); /* Mark dirty inside read/write functions. */ + + return true; +} + +// "im" is not const-ref because copy_to_host() is not const. +template +bool save_ppm(ImageType &im, const std::string &filename) { + im.copy_to_host(); + + unsigned int bit_depth = sizeof(typename ImageType::ElemType) == 1 ? 8: 16; + + Internal::FileOpener f(filename.c_str(), "wb"); + if (!check(f.f != nullptr, "File %s could not be opened for writing\n", filename.c_str())) return false; + fprintf(f.f, "P6\n%d %d\n%d\n", im.width(), im.height(), (1< data(width*height*3); + for (int y = 0; y < im.height(); y++) { + for (int x = 0; x < im.width(); x++) { + uint8_t *p = &data[(y*width+x)*3]; + for (int c = 0; c < im.channels(); c++) { + Internal::convert(im(x, y, c), p[c]); + } + } + } + if (!check(fwrite((void *) data.data(), sizeof(uint8_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not write PPM 8-bit data\n")) return false; + } else if (bit_depth == 16) { + int little_endian = Internal::is_little_endian(); + std::vector data(width*height*3); + for (int y = 0; y < im.height(); y++) { + for (int x = 0; x < im.width(); x++) { + uint16_t *p = &data[(y*width+x)*3]; + for (int c = 0; c < im.channels(); c++) { + uint16_t value; + Internal::convert(im(x, y, c), value); + Internal::swap_endian_16(little_endian, value); + p[c] = value; + } + } + } + if (!check(fwrite((void *) data.data(), sizeof(uint16_t), width*height*3, f.f) == (size_t) (width*height*3), "Could not write PPM 16-bit data\n")) return false; + } else { + return check(false, "We only support saving 8- and 16-bit images."); + } + return true; +} + +// Returns false upon failure. +template +bool load(const std::string &filename, ImageType *im) { + if (Internal::ends_with_ignore_case(filename, ".ppm")) { + return load_ppm(filename, im); + } else if (Internal::ends_with_ignore_case(filename, ".bytes")) { + return (load_bytes(filename, im)); + } else { + return check(false, "[load] unsupported file extension (bytes|ppm supported)"); + } +} + +// Returns false upon failure. +template +bool save(ImageType &im, const std::string &filename) { + if (Internal::ends_with_ignore_case(filename, ".ppm")) { + return save_ppm(im, filename); + } else if (Internal::ends_with_ignore_case(filename, ".bytes")) { + return save_bytes(im, filename); + } else { + return check(false, "[save] unsupported file extension (bytes|ppm supported)"); + } +} + +// Fancy wrapper to call load() with CheckFail, inferring the return type; +// this allows you to simply use +// +// Image im = load_image("filename"); +// +// without bothering to check error results (all errors simply abort). +class load_image { +public: + load_image(const std::string &f) : filename(f) {} + template + inline operator ImageType() { + ImageType im; + (void) load(filename, &im); + return im; + } +private: + const std::string filename; +}; + +// Fancy wrapper to call save() with CheckFail; this allows you to simply use +// +// save_image(im, "filename"); +// +// without bothering to check error results (all errors simply abort). +template +void save_image(ImageType &im, const std::string &filename) { + (void) save(im, filename); +} + +} // namespace Tools +} // namespace Halide + +#endif // HALIDE_IMAGE_IO_H Index: test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/CMakeLists.txt =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/CMakeLists.txt +++ test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/CMakeLists.txt @@ -0,0 +1,14 @@ +file(GLOB bcsources ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86_halide_runtime.bc ${CMAKE_CURRENT_SOURCE_DIR}/local_laplacian.bc) +SET_SOURCE_FILES_PROPERTIES(${bcsources} PROPERTIES LANGUAGE CXX) + +set(Source ${CMAKE_CURRENT_SOURCE_DIR}/driver.cpp ${bcsources}) +set(PROG halide_local_laplacian) + +test_img_input(rgb 8 1 1 10) +test_img_input(rgba 8 1 1 10) + +llvm_multisource() + + + + Index: test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/driver.cpp =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/driver.cpp +++ test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/driver.cpp @@ -0,0 +1,33 @@ +#include "../common/benchmark.h" +#include "../common/halide_image.h" +#include "../common/halide_image_io.h" +#include "local_laplacian.h" + +using namespace Halide::Tools; + +int main(int argc, char **argv) { + if (argc < 7) { + printf("Usage: ./process input.png levels alpha beta timing_iterations output.png\n" + "e.g.: ./process input.png 8 1 1 10 output.png\n"); + return 0; + } + + Image input = load_image(argv[1]); + int levels = atoi(argv[2]); + float alpha = atof(argv[3]), beta = atof(argv[4]); + Image output(input.width(), input.height(), 3); + int timing = atoi(argv[5]); + + local_laplacian(levels, alpha/(levels-1), beta, input, output); + + // Timing code + double best = benchmark(timing, 1, [&]() { + local_laplacian(levels, alpha/(levels-1), beta, input, output); + }); + printf("%gus\n", best * 1e6); + + save_image(output, argv[6]); + + return 0; +} + Index: test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/local_laplacian.h =================================================================== --- test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/local_laplacian.h +++ test-suite/trunk/Bitcode/Benchmarks/Halide/local_laplacian/local_laplacian.h @@ -0,0 +1,42 @@ +#ifndef HALIDE__local_laplacian_h +#define HALIDE__local_laplacian_h +#ifndef HALIDE_ATTRIBUTE_ALIGN + #ifdef _MSC_VER + #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x)) + #else + #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x))) + #endif +#endif +#ifndef BUFFER_T_DEFINED +#define BUFFER_T_DEFINED +#include +#include +typedef struct buffer_t { + uint64_t dev; + uint8_t* host; + int32_t extent[4]; + int32_t stride[4]; + int32_t min[4]; + int32_t elem_size; + HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty; + HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)]; +} buffer_t; +#endif +struct halide_filter_metadata_t; +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif +#ifdef __cplusplus +extern "C" { +#endif + +int local_laplacian(int32_t _levels, float _alpha, float _beta, buffer_t *_input_buffer, buffer_t *_local_laplacian_buffer) HALIDE_FUNCTION_ATTRS; +int local_laplacian_argv(void **args) HALIDE_FUNCTION_ATTRS; +// Result is never null and points to constant static data +const struct halide_filter_metadata_t *local_laplacian_metadata() HALIDE_FUNCTION_ATTRS; + +#ifdef __cplusplus +} // extern "C" +#endif +#endif Index: test-suite/trunk/Bitcode/CMakeLists.txt =================================================================== --- test-suite/trunk/Bitcode/CMakeLists.txt +++ test-suite/trunk/Bitcode/CMakeLists.txt @@ -8,6 +8,7 @@ set(TEST_SUITE_ENABLE_BITCODE_TESTS ${ENABLE_BITCODE_DEFAULT} CACHE BOOL "Enable bitcode tests") if(TEST_SUITE_ENABLE_BITCODE_TESTS) + llvm_add_subdirectories(Benchmarks) if(NOT TEST_SUITE_BENCHMARKING_ONLY) llvm_add_subdirectories(Regression) if(ARCH STREQUAL "x86" OR ARCH STREQUAL "AArch64" OR ARCH STREQUAL "ARM")