Index: test-suite/trunk/External/CUDA/CMakeLists.txt
===================================================================
--- test-suite/trunk/External/CUDA/CMakeLists.txt
+++ test-suite/trunk/External/CUDA/CMakeLists.txt
@@ -87,6 +87,7 @@
   create_one_local_test(empty empty.cu)
   create_one_local_test(printf printf.cu)
   create_one_local_test(future future.cu)
+  create_one_local_test(simd simd.cu)
 endmacro()
 
 macro(thrust_make_test_name TestName TestSourcePath)
Index: test-suite/trunk/External/CUDA/simd.cu
===================================================================
--- test-suite/trunk/External/CUDA/simd.cu
+++ test-suite/trunk/External/CUDA/simd.cu
@@ -0,0 +1,455 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <complex>  // Needed for std::min and max to work on device.
+#include <limits>
+
+int verbose = 0;
+
+#if __cplusplus >= 201103L
+#include <type_traits>
+
+// Convert a function into a functor with two arguments.  We rely on SFINAE to
+// instantiate a function template call() which will invoke FUNC() with one or
+// two arguments.
+#define F(FUNC, NELTS, NARGS)                                              \
+  typedef struct FUNC##_f {                                                \
+    static const int num_args = NARGS;                                     \
+    static const int num_elts = NELTS;                                     \
+    template <typename T, int NA = num_args>                               \
+    __device__ static typename std::enable_if<NA == 1, unsigned int>::type \
+    call(T a, T b) {                                                       \
+      return FUNC(a);                                                      \
+    }                                                                      \
+    template <typename T, int NA = num_args>                               \
+    __device__ static typename std::enable_if<NA == 2, unsigned int>::type \
+    call(T a, T b) {                                                       \
+      return FUNC(a, b);                                                   \
+    }                                                                      \
+  } FUNC##_f
+
+template <int N, typename T>
+__device__ unsigned int pack(T a[N]) {
+  unsigned int mask = (N == 2) ? 0xffff : 0xff;
+  unsigned int shift = (N == 2) ? 16 : 8;
+  unsigned int r = 0;
+  for (int i = 0; i < N; ++i) {
+    r |= ((unsigned int)a[i] & mask) << (shift * i);
+  }
+  return r;
+}
+template <int N, typename T>
+__device__ void unpack(unsigned int r, T (&a)[N]) {
+  unsigned int mask = (N == 2) ? 0xffff : 0xff;
+  unsigned int shift = (N == 2) ? 16 : 8;
+  for (int i = 0; i < N; ++i) {
+    a[i] = ((r >> (shift * i)) & mask);
+  }
+}
+
+enum op_t {
+  OP_ABS,
+  OP_ABSDIFF,
+  OP_ABSS,
+  OP_ADD,
+  OP_ADDS,
+  OP_AVG,
+  OP_CMPEQ,
+  OP_CMPGE,
+  OP_CMPGT,
+  OP_CMPLE,
+  OP_CMPLT,
+  OP_CMPNE,
+  OP_HADD,
+  OP_MAX,
+  OP_MIN,
+  OP_NEG,
+  OP_SAD,
+  OP_SETEQ,
+  OP_SETGE,
+  OP_SETGT,
+  OP_SETLE,
+  OP_SETLT,
+  OP_SETNE,
+  OP_SUB,
+  OP_SUBS,
+  OP_LAST
+};
+
+template <enum op_t OP, typename T>
+__device__ inline T elt_op(T a, T b = INT_MIN) {
+  switch (OP) {
+    case OP_ABS:
+      if (!std::numeric_limits<T>::is_signed) return a;
+      // This is wrong, but that's what __vabsN() returns.  We also need to
+      // handle that because abs(std::numeric_limits<T>::min()) would be an
+      // undefined behavior otherwise.
+      if (a == std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return (a >= 0) ? a : -a;
+
+    case OP_ABSDIFF:
+      return std::abs(a - b);
+    case OP_ABSS: {
+      int result = std::abs(a);
+      if (result > std::numeric_limits<T>::max())
+        return std::numeric_limits<T>::max();
+      return result;
+    }
+    case OP_ADD:
+      return a + b;
+    case OP_ADDS: {
+      int result = (int)a + (int)b;
+      if (result > std::numeric_limits<T>::max())
+        return std::numeric_limits<T>::max();
+      if (std::numeric_limits<T>::is_signed &&
+          result < std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return result;
+    }
+    case OP_AVG:
+      // This is *rounded* average. For simplicity let FP do the
+      // rounding. Considering that T is byte or short, we're guaranteed not to
+      // lose any bits.
+      return round(((float)a + (float)b) / 2.0f);
+    case OP_CMPEQ:
+      return a == b ? -1 : 0;
+    case OP_CMPGE:
+      return a >= b ? -1 : 0;
+    case OP_CMPGT:
+      return a > b ? -1 : 0;
+    case OP_CMPLE:
+      return a <= b ? -1 : 0;
+    case OP_CMPLT:
+      return a < b ? -1 : 0;
+    case OP_CMPNE:
+      return a != b ? -1 : 0;
+    case OP_HADD:
+      return (a + b) / 2;
+    case OP_MAX:
+      return std::max(a, b);
+    case OP_MIN:
+      return std::min(a, b);
+    case OP_NEG:
+      // This is wrong, but that's what __vnegN() returns.  We also need to
+      // handle that because abs(std::numeric_limits<T>::min()) would be an
+      // undefined behavior otherwise.
+      if (std::numeric_limits<T>::is_signed &&
+          a == std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return -a;
+    case OP_SAD:
+      return std::abs(a - b);  // need to sum per-element results later.
+    case OP_SETEQ:
+      return a == b ? 1 : 0;
+    case OP_SETGE:
+      return a >= b ? 1 : 0;
+    case OP_SETGT:
+      return a > b ? 1 : 0;
+    case OP_SETLE:
+      return a <= b ? 1 : 0;
+    case OP_SETLT:
+      return a < b ? 1 : 0;
+    case OP_SETNE:
+      return a != b ? 1 : 0;
+    case OP_SUB:
+      return a - b;
+    case OP_SUBS: {
+      int result = (int)a - (int)b;
+      if (result > std::numeric_limits<T>::max())
+        return std::numeric_limits<T>::max();
+      if (result < std::numeric_limits<T>::min())
+        return std::numeric_limits<T>::min();
+      return result;
+    }
+    default:
+      assert(false && "unknown OP");
+  }
+  assert(false && "Unreachable.");
+  return 0;
+}
+
+template <op_t OP, typename T, int N>
+__device__ void simd_op(T (&r)[N], T a[N], T b[N]) {
+  if (OP == OP_SAD) {
+    // Sum up all elements in r[0] and clear the rest of r.
+    int result = 0;
+    for (int i = 0; i < N; ++i) {
+      result += elt_op<OP, T>(a[i], b[i]);
+      r[i] = 0;
+    }
+    r[0] = result;
+  } else {
+    // Just an element-wise op.
+    for (int i = 0; i < N; ++i) {
+      r[i] = elt_op<OP, T>(a[i], b[i]);
+    }
+  }
+}
+
+template <op_t OP, class SIMD_OP, typename T>
+__device__ void test_func(int verbose, int a, int b) {
+  constexpr int N = SIMD_OP::num_elts;
+  int dummy_args[] = {0,
+                      1,
+                      -1,
+                      std::numeric_limits<T>::max(),
+                      std::numeric_limits<T>::max() - 1,
+                      std::numeric_limits<T>::min(),
+                      std::numeric_limits<T>::min() + 1};
+  for (T x : dummy_args) {
+    for (int e = 0; e < N; ++e) {
+      T args_a[N];
+      T args_b[N];
+      for (int i = 0; i < N; ++i) {
+        args_a[i] = x;
+        args_b[i] = x;
+      }
+      args_a[e] = a;
+      args_b[e] = b;
+      unsigned int va = pack<N, T>(args_a);
+      unsigned int vb = pack<N, T>(args_b);
+      T expected_r[N];
+      simd_op<OP, T>(expected_r, args_a, args_b);
+      unsigned int evr = pack<N, T>(expected_r);
+      // This is weird and I don't understand what's going on.  With T = short,
+      // compiler ends up generating code which triggers the assert below
+      // if verbose == false, but triggers no assert if verbose == 1. It may be
+      // due to an undefined behavior somewhere, but the same code (with SIMD_OP
+      // below replaced with a pack(simd_op(a,b)) (so it could run on host)
+      // triggerend no ubsan reports.
+      asm volatile("" ::: "memory");
+      unsigned int vr = SIMD_OP::call(va, vb);
+      if (verbose && vr != evr) {
+        printf("e=%d a=%d b=%d va=%08x vb=%08x vr=%08x expected vr=%08x\n", e,
+               a, b, va, vb, vr, evr);
+      }
+      assert((vr == evr) && "Value mismatch");
+    }
+  }
+}
+
+template <op_t OP, class SIMD_OP, typename T>
+__global__ void test_kernel(int verbose) {
+  int a = blockIdx.x * blockDim.x + threadIdx.x;
+  int b = blockIdx.y * blockDim.y + threadIdx.y;
+  test_func<OP, SIMD_OP, T>(verbose, a, b);
+}
+
+template <op_t OP, class SIMD_OP, typename T>
+void test_op() {
+  int elements_a = SIMD_OP::num_elts == 2 ? 0x10000 : 0x100;
+  // Collapse second dimension if we test single-operand function.
+  int elements_b = SIMD_OP::num_args == 2 ? elements_a : 0;
+  dim3 grid_size(elements_a / 32, elements_b ? elements_b / 32 : 1, 1);
+  dim3 block_size(32, elements_b ? 32 : 1, 1);
+  printf("Testing %s...", __PRETTY_FUNCTION__);
+  test_kernel<OP, SIMD_OP, T><<<grid_size, block_size>>>(verbose);
+  cudaError_t err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    printf("%s failed\n", __PRETTY_FUNCTION__);
+    printf("CUDA error %d\n", (int)err);
+    exit(EXIT_FAILURE);
+  } else {
+    printf("OK\n");
+  }
+}
+
+// Define functor types which we can then use to parametrize device-side tests.
+// F(function, num-elements, num-args)
+F(__vabs2, 2, 1);
+F(__vabs4, 4, 1);
+F(__vabsdiffs2, 2, 2);
+F(__vabsdiffs4, 4, 2);
+F(__vabsdiffu2, 2, 2);
+F(__vabsdiffu4, 4, 2);
+F(__vabsss2, 2, 1);
+F(__vabsss4, 4, 1);
+F(__vadd2, 2, 2);
+F(__vadd4, 4, 2);
+F(__vaddss2, 2, 2);
+F(__vaddus2, 2, 2);
+F(__vaddss4, 4, 2);
+F(__vaddus4, 4, 2);
+F(__vavgs2, 2, 2);
+F(__vavgu2, 2, 2);
+F(__vavgs4, 4, 2);
+F(__vavgu4, 4, 2);
+F(__vcmpeq2, 2, 2);
+F(__vcmpeq4, 4, 2);
+F(__vcmpges2, 2, 2);
+F(__vcmpges4, 4, 2);
+F(__vcmpgeu2, 2, 2);
+F(__vcmpgeu4, 4, 2);
+F(__vcmpgts2, 2, 2);
+F(__vcmpgts4, 4, 2);
+F(__vcmpgtu2, 2, 2);
+F(__vcmpgtu4, 4, 2);
+F(__vcmples2, 2, 2);
+F(__vcmples4, 4, 2);
+F(__vcmpleu2, 2, 2);
+F(__vcmpleu4, 4, 2);
+F(__vcmplts2, 2, 2);
+F(__vcmplts4, 4, 2);
+F(__vcmpltu2, 2, 2);
+F(__vcmpltu4, 4, 2);
+F(__vcmpne2, 2, 2);
+F(__vcmpne4, 4, 2);
+F(__vhaddu2, 2, 2);
+F(__vhaddu4, 4, 2);
+F(__vmaxs2, 2, 2);
+F(__vmaxs4, 4, 2);
+F(__vmaxu2, 2, 2);
+F(__vmaxu4, 4, 2);
+F(__vmins2, 2, 2);
+F(__vmins4, 4, 2);
+F(__vminu2, 2, 2);
+F(__vminu4, 4, 2);
+F(__vneg2, 2, 1);
+F(__vneg4, 4, 1);
+F(__vsads2, 2, 2);
+F(__vsadu2, 2, 2);
+F(__vsads4, 4, 2);
+F(__vsadu4, 4, 2);
+F(__vseteq2, 2, 2);
+F(__vseteq4, 4, 2);
+F(__vsetges2, 2, 2);
+F(__vsetges4, 4, 2);
+F(__vsetgeu2, 2, 2);
+F(__vsetgeu4, 4, 2);
+F(__vsetgts2, 2, 2);
+F(__vsetgts4, 4, 2);
+F(__vsetgtu2, 2, 2);
+F(__vsetgtu4, 4, 2);
+F(__vsetles2, 2, 2);
+F(__vsetles4, 4, 2);
+F(__vsetleu2, 2, 2);
+F(__vsetleu4, 4, 2);
+F(__vsetlts2, 2, 2);
+F(__vsetlts4, 4, 2);
+F(__vsetltu2, 2, 2);
+F(__vsetltu4, 4, 2);
+F(__vsetne2, 2, 2);
+F(__vsetne4, 4, 2);
+F(__vsub2, 2, 2);
+F(__vsub4, 4, 2);
+F(__vsubss2, 2, 2);
+F(__vsubus2, 2, 2);
+F(__vsubss4, 4, 2);
+F(__vsubus4, 4, 2);
+
+void tests() {
+  test_op<OP_NEG, __vneg2_f, short>();
+  test_op<OP_ABS, __vabs2_f, short>();
+  test_op<OP_ABS, __vabs4_f, signed char>();
+  test_op<OP_ABSDIFF, __vabsdiffs2_f, short>();
+  test_op<OP_ABSDIFF, __vabsdiffs4_f, signed char>();
+  test_op<OP_ABSDIFF, __vabsdiffu2_f, unsigned short>();
+  test_op<OP_ABSDIFF, __vabsdiffu4_f, unsigned char>();
+  test_op<OP_ABSS, __vabsss2_f, short>();
+  test_op<OP_ABSS, __vabsss4_f, signed char>();
+  test_op<OP_ADD, __vadd2_f, short>();
+  test_op<OP_ADD, __vadd4_f, signed char>();
+  test_op<OP_ADDS, __vaddss2_f, short>();
+  test_op<OP_ADDS, __vaddss4_f, signed char>();
+  test_op<OP_ADDS, __vaddus2_f, unsigned short>();
+  test_op<OP_ADDS, __vaddus4_f, unsigned char>();
+  test_op<OP_AVG, __vavgs2_f, short>();
+  test_op<OP_AVG, __vavgs4_f, signed char>();
+  test_op<OP_AVG, __vavgu2_f, unsigned short>();
+  test_op<OP_AVG, __vavgu4_f, unsigned char>();
+  test_op<OP_CMPEQ, __vcmpeq2_f, short>();
+  test_op<OP_CMPEQ, __vcmpeq4_f, signed char>();
+  test_op<OP_CMPGE, __vcmpges2_f, short>();
+  test_op<OP_CMPGE, __vcmpges4_f, signed char>();
+  test_op<OP_CMPGE, __vcmpgeu2_f, unsigned short>();
+  test_op<OP_CMPGE, __vcmpgeu4_f, unsigned char>();
+  test_op<OP_CMPGT, __vcmpgts2_f, short>();
+  test_op<OP_CMPGT, __vcmpgts4_f, signed char>();
+  test_op<OP_CMPGT, __vcmpgtu2_f, unsigned short>();
+  test_op<OP_CMPGT, __vcmpgtu4_f, unsigned char>();
+  test_op<OP_CMPLE, __vcmples2_f, short>();
+  test_op<OP_CMPLE, __vcmples4_f, signed char>();
+  test_op<OP_CMPLE, __vcmpleu2_f, unsigned short>();
+  test_op<OP_CMPLE, __vcmpleu4_f, unsigned char>();
+  test_op<OP_CMPLT, __vcmplts2_f, short>();
+  test_op<OP_CMPLT, __vcmplts4_f, signed char>();
+  test_op<OP_CMPLT, __vcmpltu2_f, unsigned short>();
+  test_op<OP_CMPLT, __vcmpltu4_f, unsigned char>();
+  test_op<OP_CMPNE, __vcmpne2_f, short>();
+  test_op<OP_CMPNE, __vcmpne4_f, signed char>();
+  test_op<OP_HADD, __vhaddu2_f, unsigned short>();
+  test_op<OP_HADD, __vhaddu4_f, unsigned char>();
+  test_op<OP_MAX, __vmaxs2_f, short>();  // ??? Fails?
+  test_op<OP_MAX, __vmaxs4_f, signed char>();
+  test_op<OP_MAX, __vmaxu2_f, unsigned short>();
+  test_op<OP_MAX, __vmaxu4_f, unsigned char>();
+  test_op<OP_MIN, __vmins2_f, short>();
+  test_op<OP_MIN, __vmins4_f, signed char>();
+  test_op<OP_MIN, __vminu2_f, unsigned short>();
+  test_op<OP_MIN, __vminu4_f, unsigned char>();
+  test_op<OP_NEG, __vneg2_f, short>();
+  test_op<OP_NEG, __vneg4_f, signed char>();
+  test_op<OP_SAD, __vsads2_f, short>();
+  test_op<OP_SAD, __vsads4_f, signed char>();
+  test_op<OP_SAD, __vsadu2_f, unsigned short>();
+  test_op<OP_SAD, __vsadu4_f, unsigned char>();
+  test_op<OP_SETEQ, __vseteq2_f, short>();
+  test_op<OP_SETEQ, __vseteq4_f, signed char>();
+  test_op<OP_SETGE, __vsetges2_f, short>();
+  test_op<OP_SETGE, __vsetges4_f, signed char>();
+  test_op<OP_SETGE, __vsetgeu2_f, unsigned short>();
+  test_op<OP_SETGE, __vsetgeu4_f, unsigned char>();
+  test_op<OP_SETGT, __vsetgts2_f, short>();
+  test_op<OP_SETGT, __vsetgts4_f, signed char>();
+  test_op<OP_SETGT, __vsetgtu2_f, unsigned short>();
+  test_op<OP_SETGT, __vsetgtu4_f, unsigned char>();
+  test_op<OP_SETLE, __vsetles2_f, short>();
+  test_op<OP_SETLE, __vsetles4_f, signed char>();
+  test_op<OP_SETLE, __vsetleu2_f, unsigned short>();
+  test_op<OP_SETLE, __vsetleu4_f, unsigned char>();
+  test_op<OP_SETLT, __vsetlts2_f, short>();
+  test_op<OP_SETLT, __vsetlts4_f, signed char>();
+  test_op<OP_SETLT, __vsetltu2_f, unsigned short>();
+  test_op<OP_SETLT, __vsetltu4_f, unsigned char>();
+  test_op<OP_SETNE, __vsetne2_f, short>();
+  test_op<OP_SETNE, __vsetne4_f, signed char>();
+  test_op<OP_SUB, __vsub2_f, short>();
+  test_op<OP_SUB, __vsub4_f, signed char>();
+  test_op<OP_SUBS, __vsubss2_f, short>();
+  test_op<OP_SUBS, __vsubss4_f, signed char>();
+  test_op<OP_SUBS, __vsubus2_f, unsigned short>();
+  test_op<OP_SUBS, __vsubus4_f, unsigned char>();
+}
+#else  // !C++11
+void tests() {
+  // These tests need C++11 to compile.
+}
+#endif
+
+int main(int argc, char** argv) {
+  int opt;
+  while ((opt = getopt(argc, argv, "v")) != -1) {
+    switch (opt) {
+      case 'v':
+        verbose = 1;
+        break;
+      default: /* '?' */
+        fprintf(stderr, "Usage: %s [-v]\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+  }
+
+  tests();
+  printf("Success!\n");
+  return 0;
+}