Index: libcxx/include/__config
===================================================================
--- libcxx/include/__config
+++ libcxx/include/__config
@@ -1275,6 +1275,10 @@
 # endif
 #endif // defined(_LIBCPP_ABI_MICROSOFT) && !defined(_LIBCPP_BUILDING_LIBRARY)
 
+#if !defined(_LIBCPP_COMPILER_CLANG) && !defined(_LIBCPP_COMPILER_GCC)
+#define _LIBCPP_HAS_NO_VECTOR_EXTENSION
+#endif // !defined(_LIBCPP_COMPILER_CLANG) && !defined(_LIBCPP_COMPILER_GCC)
+
 #endif // __cplusplus
 
 #endif // _LIBCPP_CONFIG
Index: libcxx/include/experimental/__config
===================================================================
--- libcxx/include/experimental/__config
+++ libcxx/include/experimental/__config
@@ -66,4 +66,11 @@
 #define _LIBCPP_END_NAMESPACE_EXPERIMENTAL_SIMD_ABI \
     } _LIBCPP_END_NAMESPACE_EXPERIMENTAL_SIMD
 
+// TODO: support more targets. Only AVX is detected; everything else uses 16.
+#if defined(__AVX__)
+#define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32
+#else
+#define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16
+#endif
+
 #endif
Index: libcxx/include/experimental/simd
===================================================================
--- libcxx/include/experimental/simd
+++ libcxx/include/experimental/simd
@@ -589,6 +589,7 @@
 */
 
 #include <experimental/__config>
+#include <algorithm>
 #include <array>
 #include <cstddef>
 #include <functional>
@@ -602,25 +603,191 @@
 enum class _StorageKind {
   _Scalar,
   _Array,
+  _VecExt,
 };
 
 template <_StorageKind __kind, int _Np>
 struct __simd_abi {};
 
 template <class _Tp, class _Abi>
-struct __simd_storage_traits {};
+class __simd_storage {};
 
 template <class _Tp, int __num_element>
-struct __simd_storage_traits<_Tp,
-                             __simd_abi<_StorageKind::_Array, __num_element>> {
-  using type = std::array<_Tp, __num_element>;
+class __simd_storage<_Tp, __simd_abi<_StorageKind::_Array, __num_element>> {
+  std::array<_Tp, __num_element> __storage_; // one array slot per element
+
+public:
+  using reference = _Tp&; // array slots are addressable: a plain reference works
+
+  _Tp __get(size_t __index) const { return __storage_[__index]; } // by value
+  reference __ref(size_t __index) { return __storage_[__index]; } // writable
 };
 
 template <class _Tp>
-struct __simd_storage_traits<_Tp, __simd_abi<_StorageKind::_Scalar, 1>> {
-  using type = _Tp;
+class __simd_storage<_Tp, __simd_abi<_StorageKind::_Scalar, 1>> {
+  _Tp __storage_; // the single element of a scalar ABI
+
+public:
+  using reference = _Tp&;
+
+  _Tp __get(size_t __index) const { return (&__storage_)[__index]; } // only index 0 is valid
+  reference __ref(size_t __index) { return (&__storage_)[__index]; } // only index 0 is valid
+};
+
+#ifndef _LIBCPP_HAS_NO_VECTOR_EXTENSION
+
+constexpr size_t __floor_pow_of_2(size_t __val) { // largest power of two <= __val (0 for 0)
+  return ((__val - 1) & __val) == 0 ? __val // at most one bit set: done
+                                    : __floor_pow_of_2((__val - 1) & __val); // clear lowest set bit, retry
+}
+
+constexpr size_t __ceil_pow_of_2(size_t __val) { // smallest power of two >= __val, for __val >= 1
+  return __val == 1 ? 1 : __floor_pow_of_2(__val - 1) << 1;
+}
+
+template <class _Tp, size_t __bytes>
+struct __vec_ext_traits {};
+
+#define _SPECIALIZE_VEC_EXT(_TYPE, _NUM_ELEMENT)                               \
+  template <>                                                                  \
+  struct __vec_ext_traits<_TYPE, sizeof(_TYPE) * _NUM_ELEMENT> {               \
+    using type =                                                               \
+        _TYPE __attribute__((vector_size(sizeof(_TYPE) * _NUM_ELEMENT)));      \
+  }
+
+#define _SPECIALIZE_VEC_EXT_32(_TYPE)                                          \
+  _SPECIALIZE_VEC_EXT(_TYPE, 1);                                               \
+  _SPECIALIZE_VEC_EXT(_TYPE, 2);                                               \
+  _SPECIALIZE_VEC_EXT(_TYPE, 4);                                               \
+  _SPECIALIZE_VEC_EXT(_TYPE, 8);                                               \
+  _SPECIALIZE_VEC_EXT(_TYPE, 16);                                              \
+  _SPECIALIZE_VEC_EXT(_TYPE, 32)
+
+_SPECIALIZE_VEC_EXT_32(char);
+_SPECIALIZE_VEC_EXT_32(char16_t);
+_SPECIALIZE_VEC_EXT_32(char32_t);
+_SPECIALIZE_VEC_EXT_32(wchar_t);
+_SPECIALIZE_VEC_EXT_32(signed char);
+_SPECIALIZE_VEC_EXT_32(signed short);
+_SPECIALIZE_VEC_EXT_32(signed int);
+_SPECIALIZE_VEC_EXT_32(signed long);
+_SPECIALIZE_VEC_EXT_32(signed long long);
+_SPECIALIZE_VEC_EXT_32(unsigned char);
+_SPECIALIZE_VEC_EXT_32(unsigned short);
+_SPECIALIZE_VEC_EXT_32(unsigned int);
+_SPECIALIZE_VEC_EXT_32(unsigned long);
+_SPECIALIZE_VEC_EXT_32(unsigned long long);
+_SPECIALIZE_VEC_EXT_32(float);
+_SPECIALIZE_VEC_EXT_32(double);
+_SPECIALIZE_VEC_EXT_32(long double);
+
+#undef _SPECIALIZE_VEC_EXT_32
+#undef _SPECIALIZE_VEC_EXT
+
+template <class _Tp, int __num_element>
+class __simd_storage<_Tp, __simd_abi<_StorageKind::_VecExt, __num_element>> {
+  using _StorageType = // vector type whose byte size is rounded up to a power of two
+      typename __vec_ext_traits<_Tp, __ceil_pow_of_2(sizeof(_Tp) *
+                                                     __num_element)>::type;
+
+  _StorageType __storage_;
+
+public:
+  class reference { // proxy for element __index_ of *__ptr_
+    template <class, class>
+    friend class __simd_storage;
+    template <class, class>
+    friend class simd;
+
+    __simd_storage* __ptr_;
+    size_t __index_;
+
+    reference(__simd_storage* __ptr, size_t __index) // private: only the friends above create one
+        : __ptr_(__ptr), __index_(__index) {}
+    reference(const reference&) = default;
+
+  public:
+    reference() = delete;
+    reference& operator=(const reference&) = delete;
+
+    operator _Tp() const { return __ptr_->__storage_[__index_]; } // read the element
+
+    reference operator=(_Tp __value) && { // mutators are &&-qualified: rvalue proxies only
+      __ptr_->__storage_[__index_] = __value;
+      return *this;
+    }
+
+    reference operator++() && {
+      ++__ptr_->__storage_[__index_];
+      return *this;
+    }
+
+    _Tp operator++(int)&& { return __ptr_->__storage_[__index_]++; }
+
+    reference operator--() && {
+      --__ptr_->__storage_[__index_];
+      return *this;
+    }
+
+    _Tp operator--(int)&& { return __ptr_->__storage_[__index_]--; }
+
+    reference operator+=(_Tp __value) && {
+      __ptr_->__storage_[__index_] += __value;
+      return *this;
+    }
+
+    reference operator-=(_Tp __value) && {
+      __ptr_->__storage_[__index_] -= __value;
+      return *this;
+    }
+
+    reference operator*=(_Tp __value) && {
+      __ptr_->__storage_[__index_] *= __value;
+      return *this;
+    }
+
+    reference operator/=(_Tp __value) && {
+      __ptr_->__storage_[__index_] /= __value;
+      return *this;
+    }
+
+    reference operator%=(_Tp __value) && {
+      __ptr_->__storage_[__index_] %= __value;
+      return *this;
+    }
+
+    reference operator>>=(_Tp __value) && {
+      __ptr_->__storage_[__index_] >>= __value;
+      return *this;
+    }
+
+    reference operator<<=(_Tp __value) && {
+      __ptr_->__storage_[__index_] <<= __value;
+      return *this;
+    }
+
+    reference operator&=(_Tp __value) && {
+      __ptr_->__storage_[__index_] &= __value;
+      return *this;
+    }
+
+    reference operator|=(_Tp __value) && {
+      __ptr_->__storage_[__index_] |= __value;
+      return *this;
+    }
+
+    reference operator^=(_Tp __value) && {
+      __ptr_->__storage_[__index_] ^= __value;
+      return *this;
+    }
+  };
+
+  _Tp __get(size_t __index) const { return __storage_[__index]; } // element by value
+  reference __ref(size_t __index) { return reference(this, __index); } // writable proxy
 };
 
+#endif // _LIBCPP_HAS_NO_VECTOR_EXTENSION
+
 template <class _To, class _From>
 constexpr auto __is_non_narrowing_convertible_impl(_From a[[gnu::unused]])
     -> decltype(_To{a}, true) {
@@ -658,6 +825,17 @@
   return static_cast<_Tp>(__first) + __variadic_sum<_Tp>(__rest...);
 }
 
+template <class _Tp>
+struct __nodeduce {
+  using type = _Tp;
+};
+
+template <class _Tp>
+constexpr bool __vectorizable() {
+  return std::is_arithmetic<_Tp>::value && !std::is_const<_Tp>::value &&
+         !std::is_volatile<_Tp>::value && !std::is_same<_Tp, bool>::value;
+}
+
 _LIBCPP_END_NAMESPACE_EXPERIMENTAL_SIMD
 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL_SIMD_ABI
 
@@ -671,10 +849,18 @@
 template <class _Tp>
 inline constexpr int max_fixed_size = 32;
 #endif
+
 template <class _Tp>
 using compatible = fixed_size<16 / sizeof(_Tp)>;
+
 template <class _Tp>
-using native = compatible<_Tp>;
+using native =
+#ifndef _LIBCPP_HAS_NO_VECTOR_EXTENSION
+    __simd_abi<_StorageKind::_VecExt,
+               _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>;
+#else // no vector extension: fixed_size ABI spanning the same number of bytes
+    fixed_size<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>;
+#endif // _LIBCPP_HAS_NO_VECTOR_EXTENSION
 
 _LIBCPP_END_NAMESPACE_EXPERIMENTAL_SIMD_ABI
 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL_SIMD
@@ -765,6 +951,7 @@
       "Element type should be vectorizable");
 };
 
+// TODO: implement memory_alignment (only declared below).
 template <class _Tp, class _Up = typename _Tp::value_type>
 struct memory_alignment;
 
@@ -913,11 +1100,6 @@
 class where_expression;
 
 // masked assignment [simd.mask.where]
-template <class _Tp>
-struct __nodeduce {
-  using type = _Tp;
-};
-
 template <class _Tp, class _Abi>
 where_expression<simd_mask<_Tp, _Abi>, simd<_Tp, _Abi>>
 where(const typename simd<_Tp, _Abi>::mask_type&, simd<_Tp, _Abi>&) noexcept;
@@ -1054,7 +1236,23 @@
 // TODO: implement simd
 template <class _Tp, class _Abi>
 class simd {
+public:
+  using value_type = _Tp;
+  using reference = typename __simd_storage<_Tp, _Abi>::reference;
+  using mask_type = simd_mask<_Tp, _Abi>;
+  using abi_type = _Abi;
+
+  simd() = default;
+  simd(const simd&) = default;
+  simd& operator=(const simd&) = default;
+
+  static constexpr size_t size() noexcept {
+    return simd_size<_Tp, _Abi>::value;
+  }
+
 private:
+  __simd_storage<_Tp, _Abi> __s_;
+
   template <class _Up>
   static constexpr bool __can_broadcast() {
     return (std::is_arithmetic<_Up>::value &&
@@ -1067,57 +1265,97 @@
             std::is_unsigned<_Tp>::value);
   }
 
-public:
-  using value_type = _Tp;
-  // TODO: this is strawman implementation. Turn it into a proxy type.
-  using reference = _Tp&;
-  using mask_type = simd_mask<_Tp, _Abi>;
-
-  using abi_type = _Abi;
+  template <class _Generator, size_t... __indicies>
+  static constexpr decltype(
+      std::forward_as_tuple(std::declval<_Generator>()(
+          std::integral_constant<size_t, __indicies>())...),
+      bool())
+  __can_generate(std::index_sequence<__indicies...>) {
+    return !__variadic_sum<bool>(
+        !__can_broadcast<decltype(std::declval<_Generator>()(
+            std::integral_constant<size_t, __indicies>()))>()...);
+  }
 
-  static constexpr size_t size() noexcept {
-    return simd_size<_Tp, _Abi>::value;
+  template <class _Generator> // fallback: _Generator is not callable with every index
+  static constexpr bool __can_generate(...) { // constexpr: used as a template argument
+    return false;
+  }
 
-  simd() = default;
+  template <class _Generator, size_t... __indicies>
+  void __generator_init(_Generator&& __g, std::index_sequence<__indicies...>) {
+    int __unused[] = {((*this)[__indicies] =
+                           __g(std::integral_constant<size_t, __indicies>()),
+                       0)...};
+    (void)__unused;
+  }
 
+public:
   // implicit type conversion constructor
   template <class _Up,
             class = typename std::enable_if<
                 std::is_same<_Abi, simd_abi::fixed_size<size()>>::value &&
                 __is_non_narrowing_arithmetic_convertible<_Up, _Tp>()>::type>
-  simd(const simd<_Up, simd_abi::fixed_size<size()>>&) {}
+  simd(const simd<_Up, simd_abi::fixed_size<size()>>& __v) {
+    for (size_t __i = 0; __i < size(); __i++) {
+      (*this)[__i] = static_cast<_Tp>(__v[__i]);
+    }
+  }
 
   // implicit broadcast constructor
   template <class _Up,
             class = typename std::enable_if<__can_broadcast<_Up>()>::type>
-  simd(_Up&&);
+  simd(_Up&& __rv) {
+    auto __v = static_cast<_Tp>(__rv);
+    for (size_t __i = 0; __i < size(); __i++) {
+      (*this)[__i] = __v;
+    }
+  }
 
   // generator constructor
-  // TODO: for now only check for the index 0. This is because C++11 doesn't
-  // have index_sequence, and it's hard to check for all indicies without using
-  // index_sequence.
   template <class _Generator,
-            int = decltype(simd(std::declval<_Generator>()(
-                               std::integral_constant<size_t, 0>())),
-                           int())()>
-  explicit simd(_Generator&&);
+            int = typename std::enable_if<
+                __can_generate<_Generator>(std::make_index_sequence<size()>()),
+                int>::type()>
+  explicit simd(_Generator&& __g) {
+    __generator_init(std::forward<_Generator>(__g),
+                     std::make_index_sequence<size()>());
+  }
 
   // load constructor
-  template <class _Up, class _Flags>
-  simd(const _Up*, _Flags);
+  template <
+      class _Up, class _Flags,
+      class = typename std::enable_if<__vectorizable<_Up>()>::type,
+      class = typename std::enable_if<is_simd_flag_type<_Flags>::value>::type>
+  simd(const _Up* __buffer, _Flags) {
+    // TODO: optimize for overaligned flags
+    for (size_t __i = 0; __i < size(); __i++) {
+      (*this)[__i] = static_cast<_Tp>(__buffer[__i]);
+    }
+  }
 
   // loads [simd.load]
   template <class _Up, class _Flags>
-  void copy_from(const _Up*, _Flags);
+  typename std::enable_if<__vectorizable<_Up>() &&
+                          is_simd_flag_type<_Flags>::value>::type
+  copy_from(const _Up* __buffer, _Flags) {
+    *this = simd(__buffer, _Flags());
+  }
 
   // stores [simd.store]
   template <class _Up, class _Flags>
-  void copy_to(_Up*, _Flags) const;
+  typename std::enable_if<__vectorizable<_Up>() &&
+                          is_simd_flag_type<_Flags>::value>::type
+  copy_to(_Up* __buffer, _Flags) const {
+    // TODO: optimize for overaligned flags
+    for (size_t __i = 0; __i < size(); __i++) {
+      __buffer[__i] = static_cast<_Up>((*this)[__i]);
+    }
+  }
 
   // scalar access [simd.subscr]
-  reference operator[](size_t);
-  value_type operator[](size_t) const;
+  reference operator[](size_t __i) { return __s_.__ref(__i); }
+
+  value_type operator[](size_t __i) const { return __s_.__get(__i); }
 
   // unary operators [simd.unary]
   simd& operator++();
Index: libcxx/include/utility
===================================================================
--- libcxx/include/utility
+++ libcxx/include/utility
@@ -833,7 +833,7 @@
 
 #endif
 
-#if _LIBCPP_STD_VER > 11
+#if _LIBCPP_STD_VER >= 11
 
 template<class _Tp, _Tp... _Ip>
 struct _LIBCPP_TEMPLATE_VIS integer_sequence
@@ -886,7 +886,7 @@
 template<class... _Tp>
     using index_sequence_for = make_index_sequence<sizeof...(_Tp)>;
 
-#endif  // _LIBCPP_STD_VER > 11
+#endif  // _LIBCPP_STD_VER >= 11
 
 #if _LIBCPP_STD_VER > 11
 template<class _T1, class _T2 = _T1>
Index: libcxx/test/std/experimental/simd/simd.abi/vector_extension.pass.cpp
===================================================================
--- /dev/null
+++ libcxx/test/std/experimental/simd/simd.abi/vector_extension.pass.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03
+
+// <experimental/simd>
+//
+// [simd.abi]
+
+#include <experimental/simd>
+#include <cstdint>
+
+using namespace std::experimental::parallelism_v2;
+
+constexpr inline int reg_width() {
+#if defined(__AVX__)
+  return 32;
+#else
+  return 16;
+#endif
+}
+
+#ifndef _LIBCPP_HAS_NO_VECTOR_EXTENSION
+
+static_assert(sizeof(simd<char, __simd_abi<_StorageKind::_VecExt, 1>>) == 1,
+              "");
+static_assert(sizeof(simd<char, __simd_abi<_StorageKind::_VecExt, 2>>) == 2,
+              "");
+static_assert(sizeof(simd<char, __simd_abi<_StorageKind::_VecExt, 3>>) == 4,
+              "");
+static_assert(sizeof(simd<char, __simd_abi<_StorageKind::_VecExt, 12>>) == 16,
+              "");
+static_assert(sizeof(simd<int32_t, __simd_abi<_StorageKind::_VecExt, 3>>) == 16,
+              "");
+static_assert(sizeof(simd<int32_t, __simd_abi<_StorageKind::_VecExt, 5>>) == 32,
+              "");
+static_assert(
+    std::is_same<simd_abi::native<int8_t>,
+                 __simd_abi<_StorageKind::_VecExt, reg_width()>>::value,
+    "");
+#else
+static_assert(
+    std::is_same<simd_abi::native<int8_t>,
+                 __simd_abi<_StorageKind::_Array, reg_width()>>::value,
+    "");
+
+#endif
+
+static_assert(std::is_same<simd_abi::compatible<int8_t>,
+                           __simd_abi<_StorageKind::_Array, 16>>::value,
+              "");
+
+int main() {}
Index: libcxx/test/std/experimental/simd/simd.access/default.pass.cpp
===================================================================
--- /dev/null
+++ libcxx/test/std/experimental/simd/simd.access/default.pass.cpp
@@ -0,0 +1,213 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03
+
+// <experimental/simd>
+//
+// scalar access [simd.subscr]
+// reference operator[](size_t);
+// value_type operator[](size_t) const;
+
+#include <experimental/simd>
+#include <cassert>
+#include <cstdint>
+
+using namespace std::experimental::parallelism_v2;
+
+void test_access() {
+  {
+    native_simd<int> a(42), b(4);
+    static_assert(std::is_convertible<decltype(a[0]), int8_t>::value, "");
+
+    assert(a[0] == 42);
+    assert(!a[0] == !42);
+    assert(~a[0] == ~42);
+    assert(+a[0] == +42);
+    assert(-a[0] == -42);
+    assert(a[0] + b[0] == 42 + 4);
+    assert(a[0] - b[0] == 42 - 4);
+    assert(a[0] * b[0] == 42 * 4);
+    assert(a[0] / b[0] == 42 / 4);
+    assert(a[0] % b[0] == 42 % 4);
+    assert(a[0] << b[0] == (42 << 4));
+    assert(a[0] >> b[0] == (42 >> 4));
+    assert(a[0] < b[0] == false);
+    assert(a[0] <= b[0] == false);
+    assert(a[0] > b[0] == true);
+    assert(a[0] >= b[0] == true);
+    assert(a[0] == b[0] == false);
+    assert(a[0] != b[0] == true);
+    assert((a[0] & b[0]) == (42 & 4));
+    assert((a[0] | b[0]) == (42 | 4));
+    assert((a[0] ^ b[0]) == (42 ^ 4));
+    assert((a[0] && b[0]) == true);
+    assert((a[0] || b[0]) == true);
+
+    {
+      auto c = a;
+      ++c[0];
+      assert(c[0] == 42 + 1);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      auto ret = c[0]++;
+      assert(ret == 42);
+      assert(c[0] == 42 + 1);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      --c[0];
+      assert(c[0] == 42 - 1);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      auto ret = c[0]--;
+      assert(ret == 42);
+      assert(c[0] == 42 - 1);
+      assert(c[1] == 42);
+    }
+
+    {
+      auto c = a;
+      c[0] += b[0];
+      assert(c[0] == 42 + 4);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] -= b[0];
+      assert(c[0] == 42 - 4);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] *= b[0];
+      assert(c[0] == 42 * 4);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] /= b[0];
+      assert(c[0] == 42 / 4);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] %= b[0];
+      assert(c[0] == 42 % 4);
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] >>= b[0];
+      assert(c[0] == (42 >> 4));
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] <<= b[0];
+      assert(c[0] == (42 << 4));
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] &= b[0];
+      assert(c[0] == (42 & 4));
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] |= b[0];
+      assert(c[0] == (42 | 4));
+      assert(c[1] == 42);
+    }
+    {
+      auto c = a;
+      c[0] ^= b[0];
+      assert(c[0] == (42 ^ 4));
+      assert(c[1] == 42);
+    }
+
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] += a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] -= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] *= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] /= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] %= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] >>= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] <<= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] &= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] |= a[0]));
+    }
+    {
+      auto c = a;
+      (void)(a[0] + (c[0] ^= a[0]));
+    }
+  }
+  {
+    const native_simd<int> a(42);
+    const native_simd<int> b(4);
+    static_assert(std::is_same<decltype(a[0]), int>::value, "");
+
+    assert(a[0] == 42);
+    assert(!a[0] == !42);
+    assert(~a[0] == ~42);
+    assert(+a[0] == +42);
+    assert(-a[0] == -42);
+    assert(a[0] + b[0] == 42 + 4);
+    assert(a[0] - b[0] == 42 - 4);
+    assert(a[0] * b[0] == 42 * 4);
+    assert(a[0] / b[0] == 42 / 4);
+    assert(a[0] % b[0] == 42 % 4);
+    assert(a[0] << b[0] == (42 << 4));
+    assert(a[0] >> b[0] == (42 >> 4));
+    assert(a[0] < b[0] == false);
+    assert(a[0] <= b[0] == false);
+    assert(a[0] > b[0] == true);
+    assert(a[0] >= b[0] == true);
+    assert(a[0] == b[0] == false);
+    assert(a[0] != b[0] == true);
+    assert((a[0] & b[0]) == (42 & 4));
+    assert((a[0] | b[0]) == (42 | 4));
+    assert((a[0] ^ b[0]) == (42 ^ 4));
+    assert((a[0] && b[0]) == true);
+    assert((a[0] || b[0]) == true);
+  }
+}
+
+int main() { test_access(); }
Index: libcxx/test/std/experimental/simd/simd.casts/simd_cast.pass.cpp
===================================================================
--- libcxx/test/std/experimental/simd/simd.casts/simd_cast.pass.cpp
+++ libcxx/test/std/experimental/simd/simd.casts/simd_cast.pass.cpp
@@ -13,6 +13,7 @@
 //
 // [simd.casts]
 // template <class T, class U, class Abi> see below simd_cast(const simd<U, Abi>&);
+
 #include <experimental/simd>
 #include <cstdint>
 
Index: libcxx/test/std/experimental/simd/simd.cons/broadcast.pass.cpp
===================================================================
--- libcxx/test/std/experimental/simd/simd.cons/broadcast.pass.cpp
+++ libcxx/test/std/experimental/simd/simd.cons/broadcast.pass.cpp
@@ -52,4 +52,26 @@
   not_supported_native_simd_ctor<int>(3.);
 }
 
-int main() {}
+void compile_convertible() {
+  struct ConvertibleToInt {
+    operator int64_t() const;
+  };
+  supported_native_simd_ctor<int64_t>(ConvertibleToInt());
+
+  struct NotConvertibleToInt {};
+  not_supported_native_simd_ctor<int64_t>(NotConvertibleToInt());
+}
+
+void compile_unsigned() {
+  not_supported_native_simd_ctor<int>(3u);
+  supported_native_simd_ctor<uint16_t>(3u);
+}
+
+void test_broadcast() {
+  native_simd<int> a(3);
+  for (size_t i = 0; i < a.size(); i++) {
+    assert(a[i] == 3);
+  }
+}
+
+int main() { test_broadcast(); }
Index: libcxx/test/std/experimental/simd/simd.cons/default.pass.cpp
===================================================================
--- /dev/null
+++ libcxx/test/std/experimental/simd/simd.cons/default.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03
+
+// <experimental/simd>
+//
+// [simd.class]
+// simd() = default;
+
+#include <cstdint>
+#include <experimental/simd>
+
+using namespace std::experimental::parallelism_v2;
+
+int main() { (void)native_simd<int32_t>(); }
Index: libcxx/test/std/experimental/simd/simd.cons/geneartor.pass.cpp
===================================================================
--- libcxx/test/std/experimental/simd/simd.cons/geneartor.pass.cpp
+++ libcxx/test/std/experimental/simd/simd.cons/geneartor.pass.cpp
@@ -14,30 +14,72 @@
 // [simd.class]
 // template <class G> explicit simd(G&& gen);
 
-#include <cstdint>
 #include <experimental/simd>
+#include <cassert>
+#include <cstdint>
 
 using namespace std::experimental::parallelism_v2;
 
 template <class T, class... Args>
-auto not_supported_native_simd_ctor(Args&&... args)
-    -> decltype(native_simd<T>(std::forward<Args>(args)...), void()) = delete;
+auto not_supported_simd128_ctor(Args&&... args)
+    -> decltype(fixed_size_simd<T, 16 / sizeof(T)>(std::forward<Args>(args)...),
+                void()) = delete;
 
 template <class T>
-void not_supported_native_simd_ctor(...) {}
+void not_supported_simd128_ctor(...) {}
 
 template <class T, class... Args>
-auto supported_native_simd_ctor(Args&&... args)
-    -> decltype(fixed_size_simd<T, 16 / sizeof(T)>(std::forward<Args>(args)...),
+auto supported_simd128_ctor(Args&&... args)
+    -> decltype(fixed_size_simd<T, 16 / sizeof(T)>(std::forward<Args>(args)...),
+                void()) {}
 
 template <class T>
-void supported_native_simd_ctor(...) = delete;
+void supported_simd128_ctor(...) = delete;
+
+// Generator that returns its own index as an int.
+struct identity {
+  template <size_t value>
+  int operator()(std::integral_constant<size_t, value>) const {
+    return value;
+  }
+};
 
 void compile_generator() {
-  supported_native_simd_ctor<int>([](int i) { return i; });
-  not_supported_native_simd_ctor<int>([](int i) { return float(i); });
-  not_supported_native_simd_ctor<int>([](intptr_t i) { return (int*)(i); });
-  not_supported_native_simd_ctor<int>([](int* i) { return i; });
+  supported_simd128_ctor<int>(identity());
+  not_supported_simd128_ctor<int>([](int i) { return float(i); });
+  not_supported_simd128_ctor<int>([](intptr_t i) { return (int*)(i); });
+  not_supported_simd128_ctor<int>([](int* i) { return i; });
+}
+
+// Returns int32_t for indices <= 2 and int64_t for every larger index.
+struct limited_identity {
+  template <size_t value>
+  typename std::conditional<value <= 2, int32_t, int64_t>::type
+  operator()(std::integral_constant<size_t, value>) const {
+    return value;
+  }
+};
+
+void compile_limited_identity() {
+  supported_simd128_ctor<int64_t>(limited_identity());
+  not_supported_simd128_ctor<int32_t>(limited_identity());
+}
+
+void test_generator() {
+  {
+    fixed_size_simd<int32_t, 4> a([](int i) { return i; });
+    assert(a[0] == 0);
+    assert(a[1] == 1);
+    assert(a[2] == 2);
+    assert(a[3] == 3);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a([](int i) { return 2 * i - 1; });
+    assert(a[0] == -1);
+    assert(a[1] == 1);
+    assert(a[2] == 3);
+    assert(a[3] == 5);
+  }
 }
 
-int main() {}
+int main() { test_generator(); }
Index: libcxx/test/std/experimental/simd/simd.cons/load.pass.cpp
===================================================================
--- /dev/null
+++ libcxx/test/std/experimental/simd/simd.cons/load.pass.cpp
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03
+
+// <experimental/simd>
+//
+// [simd.class]
+// template <class U, class Flags> simd(const U* mem, Flags f);
+
+#include <cassert>
+#include <cstdint>
+#include <experimental/simd>
+
+#include "test_macros.h"
+
+using namespace std::experimental::parallelism_v2;
+
+template <class T, class... Args>
+auto not_supported_native_simd_ctor(Args&&... args)
+    -> decltype(native_simd<T>(std::forward<Args>(args)...), void()) = delete;
+
+template <class T>
+void not_supported_native_simd_ctor(...) {}
+
+template <class T, class... Args>
+auto supported_native_simd_ctor(Args&&... args)
+    -> decltype(native_simd<T>(std::forward<Args>(args)...), void()) {}
+
+template <class T>
+void supported_native_simd_ctor(...) = delete;
+
+// Compile-only: which (pointer, flags) arguments the load constructor accepts.
+void compile_load_ctor() {
+  supported_native_simd_ctor<int>((int*)nullptr, element_aligned_tag());
+  supported_native_simd_ctor<uint32_t>((int*)nullptr, element_aligned_tag());
+  supported_native_simd_ctor<double>((float*)nullptr, element_aligned_tag());
+  supported_native_simd_ctor<uint16_t>((unsigned int*)nullptr,
+                                       element_aligned_tag());
+  supported_native_simd_ctor<uint32_t>((float*)nullptr, element_aligned_tag());
+
+  not_supported_native_simd_ctor<int>((int*)nullptr, int());
+}
+
+// Loads four int32_t values with every alignment flag and checks each element.
+void test_load_ctor() {
+  alignas(32) int32_t buffer[] = {4, 3, 2, 1};
+  {
+    fixed_size_simd<int32_t, 4> a(buffer, element_aligned_tag());
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a(buffer, vector_aligned_tag());
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a(buffer, overaligned_tag<32>());
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+
+#if TEST_STD_VER > 14
+  // Same loads, spelled with the flag objects instead of the tag types.
+  {
+    fixed_size_simd<int32_t, 4> a(buffer, element_aligned);
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a(buffer, vector_aligned);
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a(buffer, overaligned<32>);
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+#endif
+}
+
+// Loading from a float buffer converts every element to int32_t.
+void test_converting_load_ctor() {
+  float buffer[] = {1., 2., 4., 8.};
+  fixed_size_simd<int32_t, 4> a(buffer, element_aligned_tag());
+  assert(a[0] == 1);
+  assert(a[1] == 2);
+  assert(a[2] == 4);
+  assert(a[3] == 8);
+}
+
+int main() {
+  test_load_ctor();
+  test_converting_load_ctor();
+}
Index: libcxx/test/std/experimental/simd/simd.mem/load.pass.cpp
===================================================================
--- /dev/null
+++ libcxx/test/std/experimental/simd/simd.mem/load.pass.cpp
@@ -0,0 +1,123 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03
+
+// <experimental/simd>
+//
+// loads [simd.load]
+// template <class U, class Flags> void copy_from(const U* mem, Flags f);
+
+#include <experimental/simd>
+#include <cassert>
+#include <cstdint>
+
+#include "test_macros.h"
+
+using namespace std::experimental::parallelism_v2;
+
+template <class T, class... Args>
+auto not_supported_load(Args&&... args) -> decltype(
+    std::declval<native_simd<T>>().copy_from(std::forward<Args>(args)...),
+    void()) = delete;
+
+template <class T>
+void not_supported_load(...) {}
+
+template <class T, class... Args>
+auto supported_load(Args&&... args) -> decltype(
+    std::declval<native_simd<T>>().copy_from(std::forward<Args>(args)...),
+    void()) {}
+
+template <class T>
+void supported_load(...) = delete;
+
+// Compile-only: which (pointer, flags) arguments copy_from accepts.
+void compile_load() {
+  supported_load<int>((int*)nullptr, element_aligned_tag());
+  supported_load<uint32_t>((int*)nullptr, element_aligned_tag());
+  supported_load<double>((float*)nullptr, element_aligned_tag());
+  supported_load<uint16_t>((unsigned int*)nullptr, element_aligned_tag());
+  supported_load<uint32_t>((float*)nullptr, element_aligned_tag());
+
+  not_supported_load<int>((int*)nullptr, int());
+}
+
+// Loads four int32_t values with every alignment flag and checks each element.
+void test_load() {
+  alignas(32) int32_t buffer[] = {4, 3, 2, 1};
+  {
+    fixed_size_simd<int32_t, 4> a;
+    a.copy_from(buffer, element_aligned_tag());
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a;
+    a.copy_from(buffer, vector_aligned_tag());
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a;
+    a.copy_from(buffer, overaligned_tag<32>());
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+
+#if TEST_STD_VER > 14
+  // Same loads, spelled with the flag objects instead of the tag types.
+  {
+    fixed_size_simd<int32_t, 4> a;
+    a.copy_from(buffer, element_aligned);
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a;
+    a.copy_from(buffer, vector_aligned);
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+  {
+    fixed_size_simd<int32_t, 4> a;
+    a.copy_from(buffer, overaligned<32>);
+    assert(a[0] == 4);
+    assert(a[1] == 3);
+    assert(a[2] == 2);
+    assert(a[3] == 1);
+  }
+#endif
+}
+
+// Loading from a float buffer converts every element to int32_t.
+void test_converting_load() {
+  float buffer[] = {1., 2., 4., 8.};
+  fixed_size_simd<int32_t, 4> a;
+  a.copy_from(buffer, element_aligned_tag());
+  assert(a[0] == 1);
+  assert(a[1] == 2);
+  assert(a[2] == 4);
+  assert(a[3] == 8);
+}
+
+int main() {
+  test_load();
+  test_converting_load();
+}
Index: libcxx/test/std/experimental/simd/simd.mem/store.pass.cpp
===================================================================
--- /dev/null
+++ libcxx/test/std/experimental/simd/simd.mem/store.pass.cpp
@@ -0,0 +1,96 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03
+
+// <experimental/simd>
+//
+// stores [simd.store]
+// template <class U, class Flags> void copy_to(U* mem, Flags f) const;
+
+#include <experimental/simd>
+#include <cassert>
+#include <cstdint>
+
+#include "test_macros.h"
+
+using namespace std::experimental::parallelism_v2;
+
+// Stores the elements {4, 3, 2, 1} with every alignment flag and checks the buffer.
+void test_store() {
+  fixed_size_simd<int32_t, 4> a([](int i) { return 4 - i; });
+  {
+    alignas(32) int32_t buffer[4] = {0};
+    a.copy_to(buffer, element_aligned_tag());
+    assert(buffer[0] == 4);
+    assert(buffer[1] == 3);
+    assert(buffer[2] == 2);
+    assert(buffer[3] == 1);
+  }
+  {
+    alignas(32) int32_t buffer[4] = {0};
+    a.copy_to(buffer, vector_aligned_tag());
+    assert(buffer[0] == 4);
+    assert(buffer[1] == 3);
+    assert(buffer[2] == 2);
+    assert(buffer[3] == 1);
+  }
+  {
+    alignas(32) int32_t buffer[4] = {0};
+    a.copy_to(buffer, overaligned_tag<32>());
+    assert(buffer[0] == 4);
+    assert(buffer[1] == 3);
+    assert(buffer[2] == 2);
+    assert(buffer[3] == 1);
+  }
+
+#if TEST_STD_VER > 14
+  // Same stores, spelled with the flag objects instead of the tag types.
+  {
+    alignas(32) int32_t buffer[4] = {0};
+    a.copy_to(buffer, element_aligned);
+    assert(buffer[0] == 4);
+    assert(buffer[1] == 3);
+    assert(buffer[2] == 2);
+    assert(buffer[3] == 1);
+  }
+  {
+    alignas(32) int32_t buffer[4] = {0};
+    a.copy_to(buffer, vector_aligned);
+    assert(buffer[0] == 4);
+    assert(buffer[1] == 3);
+    assert(buffer[2] == 2);
+    assert(buffer[3] == 1);
+  }
+  {
+    alignas(32) int32_t buffer[4] = {0};
+    a.copy_to(buffer, overaligned<32>);
+    assert(buffer[0] == 4);
+    assert(buffer[1] == 3);
+    assert(buffer[2] == 2);
+    assert(buffer[3] == 1);
+  }
+#endif
+}
+
+// Storing into a float buffer converts every int32_t element to float.
+void test_converting_store() {
+  float buffer[4] = {0.};
+  fixed_size_simd<int32_t, 4> a([](int i) { return 1 << i; });
+  a.copy_to(buffer, element_aligned_tag());
+  assert(buffer[0] == 1.);
+  assert(buffer[1] == 2.);
+  assert(buffer[2] == 4.);
+  assert(buffer[3] == 8.);
+}
+
+int main() {
+  test_store();
+  test_converting_store();
+}